diff --git a/.circleci/cimodel/data/binary_build_data.py b/.circleci/cimodel/data/binary_build_data.py index 58fbbd08f994..21b6eebef5a1 100644 --- a/.circleci/cimodel/data/binary_build_data.py +++ b/.circleci/cimodel/data/binary_build_data.py @@ -54,7 +54,7 @@ def get_processor_arch_name(gpu_version): )), # Skip CUDA-9.2 builds on Windows windows=( - [v for v in dimensions.GPU_VERSIONS if v not in ['cuda92', "rocm3.7"]], + [v for v in dimensions.GPU_VERSIONS if v not in ['cuda92'] + dimensions.ROCM_VERSION_LABELS], OrderedDict( wheel=dimensions.STANDARD_PYTHON_VERSIONS, conda=dimensions.STANDARD_PYTHON_VERSIONS, @@ -142,11 +142,11 @@ def get_children(self): # XXX disabling conda rocm build since docker images are not there if self.find_prop("package_format") == 'conda': - gpu_versions = filter(lambda x: x != "rocm3.7", gpu_versions) + gpu_versions = filter(lambda x: x not in dimensions.ROCM_VERSION_LABELS, gpu_versions) # XXX libtorch rocm build is temporarily disabled if self.find_prop("package_format") == 'libtorch': - gpu_versions = filter(lambda x: x != "rocm3.7", gpu_versions) + gpu_versions = filter(lambda x: x not in dimensions.ROCM_VERSION_LABELS, gpu_versions) return [ArchConfigNode(self, v) for v in gpu_versions] diff --git a/.circleci/cimodel/data/dimensions.py b/.circleci/cimodel/data/dimensions.py index 93d4d645a53a..1f83cd61b13c 100644 --- a/.circleci/cimodel/data/dimensions.py +++ b/.circleci/cimodel/data/dimensions.py @@ -9,9 +9,12 @@ ROCM_VERSIONS = [ "3.7", + "3.8", ] -GPU_VERSIONS = [None] + ["cuda" + v for v in CUDA_VERSIONS] + ["rocm" + v for v in ROCM_VERSIONS] +ROCM_VERSION_LABELS = ["rocm" + v for v in ROCM_VERSIONS] + +GPU_VERSIONS = [None] + ["cuda" + v for v in CUDA_VERSIONS] + ROCM_VERSION_LABELS STANDARD_PYTHON_VERSIONS = [ "3.6", diff --git a/.circleci/cimodel/data/pytorch_build_definitions.py b/.circleci/cimodel/data/pytorch_build_definitions.py index d582348b00c8..ccd97a053516 100644 --- a/.circleci/cimodel/data/pytorch_build_definitions.py +++ b/.circleci/cimodel/data/pytorch_build_definitions.py @@ -6,7 +6,7 @@ import cimodel.lib.conf_tree as conf_tree import cimodel.lib.miniutils as miniutils from cimodel.data.pytorch_build_data import CONFIG_TREE_DATA, TopLevelNode -from cimodel.data.simple.util.branch_filters import gen_filter_dict +from cimodel.data.simple.util.branch_filters import gen_filter_dict, RC_PATTERN from cimodel.data.simple.util.docker_constants import gen_docker_image @@ -110,6 +110,8 @@ def gen_workflow_params(self, phase): parameters["resource_class"] = resource_class if phase == "build" and self.rocm_version is not None: parameters["resource_class"] = "xlarge" + if hasattr(self, 'filters'): + parameters['filters'] = self.filters return parameters def gen_workflow_job(self, phase): @@ -139,14 +141,16 @@ def gen_workflow_job(self, phase): # TODO This is a hack to special case some configs just for the workflow list class HiddenConf(object): - def __init__(self, name, parent_build=None): + def __init__(self, name, parent_build=None, filters=None): self.name = name self.parent_build = parent_build + self.filters = filters def gen_workflow_job(self, phase): return { self.gen_build_name(phase): { - "requires": [self.parent_build.gen_build_name("build")] + "requires": [self.parent_build.gen_build_name("build")], + "filters": self.filters, } } @@ -166,7 +170,8 @@ def gen_workflow_job(self, phase): "branch": self.branch, "requires": [self.parent_build], "context": "org-member", - "filters": gen_filter_dict(branches_list=["nightly"]) + "filters": 
gen_filter_dict(branches_list=["nightly"], + tags_list=RC_PATTERN) } } @@ -205,7 +210,9 @@ def gen_docs_configs(xenial_parent_config): configs.append( HiddenConf( "pytorch_python_doc_build", - parent_build=xenial_parent_config + parent_build=xenial_parent_config, + filters=gen_filter_dict(branches_list=r"/.*/", + tags_list=RC_PATTERN), ) ) configs.append( @@ -219,7 +226,9 @@ def gen_docs_configs(xenial_parent_config): configs.append( HiddenConf( "pytorch_cpp_doc_build", - parent_build=xenial_parent_config + parent_build=xenial_parent_config, + filters=gen_filter_dict(branches_list=r"/.*/", + tags_list=RC_PATTERN), ) ) configs.append( @@ -348,6 +357,8 @@ def instantiate_configs(): # run docs builds on "pytorch-linux-xenial-py3.6-gcc5.4". Docs builds # should run on a CPU-only build that runs on all PRs. + # XXX should this be updated to a more modern build? Projects are + # beginning to drop python3.6 if ( distro_name == "xenial" and fc.find_prop("pyver") == "3.6" @@ -358,6 +369,8 @@ def instantiate_configs(): and compiler_name == "gcc" and fc.find_prop("compiler_version") == "5.4" ): + c.filters = gen_filter_dict(branches_list=r"/.*/", + tags_list=RC_PATTERN) c.dependent_tests = gen_docs_configs(c) if cuda_version == "10.2" and python_version == "3.6" and not is_libtorch: diff --git a/.circleci/cimodel/data/simple/docker_definitions.py b/.circleci/cimodel/data/simple/docker_definitions.py index 59944d190383..2b3add33b9a8 100644 --- a/.circleci/cimodel/data/simple/docker_definitions.py +++ b/.circleci/cimodel/data/simple/docker_definitions.py @@ -1,6 +1,7 @@ from collections import OrderedDict from cimodel.lib.miniutils import quote +from cimodel.data.simple.util.branch_filters import gen_filter_dict, RC_PATTERN # TODO: make this generated from a matrix rather than just a static list @@ -24,25 +25,30 @@ "pytorch-linux-xenial-py3.8", "pytorch-linux-xenial-py3.6-clang7", "pytorch-linux-xenial-py3.6-gcc4.8", - "pytorch-linux-xenial-py3.6-gcc5.4", + "pytorch-linux-xenial-py3.6-gcc5.4", # this one is used in doc builds "pytorch-linux-xenial-py3.6-gcc7.2", "pytorch-linux-xenial-py3.6-gcc7", "pytorch-linux-bionic-rocm3.7-py3.6", + "pytorch-linux-bionic-rocm3.8-py3.6", ] def get_workflow_jobs(): """Generates a list of docker image build definitions""" - return [ - OrderedDict( + ret = [] + for image_name in IMAGE_NAMES: + parameters = OrderedDict({ + "name": quote(f"docker-{image_name}"), + "image_name": quote(image_name), + }) + if image_name == "pytorch-linux-xenial-py3.6-gcc5.4": + # pushing documentation on tags requires CircleCI to also + # build all the dependencies on tags, including this docker image + parameters['filters'] = gen_filter_dict(branches_list=r"/.*/", + tags_list=RC_PATTERN) + ret.append(OrderedDict( { - "docker_build_job": OrderedDict( - { - "name": quote(f"docker-{image_name}"), - "image_name": quote(image_name), - } - ) + "docker_build_job": parameters } - ) - for image_name in IMAGE_NAMES - ] + )) + return ret diff --git a/.circleci/cimodel/data/simple/ge_config_tests.py b/.circleci/cimodel/data/simple/ge_config_tests.py index 2f2dbf0027dc..235c08d62786 100644 --- a/.circleci/cimodel/data/simple/ge_config_tests.py +++ b/.circleci/cimodel/data/simple/ge_config_tests.py @@ -61,41 +61,25 @@ def gen_tree(self): MultiPartVersion([3, 6], "py"), MultiPartVersion([5, 4], "gcc"), None, - ["ge_config_legacy", "test"], + ["jit_legacy", "test"], ["pytorch_linux_xenial_py3_6_gcc5_4_build"]), GeConfigTestJob( MultiPartVersion([3, 6], "py"), MultiPartVersion([5, 4], "gcc"), None, - 
["ge_config_profiling", "test"], - ["pytorch_linux_xenial_py3_6_gcc5_4_build"]), - GeConfigTestJob( - MultiPartVersion([3, 6], "py"), - MultiPartVersion([5, 4], "gcc"), - None, - ["ge_config_simple", "test"], + ["jit_simple", "test"], ["pytorch_linux_xenial_py3_6_gcc5_4_build"], ), GeConfigTestJob( None, None, CudaVersion(10, 2), - ["cudnn7", "py3", "ge_config_legacy", "test"], - ["pytorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_build"], - use_cuda_docker=True, - # TODO Why does the build environment specify cuda10.1, while the - # job name is cuda10_2? - build_env_override="pytorch-linux-xenial-cuda10.1-cudnn7-ge_config_legacy-test"), - GeConfigTestJob( - None, - None, - CudaVersion(10, 2), - ["cudnn7", "py3", "ge_config_profiling", "test"], + ["cudnn7", "py3", "jit_legacy", "test"], ["pytorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_build"], use_cuda_docker=True, # TODO Why does the build environment specify cuda10.1, while the # job name is cuda10_2? - build_env_override="pytorch-linux-xenial-cuda10.1-cudnn7-ge_config_profiling-test"), + build_env_override="pytorch-linux-xenial-cuda10.1-cudnn7-jit_legacy-test"), ] diff --git a/.circleci/cimodel/data/simple/ios_definitions.py b/.circleci/cimodel/data/simple/ios_definitions.py index 4446fa24fc28..3473242bdf04 100644 --- a/.circleci/cimodel/data/simple/ios_definitions.py +++ b/.circleci/cimodel/data/simple/ios_definitions.py @@ -1,7 +1,7 @@ from cimodel.data.simple.util.versions import MultiPartVersion -IOS_VERSION = MultiPartVersion([11, 2, 1]) +IOS_VERSION = MultiPartVersion([12, 0, 0]) class ArchVariant: @@ -62,8 +62,8 @@ def gen_tree(self): WORKFLOW_DATA = [ IOSJob(IOS_VERSION, ArchVariant("x86_64"), is_org_member_context=False), - # IOSJob(IOS_VERSION, ArchVariant("arm64")), - # IOSJob(IOS_VERSION, ArchVariant("arm64", True), extra_props={"op_list": "mobilenetv2.yaml"}), + IOSJob(IOS_VERSION, ArchVariant("arm64")), + IOSJob(IOS_VERSION, ArchVariant("arm64", True), extra_props={"op_list": "mobilenetv2.yaml"}), ] diff --git a/.circleci/config.yml b/.circleci/config.yml index b32bb9b5086a..208e0d09eed0 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -640,6 +640,7 @@ jobs: export CIRCLE_SHA1="$CIRCLE_SHA1" export CIRCLE_PR_NUMBER="${CIRCLE_PR_NUMBER:-}" export CIRCLE_BRANCH="$CIRCLE_BRANCH" + export CIRCLE_JOB="$CIRCLE_JOB" cd workspace python test/print_test_stats.py test EOL @@ -924,7 +925,7 @@ jobs: smoke_mac_test: <<: *binary_linux_test_upload_params macos: - xcode: "9.4.1" + xcode: "12.0" steps: - checkout - run: @@ -949,7 +950,7 @@ jobs: binary_mac_build: <<: *binary_mac_params macos: - xcode: "9.4.1" + xcode: "12.0" steps: # See Note [Workspace for CircleCI scripts] in job-specs-setup.yml - checkout @@ -990,7 +991,7 @@ jobs: binary_ios_build: <<: *pytorch_ios_params macos: - xcode: "11.2.1" + xcode: "12.0" steps: - attach_workspace: at: ~/workspace @@ -1017,7 +1018,7 @@ jobs: binary_ios_upload: <<: *pytorch_ios_params macos: - xcode: "11.2.1" + xcode: "12.0" steps: - attach_workspace: at: ~/workspace @@ -1187,10 +1188,13 @@ jobs: set -ex export COMMIT_DOCKER_IMAGE=${DOCKER_IMAGE}:${DOCKER_TAG}-${CIRCLE_SHA1} echo "DOCKER_IMAGE: "${COMMIT_DOCKER_IMAGE} + tag=${CIRCLE_TAG:1:5} + target=${tag:-master} + echo "building for ${target}" time docker pull ${COMMIT_DOCKER_IMAGE} >/dev/null export id=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE}) - export COMMAND='((echo "sudo chown -R jenkins workspace && cd workspace && . 
./.circleci/scripts/python_doc_push_script.sh docs/master master site") | docker exec -u jenkins -i "$id" bash) 2>&1' + export COMMAND='((echo "sudo chown -R jenkins workspace && cd workspace && . ./.circleci/scripts/python_doc_push_script.sh docs/'$target' master site") | docker exec -u jenkins -i "$id" bash) 2>&1' echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts @@ -1229,10 +1233,13 @@ jobs: set -ex export COMMIT_DOCKER_IMAGE=${DOCKER_IMAGE}:${DOCKER_TAG}-${CIRCLE_SHA1} echo "DOCKER_IMAGE: "${COMMIT_DOCKER_IMAGE} + tag=${CIRCLE_TAG:1:5} + target=${tag:-master} + echo "building for ${target}" time docker pull ${COMMIT_DOCKER_IMAGE} >/dev/null export id=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE}) - export COMMAND='((echo "sudo chown -R jenkins workspace && cd workspace && . ./.circleci/scripts/cpp_doc_push_script.sh docs/master master") | docker exec -u jenkins -i "$id" bash) 2>&1' + export COMMAND='((echo "sudo chown -R jenkins workspace && cd workspace && . ./.circleci/scripts/cpp_doc_push_script.sh docs/"$target" master") | docker exec -u jenkins -i "$id" bash) 2>&1' echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts @@ -1253,7 +1260,7 @@ jobs: environment: BUILD_ENVIRONMENT: pytorch-macos-10.13-py3-build macos: - xcode: "9.4.1" + xcode: "12.0" steps: - checkout - run_brew_for_macos_build @@ -1287,7 +1294,7 @@ jobs: environment: BUILD_ENVIRONMENT: pytorch-macos-10.13-py3-test macos: - xcode: "9.4.1" + xcode: "12.0" steps: - checkout - attach_workspace: @@ -1515,7 +1522,7 @@ jobs: pytorch_ios_build: <<: *pytorch_ios_params macos: - xcode: "11.2.1" + xcode: "12.0" steps: - checkout - run_brew_for_ios_build @@ -1534,7 +1541,7 @@ jobs: rm cert.txt bundle exec fastlane install_cert # install the provisioning profile - PROFILE=TestApp_CI.mobileprovision + PROFILE=PyTorch_CI_2021.mobileprovision PROVISIONING_PROFILES=~/Library/MobileDevice/Provisioning\ Profiles mkdir -pv "${PROVISIONING_PROFILES}" cd "${PROVISIONING_PROFILES}" @@ -1592,7 +1599,7 @@ jobs: command: | set -e PROJ_ROOT=/Users/distiller/project - PROFILE=TestApp_CI + PROFILE=PyTorch_CI_2021 # run the ruby build script if ! [ -x "$(command -v xcodebuild)" ]; then echo 'Error: xcodebuild is not installed.' 
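The doc-push hunks just above stop hard-coding `docs/master` and instead derive the output directory from `CIRCLE_TAG`. A small Python illustration of the two bash parameter expansions involved, `tag=${CIRCLE_TAG:1:5}` and `target=${tag:-master}` (the jobs themselves stay in bash; this is only a sketch of the logic):

```python
def docs_target(circle_tag=""):
    """Mirrors `tag=${CIRCLE_TAG:1:5}; target=${tag:-master}` from the doc-push jobs."""
    tag = circle_tag[1:6]            # skip the leading "v", keep at most five chars: "1.8.0"
    return tag if tag else "master"  # untagged builds keep publishing to docs/master

assert docs_target("v1.8.0-rc1") == "1.8.0"
assert docs_target("") == "master"
```

So a release-candidate tag such as `v1.8.0-rc1` publishes into `docs/1.8.0`, while ordinary branch builds continue to land in `docs/master`.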
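The doc-build configs and the `gcc5.4` docker/build jobs earlier in this diff also gain `filters=gen_filter_dict(branches_list=r"/.*/", tags_list=RC_PATTERN)`. The helper itself is not shown here; a hypothetical minimal stand-in that reproduces the `filters:` blocks visible in the regenerated `config.yml` further down might look like this (the `RC_PATTERN` value is inferred from that YAML, not from the helper's source):

```python
# Assumed value: it matches the `tags: only:` regex emitted into config.yml in this diff.
RC_PATTERN = r"/v[0-9]+(\.[0-9]+)*-rc[0-9]+/"

def gen_filter_dict(branches_list=None, tags_list=None):
    """Hypothetical stand-in for cimodel.data.simple.util.branch_filters.gen_filter_dict."""
    filters = {}
    if branches_list is not None:
        filters["branches"] = {"only": branches_list}
    if tags_list is not None:
        filters["tags"] = {"only": tags_list}
    return filters

# Run doc builds (and their docker-image prerequisite) on every branch *and* on RC tags:
gen_filter_dict(branches_list=r"/.*/", tags_list=RC_PATTERN)
# -> {'branches': {'only': '/.*/'}, 'tags': {'only': '/v[0-9]+(\\.[0-9]+)*-rc[0-9]+/'}}
```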
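Further back, the `dimensions.py` hunk introduces `ROCM_VERSION_LABELS` so that adding a new ROCm version ("3.8" here) automatically propagates to the Windows exclusion list and to the conda/libtorch filters in `binary_build_data.py`, which in turn generates the `rocm3.8` manywheel jobs that follow below. A compact sketch of the resulting values (the `CUDA_VERSIONS` entries are placeholders for illustration, not taken from this diff):

```python
ROCM_VERSIONS = ["3.7", "3.8"]
ROCM_VERSION_LABELS = ["rocm" + v for v in ROCM_VERSIONS]   # ['rocm3.7', 'rocm3.8']

CUDA_VERSIONS = ["92", "101", "102"]                        # placeholder values
GPU_VERSIONS = [None] + ["cuda" + v for v in CUDA_VERSIONS] + ROCM_VERSION_LABELS

# Windows wheels skip CUDA 9.2 and every ROCm flavour in one expression:
windows_gpu_versions = [v for v in GPU_VERSIONS if v not in ["cuda92"] + ROCM_VERSION_LABELS]

# conda and libtorch packages drop all ROCm builds the same way:
conda_gpu_versions = [v for v in GPU_VERSIONS if v not in ROCM_VERSION_LABELS]
```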
@@ -2130,6 +2137,39 @@ workflows: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ docker_image: "pytorch/manylinux-rocm:3.7" + - binary_linux_build: + name: binary_linux_manywheel_3_6m_rocm3_8_devtoolset7_nightly_build + build_environment: "manywheel 3.6m rocm3.8 devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + docker_image: "pytorch/manylinux-rocm:3.8" + - binary_linux_build: + name: binary_linux_manywheel_3_7m_rocm3_8_devtoolset7_nightly_build + build_environment: "manywheel 3.7m rocm3.8 devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + docker_image: "pytorch/manylinux-rocm:3.8" + - binary_linux_build: + name: binary_linux_manywheel_3_8m_rocm3_8_devtoolset7_nightly_build + build_environment: "manywheel 3.8m rocm3.8 devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + docker_image: "pytorch/manylinux-rocm:3.8" - binary_linux_build: name: binary_linux_conda_3_6_cpu_devtoolset7_nightly_build build_environment: "conda 3.6 cpu devtoolset7" @@ -3429,6 +3469,51 @@ workflows: docker_image: "pytorch/manylinux-rocm:3.7" use_cuda_docker_runtime: "1" resource_class: gpu.medium + - binary_linux_test: + name: binary_linux_manywheel_3_6m_rocm3_8_devtoolset7_nightly_test + build_environment: "manywheel 3.6m rocm3.8 devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + requires: + - binary_linux_manywheel_3_6m_rocm3_8_devtoolset7_nightly_build + docker_image: "pytorch/manylinux-rocm:3.8" + use_cuda_docker_runtime: "1" + resource_class: gpu.medium + - binary_linux_test: + name: binary_linux_manywheel_3_7m_rocm3_8_devtoolset7_nightly_test + build_environment: "manywheel 3.7m rocm3.8 devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + requires: + - binary_linux_manywheel_3_7m_rocm3_8_devtoolset7_nightly_build + docker_image: "pytorch/manylinux-rocm:3.8" + use_cuda_docker_runtime: "1" + resource_class: gpu.medium + - binary_linux_test: + name: binary_linux_manywheel_3_8m_rocm3_8_devtoolset7_nightly_test + build_environment: "manywheel 3.8m rocm3.8 devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + requires: + - binary_linux_manywheel_3_8m_rocm3_8_devtoolset7_nightly_build + docker_image: "pytorch/manylinux-rocm:3.8" + use_cuda_docker_runtime: "1" + resource_class: gpu.medium - binary_linux_test: name: binary_linux_conda_3_6_cpu_devtoolset7_nightly_test build_environment: "conda 3.6 cpu devtoolset7" @@ -4932,6 +5017,48 @@ workflows: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ package_type: manywheel upload_subfolder: rocm3.7 + - binary_upload: + name: binary_linux_manywheel_3_6m_rocm3_8_devtoolset7_nightly_upload + context: org-member + requires: + - binary_linux_manywheel_3_6m_rocm3_8_devtoolset7_nightly_test + filters: + branches: + only: + - nightly + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + package_type: manywheel + upload_subfolder: rocm3.8 + - binary_upload: + name: binary_linux_manywheel_3_7m_rocm3_8_devtoolset7_nightly_upload + context: org-member + requires: + - binary_linux_manywheel_3_7m_rocm3_8_devtoolset7_nightly_test + filters: + branches: + only: + - nightly + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + package_type: manywheel + upload_subfolder: rocm3.8 + - binary_upload: + name: binary_linux_manywheel_3_8m_rocm3_8_devtoolset7_nightly_upload + context: org-member + requires: + - 
binary_linux_manywheel_3_8m_rocm3_8_devtoolset7_nightly_test + filters: + branches: + only: + - nightly + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + package_type: manywheel + upload_subfolder: rocm3.8 - binary_upload: name: binary_linux_conda_3_6_cpu_devtoolset7_nightly_upload context: org-member @@ -6311,6 +6438,11 @@ workflows: - docker_build_job: name: "docker-pytorch-linux-xenial-py3.6-gcc5.4" image_name: "pytorch-linux-xenial-py3.6-gcc5.4" + filters: + branches: + only: /.*/ + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - docker_build_job: name: "docker-pytorch-linux-xenial-py3.6-gcc7.2" image_name: "pytorch-linux-xenial-py3.6-gcc7.2" @@ -6320,12 +6452,20 @@ workflows: - docker_build_job: name: "docker-pytorch-linux-bionic-rocm3.7-py3.6" image_name: "pytorch-linux-bionic-rocm3.7-py3.6" + - docker_build_job: + name: "docker-pytorch-linux-bionic-rocm3.8-py3.6" + image_name: "pytorch-linux-bionic-rocm3.8-py3.6" - pytorch_linux_build: name: pytorch_linux_xenial_py3_6_gcc5_4_build requires: - "docker-pytorch-linux-xenial-py3.6-gcc5.4" build_environment: "pytorch-linux-xenial-py3.6-gcc5.4-build" docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4" + filters: + branches: + only: /.*/ + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - pytorch_linux_test: name: pytorch_linux_xenial_py3_6_gcc5_4_test requires: @@ -6333,7 +6473,17 @@ workflows: build_environment: "pytorch-linux-xenial-py3.6-gcc5.4-test" docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4" resource_class: large + filters: + branches: + only: /.*/ + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - pytorch_python_doc_build: + filters: + branches: + only: /.*/ + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ requires: - pytorch_linux_xenial_py3_6_gcc5_4_build - pytorch_doc_push: @@ -6343,10 +6493,17 @@ workflows: branches: only: - nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ name: pytorch_python_doc_push requires: - pytorch_python_doc_build - pytorch_cpp_doc_build: + filters: + branches: + only: /.*/ + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ requires: - pytorch_linux_xenial_py3_6_gcc5_4_build - pytorch_doc_push: @@ -6356,6 +6513,8 @@ workflows: branches: only: - nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ name: pytorch_cpp_doc_push requires: - pytorch_cpp_doc_build @@ -6819,10 +6978,23 @@ workflows: - pytorch_linux_xenial_py3_clang5_android_ndk_r19c_arm_v7a_build - pytorch_linux_xenial_py3_clang5_android_ndk_r19c_arm_v8a_build - pytorch_ios_build: - build_environment: pytorch-ios-11.2.1-x86_64_build + build_environment: pytorch-ios-12.0.0-x86_64_build ios_arch: x86_64 ios_platform: SIMULATOR - name: pytorch_ios_11_2_1_x86_64_build + name: pytorch_ios_12_0_0_x86_64_build + - pytorch_ios_build: + build_environment: pytorch-ios-12.0.0-arm64_build + context: org-member + ios_arch: arm64 + ios_platform: OS + name: pytorch_ios_12_0_0_arm64_build + - pytorch_ios_build: + build_environment: pytorch-ios-12.0.0-arm64_custom_build + context: org-member + ios_arch: arm64 + ios_platform: OS + name: pytorch_ios_12_0_0_arm64_custom_build + op_list: mobilenetv2.yaml - pytorch_linux_build: build_environment: pytorch-linux-xenial-py3-clang5-mobile-build build_only: "1" @@ -6851,38 +7023,23 @@ workflows: requires: - docker-pytorch-linux-xenial-py3-clang5-android-ndk-r19c - pytorch_linux_test: - build_environment: pytorch-linux-xenial-py3.6-gcc5.4-ge_config_legacy-test + build_environment: 
pytorch-linux-xenial-py3.6-gcc5.4-jit_legacy-test docker_image: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4 - name: pytorch_linux_xenial_py3_6_gcc5_4_ge_config_legacy_test + name: pytorch_linux_xenial_py3_6_gcc5_4_jit_legacy_test requires: - pytorch_linux_xenial_py3_6_gcc5_4_build resource_class: large - pytorch_linux_test: - build_environment: pytorch-linux-xenial-py3.6-gcc5.4-ge_config_profiling-test + build_environment: pytorch-linux-xenial-py3.6-gcc5.4-jit_simple-test docker_image: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4 - name: pytorch_linux_xenial_py3_6_gcc5_4_ge_config_profiling_test + name: pytorch_linux_xenial_py3_6_gcc5_4_jit_simple_test requires: - pytorch_linux_xenial_py3_6_gcc5_4_build resource_class: large - pytorch_linux_test: - build_environment: pytorch-linux-xenial-py3.6-gcc5.4-ge_config_simple-test - docker_image: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4 - name: pytorch_linux_xenial_py3_6_gcc5_4_ge_config_simple_test - requires: - - pytorch_linux_xenial_py3_6_gcc5_4_build - resource_class: large - - pytorch_linux_test: - build_environment: pytorch-linux-xenial-cuda10.1-cudnn7-ge_config_legacy-test + build_environment: pytorch-linux-xenial-cuda10.1-cudnn7-jit_legacy-test docker_image: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7 - name: pytorch_linux_xenial_cuda10_2_cudnn7_py3_ge_config_legacy_test - requires: - - pytorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_build - resource_class: gpu.medium - use_cuda_docker_runtime: "1" - - pytorch_linux_test: - build_environment: pytorch-linux-xenial-cuda10.1-cudnn7-ge_config_profiling-test - docker_image: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7 - name: pytorch_linux_xenial_cuda10_2_cudnn7_py3_ge_config_profiling_test + name: pytorch_linux_xenial_cuda10_2_cudnn7_py3_jit_legacy_test requires: - pytorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_build resource_class: gpu.medium @@ -7004,32 +7161,32 @@ workflows: requires: - binary_linux_libtorch_3_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_build - binary_ios_build: - build_environment: libtorch-ios-11.2.1-nightly-x86_64-build + build_environment: libtorch-ios-12.0.0-nightly-x86_64-build context: org-member filters: branches: only: nightly ios_arch: x86_64 ios_platform: SIMULATOR - name: pytorch_ios_11_2_1_nightly_x86_64_build + name: pytorch_ios_12_0_0_nightly_x86_64_build - binary_ios_build: - build_environment: libtorch-ios-11.2.1-nightly-arm64-build + build_environment: libtorch-ios-12.0.0-nightly-arm64-build context: org-member filters: branches: only: nightly ios_arch: arm64 ios_platform: OS - name: pytorch_ios_11_2_1_nightly_arm64_build + name: pytorch_ios_12_0_0_nightly_arm64_build - binary_ios_upload: - build_environment: libtorch-ios-11.2.1-nightly-binary-build-upload + build_environment: libtorch-ios-12.0.0-nightly-binary-build-upload context: org-member filters: branches: only: nightly requires: - - pytorch_ios_11_2_1_nightly_x86_64_build - - pytorch_ios_11_2_1_nightly_arm64_build + - pytorch_ios_12_0_0_nightly_x86_64_build + - pytorch_ios_12_0_0_nightly_arm64_build - pytorch_linux_build: build_environment: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-x86_32 docker_image: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c @@ -7464,6 +7621,42 @@ workflows: docker_image: 
"pytorch/manylinux-rocm:3.7" use_cuda_docker_runtime: "1" resource_class: gpu.medium + - smoke_linux_test: + name: smoke_linux_manywheel_3_6m_rocm3_8_devtoolset7_nightly + build_environment: "manywheel 3.6m rocm3.8 devtoolset7" + requires: + - update_s3_htmls + filters: + branches: + only: + - postnightly + docker_image: "pytorch/manylinux-rocm:3.8" + use_cuda_docker_runtime: "1" + resource_class: gpu.medium + - smoke_linux_test: + name: smoke_linux_manywheel_3_7m_rocm3_8_devtoolset7_nightly + build_environment: "manywheel 3.7m rocm3.8 devtoolset7" + requires: + - update_s3_htmls + filters: + branches: + only: + - postnightly + docker_image: "pytorch/manylinux-rocm:3.8" + use_cuda_docker_runtime: "1" + resource_class: gpu.medium + - smoke_linux_test: + name: smoke_linux_manywheel_3_8m_rocm3_8_devtoolset7_nightly + build_environment: "manywheel 3.8m rocm3.8 devtoolset7" + requires: + - update_s3_htmls + filters: + branches: + only: + - postnightly + docker_image: "pytorch/manylinux-rocm:3.8" + use_cuda_docker_runtime: "1" + resource_class: gpu.medium - smoke_linux_test: name: smoke_linux_conda_3_6_cpu_devtoolset7_nightly build_environment: "conda 3.6 cpu devtoolset7" diff --git a/.circleci/docker/build.sh b/.circleci/docker/build.sh index 9bfa0b195499..0afc1b33c59e 100755 --- a/.circleci/docker/build.sh +++ b/.circleci/docker/build.sh @@ -262,6 +262,13 @@ case "$image" in VISION=yes ROCM_VERSION=3.7 ;; + pytorch-linux-bionic-rocm3.8-py3.6) + ANACONDA_PYTHON_VERSION=3.6 + PROTOBUF=yes + DB=yes + VISION=yes + ROCM_VERSION=3.8 + ;; *) # Catch-all for builds that are not hardcoded. PROTOBUF=yes diff --git a/.circleci/docker/common/install_base.sh b/.circleci/docker/common/install_base.sh index ac4e1f18f1ef..1fc49932fee5 100755 --- a/.circleci/docker/common/install_base.sh +++ b/.circleci/docker/common/install_base.sh @@ -118,7 +118,7 @@ esac # Install Valgrind separately since the apt-get version is too old. mkdir valgrind_build && cd valgrind_build -VALGRIND_VERSION=3.15.0 +VALGRIND_VERSION=3.16.1 if ! 
wget http://valgrind.org/downloads/valgrind-${VALGRIND_VERSION}.tar.bz2 then wget https://sourceware.org/ftp/valgrind/valgrind-${VALGRIND_VERSION}.tar.bz2 @@ -131,4 +131,3 @@ sudo make install cd ../../ rm -rf valgrind_build alias valgrind="/usr/local/bin/valgrind" - diff --git a/.circleci/docker/common/install_cache.sh b/.circleci/docker/common/install_cache.sh index f1066519cd70..17931375b6f0 100644 --- a/.circleci/docker/common/install_cache.sh +++ b/.circleci/docker/common/install_cache.sh @@ -16,7 +16,7 @@ fi chmod a+x /opt/cache/bin/sccache function write_sccache_stub() { - printf "#!/bin/sh\nexec sccache $(which $1) \$*" > "/opt/cache/bin/$1" + printf "#!/bin/sh\nexec sccache $(which $1) \"\$@\"" > "/opt/cache/bin/$1" chmod a+x "/opt/cache/bin/$1" } @@ -57,8 +57,8 @@ if [ -n "$ROCM_VERSION" ]; then TOPDIR=$(dirname $OLDCOMP) WRAPPED="$TOPDIR/original/$COMPNAME" mv "$OLDCOMP" "$WRAPPED" - printf "#!/bin/sh\nexec sccache $WRAPPED \$*" > "$OLDCOMP" - chmod a+x "$1" + printf "#!/bin/sh\nexec sccache $WRAPPED \"\$@\"" > "$OLDCOMP" + chmod a+x "$OLDCOMP" } if [[ -e "/opt/rocm/hcc/bin/hcc" ]]; then diff --git a/.circleci/scripts/binary_ios_build.sh b/.circleci/scripts/binary_ios_build.sh index efab1e5ded3a..1166b3a1bab7 100644 --- a/.circleci/scripts/binary_ios_build.sh +++ b/.circleci/scripts/binary_ios_build.sh @@ -16,6 +16,7 @@ source ~/anaconda/bin/activate # Install dependencies conda install numpy ninja pyyaml mkl mkl-include setuptools cmake cffi typing requests --yes +conda install -c conda-forge valgrind --yes export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"} # sync submodules diff --git a/.circleci/scripts/binary_ios_test.sh b/.circleci/scripts/binary_ios_test.sh index be281120016a..863b21724a5d 100644 --- a/.circleci/scripts/binary_ios_test.sh +++ b/.circleci/scripts/binary_ios_test.sh @@ -13,7 +13,7 @@ base64 --decode cert.txt -o Certificates.p12 rm cert.txt bundle exec fastlane install_cert # install the provisioning profile -PROFILE=TestApp_CI.mobileprovision +PROFILE=PyTorch_CI_2021.mobileprovision PROVISIONING_PROFILES=~/Library/MobileDevice/Provisioning\ Profiles mkdir -pv "${PROVISIONING_PROFILES}" cd "${PROVISIONING_PROFILES}" @@ -25,5 +25,5 @@ if ! [ -x "$(command -v xcodebuild)" ]; then echo 'Error: xcodebuild is not installed.' 
exit 1 fi -PROFILE=TestApp_CI +PROFILE=PyTorch_CI_2021 ruby ${PROJ_ROOT}/scripts/xcode_build.rb -i ${PROJ_ROOT}/build_ios/install -x ${PROJ_ROOT}/ios/TestApp/TestApp.xcodeproj -p ${IOS_PLATFORM} -c ${PROFILE} -t ${IOS_DEV_TEAM_ID} diff --git a/.circleci/scripts/binary_populate_env.sh b/.circleci/scripts/binary_populate_env.sh index d4c31cefc7e5..ec7651823536 100755 --- a/.circleci/scripts/binary_populate_env.sh +++ b/.circleci/scripts/binary_populate_env.sh @@ -73,7 +73,7 @@ PIP_UPLOAD_FOLDER='nightly/' # We put this here so that OVERRIDE_PACKAGE_VERSION below can read from it export DATE="$(date -u +%Y%m%d)" #TODO: We should be pulling semver version from the base version.txt -BASE_BUILD_VERSION="1.7.0.dev$DATE" +BASE_BUILD_VERSION="1.8.0.dev$DATE" # Change BASE_BUILD_VERSION to git tag when on a git tag # Use 'git -C' to make doubly sure we're in the correct directory for checking # the git tag @@ -130,7 +130,7 @@ if [[ "${BUILD_FOR_SYSTEM:-}" == "windows" ]]; then fi export DATE="$DATE" -export NIGHTLIES_DATE_PREAMBLE=1.7.0.dev +export NIGHTLIES_DATE_PREAMBLE=1.8.0.dev export PYTORCH_BUILD_VERSION="$PYTORCH_BUILD_VERSION" export PYTORCH_BUILD_NUMBER="$PYTORCH_BUILD_NUMBER" export OVERRIDE_PACKAGE_VERSION="$PYTORCH_BUILD_VERSION" diff --git a/.circleci/verbatim-sources/job-specs/binary-job-specs.yml b/.circleci/verbatim-sources/job-specs/binary-job-specs.yml index bd26e8b2b373..489dfefdbff1 100644 --- a/.circleci/verbatim-sources/job-specs/binary-job-specs.yml +++ b/.circleci/verbatim-sources/job-specs/binary-job-specs.yml @@ -135,7 +135,7 @@ smoke_mac_test: <<: *binary_linux_test_upload_params macos: - xcode: "9.4.1" + xcode: "12.0" steps: - checkout - run: @@ -160,7 +160,7 @@ binary_mac_build: <<: *binary_mac_params macos: - xcode: "9.4.1" + xcode: "12.0" steps: # See Note [Workspace for CircleCI scripts] in job-specs-setup.yml - checkout @@ -201,7 +201,7 @@ binary_ios_build: <<: *pytorch_ios_params macos: - xcode: "11.2.1" + xcode: "12.0" steps: - attach_workspace: at: ~/workspace @@ -228,7 +228,7 @@ binary_ios_upload: <<: *pytorch_ios_params macos: - xcode: "11.2.1" + xcode: "12.0" steps: - attach_workspace: at: ~/workspace diff --git a/.circleci/verbatim-sources/job-specs/job-specs-custom.yml b/.circleci/verbatim-sources/job-specs/job-specs-custom.yml index 810f16922d5c..5c7c9bf0462c 100644 --- a/.circleci/verbatim-sources/job-specs/job-specs-custom.yml +++ b/.circleci/verbatim-sources/job-specs/job-specs-custom.yml @@ -43,10 +43,13 @@ set -ex export COMMIT_DOCKER_IMAGE=${DOCKER_IMAGE}:${DOCKER_TAG}-${CIRCLE_SHA1} echo "DOCKER_IMAGE: "${COMMIT_DOCKER_IMAGE} + tag=${CIRCLE_TAG:1:5} + target=${tag:-master} + echo "building for ${target}" time docker pull ${COMMIT_DOCKER_IMAGE} >/dev/null export id=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE}) - export COMMAND='((echo "sudo chown -R jenkins workspace && cd workspace && . ./.circleci/scripts/python_doc_push_script.sh docs/master master site") | docker exec -u jenkins -i "$id" bash) 2>&1' + export COMMAND='((echo "sudo chown -R jenkins workspace && cd workspace && . 
./.circleci/scripts/python_doc_push_script.sh docs/'$target' master site") | docker exec -u jenkins -i "$id" bash) 2>&1' echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts @@ -85,10 +88,13 @@ set -ex export COMMIT_DOCKER_IMAGE=${DOCKER_IMAGE}:${DOCKER_TAG}-${CIRCLE_SHA1} echo "DOCKER_IMAGE: "${COMMIT_DOCKER_IMAGE} + tag=${CIRCLE_TAG:1:5} + target=${tag:-master} + echo "building for ${target}" time docker pull ${COMMIT_DOCKER_IMAGE} >/dev/null export id=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE}) - export COMMAND='((echo "sudo chown -R jenkins workspace && cd workspace && . ./.circleci/scripts/cpp_doc_push_script.sh docs/master master") | docker exec -u jenkins -i "$id" bash) 2>&1' + export COMMAND='((echo "sudo chown -R jenkins workspace && cd workspace && . ./.circleci/scripts/cpp_doc_push_script.sh docs/"$target" master") | docker exec -u jenkins -i "$id" bash) 2>&1' echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts @@ -109,7 +115,7 @@ environment: BUILD_ENVIRONMENT: pytorch-macos-10.13-py3-build macos: - xcode: "9.4.1" + xcode: "12.0" steps: - checkout - run_brew_for_macos_build @@ -143,7 +149,7 @@ environment: BUILD_ENVIRONMENT: pytorch-macos-10.13-py3-test macos: - xcode: "9.4.1" + xcode: "12.0" steps: - checkout - attach_workspace: @@ -371,7 +377,7 @@ pytorch_ios_build: <<: *pytorch_ios_params macos: - xcode: "11.2.1" + xcode: "12.0" steps: - checkout - run_brew_for_ios_build @@ -390,7 +396,7 @@ rm cert.txt bundle exec fastlane install_cert # install the provisioning profile - PROFILE=TestApp_CI.mobileprovision + PROFILE=PyTorch_CI_2021.mobileprovision PROVISIONING_PROFILES=~/Library/MobileDevice/Provisioning\ Profiles mkdir -pv "${PROVISIONING_PROFILES}" cd "${PROVISIONING_PROFILES}" @@ -448,7 +454,7 @@ command: | set -e PROJ_ROOT=/Users/distiller/project - PROFILE=TestApp_CI + PROFILE=PyTorch_CI_2021 # run the ruby build script if ! [ -x "$(command -v xcodebuild)" ]; then echo 'Error: xcodebuild is not installed.' diff --git a/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml b/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml index 0f0dd76636b4..3bc7e5855a41 100644 --- a/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml +++ b/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml @@ -206,6 +206,7 @@ jobs: export CIRCLE_SHA1="$CIRCLE_SHA1" export CIRCLE_PR_NUMBER="${CIRCLE_PR_NUMBER:-}" export CIRCLE_BRANCH="$CIRCLE_BRANCH" + export CIRCLE_JOB="$CIRCLE_JOB" cd workspace python test/print_test_stats.py test EOL diff --git a/.github/workflows/jit_triage.yml b/.github/workflows/jit_triage.yml index af59d2160ec6..1fb967e8ffb8 100644 --- a/.github/workflows/jit_triage.yml +++ b/.github/workflows/jit_triage.yml @@ -19,7 +19,7 @@ jobs: // - io: A reference to the @actions/io package // Check if issue has a JIT label. 
- const kJitLabel = "jit"; + const kJitLabel = "oncall: jit"; issue = await github.issues.get({ owner: context.issue.owner, diff --git a/.github/workflows/quantization_triage.yml b/.github/workflows/quantization_triage.yml new file mode 100644 index 000000000000..ac337a066873 --- /dev/null +++ b/.github/workflows/quantization_triage.yml @@ -0,0 +1,78 @@ +name: quantization-triage + +on: + issues: + types: [labeled] + +jobs: + welcome: + runs-on: ubuntu-latest + steps: + - uses: actions/github-script@v2 + with: + github-token: ${{secrets.GITHUB_TOKEN}} + script: | + // Arguments available: + // - github: A pre-authenticated octokit/rest.js client + // - context: An object containing the context of the workflow run + // - core: A reference to the @actions/core package + // - io: A reference to the @actions/io package + + // Check if issue has a Quantization label. + const kQuantizationLabel = "oncall: quantization"; + + issue = await github.issues.get({ + owner: context.issue.owner, + repo: context.issue.repo, + issue_number: context.issue.number, + }) + + const hasQuantizationLabel = issue.data.labels.filter(label => label.name == kQuantizationLabel).length > 0; + + if (!hasQuantizationLabel) { + core.debug("Issue " + issue.data.title + " does not have Quantization label"); + return; + } + + // Get project column ID. + const kProjectName = "Quantization Triage"; + const kColumnName = "Need Triage"; + + // Query all projects in the repository. + // TODO: Support pagination once there are > 30 projects. + const projects = await github.projects.listForRepo({ + owner: context.issue.owner, + repo: context.issue.repo, + }); + + // Filter out unwanted projects and get the ID for the Quantization Triage project. + const filteredProjects = projects.data.filter(project => project.name == kProjectName); + + if (filteredProjects.length != 1) { + core.setFailed("Unable to find a project named " + kProjectName); + return; + } + + const projectId = filteredProjects[0].id; + // First, query all columns in the project. + // TODO: Support pagination once there are > 30 columns. + const columns = await github.projects.listColumns({ + project_id: projectId, + }); + + // Filter out unwanted projects and get the ID for the Need triage column. + const filteredColumns = columns.data.filter(column => column.name == kColumnName); + + if (filteredColumns.length != 1) { + core.setFailed("Unable to find a column named " + kColumnName); + return; + } + + const columnId = filteredColumns[0].id; + + // Create a project card for this new issue. 
+ await github.projects.createCard({ + column_id: columnId, + content_id: issue.data.id, + content_type: "Issue", + }) diff --git a/.gitmodules b/.gitmodules index 509ab94f1cf4..d7a11cc22996 100644 --- a/.gitmodules +++ b/.gitmodules @@ -130,3 +130,7 @@ ignore = dirty path = third_party/tensorpipe url = https://github.com/pytorch/tensorpipe.git +[submodule "third_party/valgrind"] + ignore = dirty + path = third_party/valgrind + url = https://sourceware.org/git/valgrind.git diff --git a/.jenkins/caffe2/test.sh b/.jenkins/caffe2/test.sh index 20a7310a91c1..58b3979f7829 100755 --- a/.jenkins/caffe2/test.sh +++ b/.jenkins/caffe2/test.sh @@ -163,7 +163,7 @@ pip install --user pytest-sugar if [[ "$BUILD_ENVIRONMENT" == *onnx* ]]; then # Check out torch/vision at Jun 11 2020 commit # This hash must match one in .jenkins/pytorch/test.sh - pip install -q --user git+https://github.com/pytorch/vision.git@c2e8a00885e68ae1200eb6440f540e181d9125de + pip install -q --user git+https://github.com/pytorch/vision.git@e70c91a9ff9b8a20e05c133aec6ec3ed538c32fb pip install -q --user ninja # JIT C++ extensions require ninja, so put it into PATH. export PATH="/var/lib/jenkins/.local/bin:$PATH" @@ -171,7 +171,7 @@ if [[ "$BUILD_ENVIRONMENT" == *onnx* ]]; then # default pip version is too old(9.0.2), unable to support tag `manylinux2010`. # Fix the pip error: Couldn't find a version that satisfies the requirement pip install --upgrade pip - pip install -q --user -i https://test.pypi.org/simple/ ort-nightly==1.4.0.dev202008122 + pip install -q --user -i https://test.pypi.org/simple/ ort-nightly==1.5.0.dev202009182 fi "$ROOT_DIR/scripts/onnx/test.sh" fi diff --git a/.jenkins/pytorch/common_utils.sh b/.jenkins/pytorch/common_utils.sh index 682dd29b4cff..24d6f5676f7d 100644 --- a/.jenkins/pytorch/common_utils.sh +++ b/.jenkins/pytorch/common_utils.sh @@ -66,7 +66,7 @@ function get_bazel() { chmod +x tools/bazel } -TORCHVISION_COMMIT=c2e8a00885e68ae1200eb6440f540e181d9125de +TORCHVISION_COMMIT=e70c91a9ff9b8a20e05c133aec6ec3ed538c32fb function install_torchvision() { # Check out torch/vision at Jun 11 2020 commit diff --git a/.jenkins/pytorch/macos-test.sh b/.jenkins/pytorch/macos-test.sh index 213750ba7280..8e71738f414e 100755 --- a/.jenkins/pytorch/macos-test.sh +++ b/.jenkins/pytorch/macos-test.sh @@ -63,7 +63,7 @@ test_python_all() { # Increase default limit on open file handles from 256 to 1024 ulimit -n 1024 - python test/run_test.py --verbose --exclude test_jit_cuda_fuser_profiling test_jit_cuda_fuser_legacy test_jit_legacy test_jit_fuser_legacy --determine-from="$DETERMINE_FROM" + python test/run_test.py --verbose --exclude-jit-executor --determine-from="$DETERMINE_FROM" assert_git_not_dirty } diff --git a/.jenkins/pytorch/print_sccache_log.py b/.jenkins/pytorch/print_sccache_log.py index c91472876c33..81c7e0752328 100644 --- a/.jenkins/pytorch/print_sccache_log.py +++ b/.jenkins/pytorch/print_sccache_log.py @@ -6,6 +6,7 @@ lines = f.readlines() for line in lines: - # Ignore errors from CPU instruction set testing - if 'src.c' not in line: + # Ignore errors from CPU instruction set or symbol existing testing + keywords = ['src.c', 'CheckSymbolExists.c'] + if all([keyword not in line for keyword in keywords]): print(line) diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh index 7e85039a72d1..0e35364a2f5d 100755 --- a/.jenkins/pytorch/test.sh +++ b/.jenkins/pytorch/test.sh @@ -126,23 +126,18 @@ if ([ -n "$CIRCLE_PULL_REQUEST" ] && [[ "$BUILD_ENVIRONMENT" != *coverage* ]]); file_diff_from_base 
"$DETERMINE_FROM" fi -test_python_nn() { - time python test/run_test.py --include test_nn --verbose --determine-from="$DETERMINE_FROM" - assert_git_not_dirty -} - -test_python_ge_config_profiling() { - time python test/run_test.py --include test_jit_cuda_fuser_profiling test_jit_profiling test_jit_fuser_te test_tensorexpr --verbose --determine-from="$DETERMINE_FROM" +test_python_legacy_jit() { + time python test/run_test.py --include test_jit_cuda_fuser_legacy test_jit_legacy test_jit_fuser_legacy --verbose --determine-from="$DETERMINE_FROM" assert_git_not_dirty } -test_python_ge_config_legacy() { - time python test/run_test.py --include test_jit_cuda_fuser_legacy test_jit_legacy test_jit_fuser_legacy --verbose --determine-from="$DETERMINE_FROM" +test_python_shard1() { + time python test/run_test.py --exclude-jit-executor --shard 1 2 --verbose --determine-from="$DETERMINE_FROM" assert_git_not_dirty } -test_python_all_except_nn_and_cpp_extensions() { - time python test/run_test.py --exclude test_jit_cuda_fuser_profiling test_jit_cuda_fuser_legacy test_nn test_jit_profiling test_jit_legacy test_jit_fuser_legacy test_jit_fuser_te test_tensorexpr --verbose --determine-from="$DETERMINE_FROM" +test_python_shard2() { + time python test/run_test.py --exclude-jit-executor --shard 2 2 --verbose --determine-from="$DETERMINE_FROM" assert_git_not_dirty } @@ -304,7 +299,7 @@ test_xla() { assert_git_not_dirty } -# Do NOT run this test before any other tests, like test_python_nn, etc. +# Do NOT run this test before any other tests, like test_python_shard1, etc. # Because this function uninstalls the torch built from branch, and install # nightly version. test_backward_compatibility() { @@ -338,6 +333,8 @@ test_benchmarks() { pip_install --user "requests" BENCHMARK_DATA="benchmarks/.data" mkdir -p ${BENCHMARK_DATA} + pytest benchmarks/fastrnns/test_bench.py --benchmark-sort=Name --benchmark-json=${BENCHMARK_DATA}/fastrnns_default.json --fuser=default --executor=default + python benchmarks/upload_scribe.py --pytest_bench_json ${BENCHMARK_DATA}/fastrnns_default.json pytest benchmarks/fastrnns/test_bench.py --benchmark-sort=Name --benchmark-json=${BENCHMARK_DATA}/fastrnns_legacy_old.json --fuser=old --executor=legacy python benchmarks/upload_scribe.py --pytest_bench_json ${BENCHMARK_DATA}/fastrnns_legacy_old.json pytest benchmarks/fastrnns/test_bench.py --benchmark-sort=Name --benchmark-json=${BENCHMARK_DATA}/fastrnns_profiling_te.json --fuser=te --executor=profiling @@ -379,19 +376,17 @@ if [[ "${BUILD_ENVIRONMENT}" == *backward* ]]; then elif [[ "${BUILD_ENVIRONMENT}" == *xla* || "${JOB_BASE_NAME}" == *xla* ]]; then install_torchvision test_xla -elif [[ "${BUILD_ENVIRONMENT}" == *ge_config_legacy* || "${JOB_BASE_NAME}" == *ge_config_legacy* ]]; then - test_python_ge_config_legacy -elif [[ "${BUILD_ENVIRONMENT}" == *ge_config_profiling* || "${JOB_BASE_NAME}" == *ge_config_profiling* ]]; then - test_python_ge_config_profiling +elif [[ "${BUILD_ENVIRONMENT}" == *legacy_jit* || "${JOB_BASE_NAME}" == *legacy_jit* ]]; then + test_python_legacy_jit elif [[ "${BUILD_ENVIRONMENT}" == *libtorch* ]]; then # TODO: run some C++ tests echo "no-op at the moment" elif [[ "${BUILD_ENVIRONMENT}" == *-test1 || "${JOB_BASE_NAME}" == *-test1 ]]; then - test_python_nn - test_cpp_extensions + install_torchvision + test_python_shard1 elif [[ "${BUILD_ENVIRONMENT}" == *-test2 || "${JOB_BASE_NAME}" == *-test2 ]]; then install_torchvision - test_python_all_except_nn_and_cpp_extensions + test_python_shard2 test_aten test_libtorch 
test_custom_script_ops @@ -407,9 +402,8 @@ elif [[ "${BUILD_ENVIRONMENT}" == pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc5.4 test_cpp_extensions else install_torchvision - test_python_nn - test_python_all_except_nn_and_cpp_extensions - test_cpp_extensions + test_python_shard1 + test_python_shard2 test_aten test_vec256 test_libtorch diff --git a/.jenkins/pytorch/win-test-helpers/installation-helpers/install_miniconda3.bat b/.jenkins/pytorch/win-test-helpers/installation-helpers/install_miniconda3.bat index a66ef4b651c5..cf7255ce3789 100644 --- a/.jenkins/pytorch/win-test-helpers/installation-helpers/install_miniconda3.bat +++ b/.jenkins/pytorch/win-test-helpers/installation-helpers/install_miniconda3.bat @@ -12,4 +12,11 @@ call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Minic if "%REBUILD%"=="" ( call conda install -y -q python=%PYTHON_VERSION% numpy cffi pyyaml boto3 call conda install -y -q -c conda-forge cmake + call conda install -y -q -c rdonnelly libuv ) + +:: Get installed libuv path +@echo off +set libuv_ROOT=%CONDA_PARENT_DIR%\Miniconda3\Library +@echo on +echo libuv_ROOT=%libuv_ROOT% diff --git a/.jenkins/pytorch/win-test-helpers/test_python_all_except_nn.bat b/.jenkins/pytorch/win-test-helpers/test_python_all_except_nn.bat index 4bfb5bc85e66..d76637dd0db7 100644 --- a/.jenkins/pytorch/win-test-helpers/test_python_all_except_nn.bat +++ b/.jenkins/pytorch/win-test-helpers/test_python_all_except_nn.bat @@ -1,3 +1,3 @@ call %SCRIPT_HELPERS_DIR%\setup_pytorch_env.bat -cd test && python run_test.py --exclude test_jit_cuda_fuser_profiling test_jit_cuda_fuser_legacy test_jit_profiling test_jit_legacy test_jit_fuser_legacy test_jit_fuser_te test_tensorexpr --verbose --determine-from="%1" && cd .. +cd test && python run_test.py --exclude-jit-executor --verbose --determine-from="%1" && cd .. 
if ERRORLEVEL 1 exit /b 1 diff --git a/.jenkins/pytorch/win-test-helpers/test_python_jit_profiling.bat b/.jenkins/pytorch/win-test-helpers/test_python_jit_legacy.bat similarity index 51% rename from .jenkins/pytorch/win-test-helpers/test_python_jit_profiling.bat rename to .jenkins/pytorch/win-test-helpers/test_python_jit_legacy.bat index e437833d8c62..a9168644f471 100644 --- a/.jenkins/pytorch/win-test-helpers/test_python_jit_profiling.bat +++ b/.jenkins/pytorch/win-test-helpers/test_python_jit_legacy.bat @@ -3,9 +3,7 @@ call %SCRIPT_HELPERS_DIR%\setup_pytorch_env.bat pushd test echo Run jit_profiling tests -python run_test.py --include test_jit_profiling test_jit_fuser_te test_tensorexpr --verbose --determine-from="%1" +python run_test.py --include test_jit_legacy test_jit_fuser_legacy --verbose --determine-from="%1" if ERRORLEVEL 1 exit /b 1 popd - - diff --git a/.jenkins/pytorch/win-test.sh b/.jenkins/pytorch/win-test.sh index 0b0159d04a50..abcd5756d747 100755 --- a/.jenkins/pytorch/win-test.sh +++ b/.jenkins/pytorch/win-test.sh @@ -48,7 +48,7 @@ run_tests() { $SCRIPT_HELPERS_DIR/test_python_nn.bat "$DETERMINE_FROM" && \ $SCRIPT_HELPERS_DIR/test_libtorch.bat if [[ "${USE_CUDA}" == "1" ]]; then - $SCRIPT_HELPERS_DIR/test_python_jit_profiling.bat "$DETERMINE_FROM" + $SCRIPT_HELPERS_DIR/test_python_jit_legacy.bat "$DETERMINE_FROM" fi elif [[ "${JOB_BASE_NAME}" == *-test2 ]]; then $SCRIPT_HELPERS_DIR/test_python_all_except_nn.bat "$DETERMINE_FROM" && \ diff --git a/BUILD.bazel b/BUILD.bazel index 016863ff0958..a8ea7988a242 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -55,6 +55,7 @@ cc_library( "c10/cuda/*.h", "c10/cuda/impl/*.h", "c10/macros/*.h", + "c10/mobile/*.h", "c10/util/*.h", "c10/util/*.hpp", ]), @@ -71,6 +72,7 @@ cc_library( srcs = glob([ "c10/core/*.cpp", "c10/core/impl/*.cpp", + "c10/mobile/*.cpp", "c10/util/*.cpp", ]) + if_cuda( glob([ @@ -721,6 +723,7 @@ torch_cuda_half_options = [ "-DCUDA_HAS_FP16=1", "-D__CUDA_NO_HALF_OPERATORS__", "-D__CUDA_NO_HALF_CONVERSIONS__", + "-D__CUDA_NO_BFLOAT16_CONVERSIONS__", "-D__CUDA_NO_HALF2_OPERATORS__", ] diff --git a/CMakeLists.txt b/CMakeLists.txt index 826c187b602e..0d1225ab450e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -103,7 +103,7 @@ endif() # For non-supported platforms, turn USE_DISTRIBUTED off by default. # It is not tested and likely won't work without additional changes. -if(NOT LINUX) +if(NOT LINUX AND NOT WIN32) set(USE_DISTRIBUTED OFF CACHE STRING "Use distributed") # On macOS, if USE_DISTRIBUTED is enabled (specified by the user), # then make Gloo build with the libuv transport. @@ -226,6 +226,32 @@ option(USE_TBB "Use TBB" OFF) option(ONNX_ML "Enable traditional ONNX ML API." ON) option(HAVE_SOVERSION "Whether to add SOVERSION to the shared objects" OFF) +# Since TensorPipe does not support Windows, set it to OFF when WIN32 detected +# On Windows platform, if user does not install libuv in build conda env and +# does not set libuv_ROOT environment variable. Set USE_DISTRIBUTED to OFF. +if(WIN32) + set(USE_TENSORPIPE OFF) + message(WARNING "TensorPipe cannot be used on Windows. Set it to OFF") + + if(USE_DISTRIBUTED AND NOT DEFINED ENV{libuv_ROOT}) + find_library( + libuv_tmp_LIBRARY + NAMES uv libuv + HINTS $ENV{CONDA_PREFIX}\\Library + PATH_SUFFIXES lib + REQUIRED + NO_DEFAULT_PATH) + if(NOT EXISTS ${libuv_tmp_LIBRARY}) + set(USE_DISTRIBUTED OFF) + set(USE_GLOO OFF) + message( + WARNING "Libuv is not installed in current conda env. 
Set USE_DISTRIBUTED to OFF.") + else() + set(ENV{libuv_ROOT} $ENV{CONDA_PREFIX}\\Library) + endif() + endif() +endif() + # Linux distributions do not want too many embedded sources, in that sense we # need to be able to build pytorch with an (almost) empty third_party # directory. diff --git a/CODEOWNERS b/CODEOWNERS index 77b8d2cbcb36..42aa83bb61bf 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -23,9 +23,9 @@ # Distributed package # This list is mostly if you'd like to be tagged as reviewer, feel free to add # or remove yourself from it. -/torch/lib/c10d/ @pietern @mrshenli @zhaojuanmao @pritamdamania87 @rohan-varma -/torch/csrc/distributed/ @pietern @mrshenli @zhaojuanmao @pritamdamania87 @rohan-varma -/torch/distributed/ @apaszke @pietern @mrshenli @zhaojuanmao @pritamdamania87 @rohan-varma +/torch/lib/c10d/ @pietern @mrshenli @zhaojuanmao @pritamdamania87 @rohan-varma @mingzhe09088 +/torch/csrc/distributed/ @pietern @mrshenli @zhaojuanmao @pritamdamania87 @rohan-varma @mingzhe09088 +/torch/distributed/ @apaszke @pietern @mrshenli @zhaojuanmao @pritamdamania87 @rohan-varma @mingzhe09088 # Distributed tests # This list is mostly if you'd like to be tagged as reviewer, feel free to add diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 03ad14dd843e..a1b4096592a7 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -118,11 +118,37 @@ For example: - modify your Python file `torch/__init__.py` - test functionality -You do not need to repeatedly install after modifying Python files. +You do not need to repeatedly install after modifying Python files (`.py`). However, you would need to reinstall +if you modify Python interface (`.pyi`, `.pyi.in`) or non-Python files (`.cpp`, `.cc`, `.cu`, `.h`, ...). In case you want to reinstall, make sure that you uninstall PyTorch first by running `pip uninstall torch` and `python setup.py clean`. Then you can install in `develop` mode again. +### Tips and Debugging +* A prerequisite to installing PyTorch is CMake. We recommend installing it with [Homebrew](https://brew.sh/) +with `brew install cmake` if you are developing on MacOS or Linux system. +* Our `setup.py` requires Python >= 3.6 +* If you run into errors when running `python setup.py develop`, here are some debugging steps: + 1. Run `printf '#include \nint main() { printf("Hello World");}'|clang -x c -; ./a.out` to make sure + your CMake works and can compile this simple Hello World program without errors. + 2. Nuke your `build` directory. The `setup.py` script compiles binaries into the `build` folder and caches many + details along the way, which saves time the next time you build. If you're running into issues, you can always + `rm -rf build` from the toplevel `pytorch` directory and start over. + 3. If you have made edits to the PyTorch repo, commit any change you'd like to keep and clean the repo with the + following commands (note that clean _really_ removes all untracked files and changes.): + ```bash + git submodule deinit -f . + git clean -xdf + python setup.py clean + git submodule update --init --recursive # very important to sync the submodules + python setup.py develop # then try running the command again + ``` + 4. The main step within `python setup.py develop` is running `make` from the `build` directory. 
If you want to + experiment with some environment variables, you can pass them into the command: + ```bash + ENV_KEY1=ENV_VAL1[, ENV_KEY2=ENV_VAL2]* python setup.py develop + ``` + ## Nightly Checkout & Pull The `tools/nightly.py` script is provided to ease pure Python development of @@ -489,8 +515,7 @@ only interested in a specific component. - Working on a test binary? Run `(cd build && ninja bin/test_binary_name)` to rebuild only that test binary (without rerunning cmake). (Replace `ninja` with `make` if you don't have ninja installed). -- Don't need Caffe2? Pass `BUILD_CAFFE2_OPS=0` to disable build of - Caffe2 operators. +- Don't need Caffe2? Pass `BUILD_CAFFE2=0` to disable Caffe2 build. On the initial build, you can also speed things up with the environment variables `DEBUG`, `USE_DISTRIBUTED`, `USE_MKLDNN`, `USE_CUDA`, `BUILD_TEST`, `USE_FBGEMM`, `USE_NNPACK` and `USE_QNNPACK`. diff --git a/Dockerfile b/Dockerfile index d5619e1a8011..3706aa38b461 100644 --- a/Dockerfile +++ b/Dockerfile @@ -8,7 +8,7 @@ # For reference: # https://docs.docker.com/develop/develop-images/build_enhancements/ ARG BASE_IMAGE=ubuntu:18.04 -ARG PYTHON_VERSION=3.7 +ARG PYTHON_VERSION=3.8 FROM ${BASE_IMAGE} as dev-base RUN --mount=type=cache,id=apt-dev,target=/var/cache/apt \ @@ -44,14 +44,15 @@ WORKDIR /opt/pytorch COPY --from=conda /opt/conda /opt/conda COPY --from=submodule-update /opt/pytorch /opt/pytorch RUN --mount=type=cache,target=/opt/ccache \ - TORCH_CUDA_ARCH_LIST="3.5 5.2 6.0 6.1 7.0+PTX" TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \ + TORCH_CUDA_ARCH_LIST="3.5 5.2 6.0 6.1 7.0+PTX 8.0" TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \ CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" \ python setup.py install FROM conda as conda-installs ARG INSTALL_CHANNEL=pytorch-nightly -RUN /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -y pytorch torchvision cudatoolkit=10.1 && \ +RUN /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -y pytorch torchvision cudatoolkit=11.0.221 && \ /opt/conda/bin/conda clean -ya +RUN /opt/conda/bin/pip install torchelastic FROM ${BASE_IMAGE} as official LABEL com.nvidia.volumes.needed="nvidia_driver" diff --git a/README.md b/README.md index 6191cabcb685..c6c1138747a2 100644 --- a/README.md +++ b/README.md @@ -184,6 +184,13 @@ On MacOS conda install pkg-config libuv ``` +On Windows +```bash +# Add these packages if torch.distributed is needed. +# Distributed package support on Windows is a prototype feature and is subject to changes. +conda install -c conda-forge libuv=1.39 +``` + #### Get the PyTorch Source ```bash git clone --recursive https://github.com/pytorch/pytorch diff --git a/android/README.md b/android/README.md index bf5fa02e6cf4..e67b2e6ec071 100644 --- a/android/README.md +++ b/android/README.md @@ -15,8 +15,8 @@ repositories { } dependencies { - implementation 'org.pytorch:pytorch_android:1.5.0' - implementation 'org.pytorch:pytorch_android_torchvision:1.5.0' + implementation 'org.pytorch:pytorch_android:1.6.0' + implementation 'org.pytorch:pytorch_android_torchvision:1.6.0' } ``` @@ -34,12 +34,12 @@ repositories { dependencies { ... - implementation 'org.pytorch:pytorch_android:1.7.0-SNAPSHOT' - implementation 'org.pytorch:pytorch_android_torchvision:1.7.0-SNAPSHOT' + implementation 'org.pytorch:pytorch_android:1.8.0-SNAPSHOT' + implementation 'org.pytorch:pytorch_android_torchvision:1.8.0-SNAPSHOT' ... } ``` -The current nightly(snapshots) version is the value of `VERSION_NAME` in `gradle.properties` in current folder, at this moment it is `1.7.0-SNAPSHOT`. 
+The current nightly(snapshots) version is the value of `VERSION_NAME` in `gradle.properties` in current folder, at this moment it is `1.8.0-SNAPSHOT`. ## Building PyTorch Android from Source diff --git a/android/gradle.properties b/android/gradle.properties index 6e0dc0ac86b0..0ab42c56396d 100644 --- a/android/gradle.properties +++ b/android/gradle.properties @@ -1,6 +1,6 @@ ABI_FILTERS=armeabi-v7a,arm64-v8a,x86,x86_64 -VERSION_NAME=1.7.0-SNAPSHOT +VERSION_NAME=1.8.0-SNAPSHOT GROUP=org.pytorch MAVEN_GROUP=org.pytorch POM_URL=https://github.com/pytorch/pytorch/tree/master/android diff --git a/android/pytorch_android/src/main/cpp/pytorch_jni_common.cpp b/android/pytorch_android/src/main/cpp/pytorch_jni_common.cpp index 11696daf43a2..fed6170c2bf3 100644 --- a/android/pytorch_android/src/main/cpp/pytorch_jni_common.cpp +++ b/android/pytorch_android/src/main/cpp/pytorch_jni_common.cpp @@ -18,6 +18,17 @@ namespace pytorch_jni { +c10::DeviceType deviceJniCodeToDeviceType(jint deviceJniCode) { + if (deviceJniCode == kDeviceCPU) { + return at::kCPU; + } else if (deviceJniCode == kDeviceVulkan) { + return at::kVulkan; + } + + facebook::jni::throwNewJavaException( + facebook::jni::gJavaLangIllegalArgumentException, "Unknown device"); +} + bool Trace::is_initialized_ = false; #if defined(TRACE_ENABLED) && defined(__ANDROID__) diff --git a/android/pytorch_android/src/main/cpp/pytorch_jni_common.h b/android/pytorch_android/src/main/cpp/pytorch_jni_common.h index fb974d4ad702..9b4e7e5f84a1 100644 --- a/android/pytorch_android/src/main/cpp/pytorch_jni_common.h +++ b/android/pytorch_android/src/main/cpp/pytorch_jni_common.h @@ -1,3 +1,5 @@ +#pragma once + #include #include @@ -18,6 +20,11 @@ namespace pytorch_jni { +constexpr static int kDeviceCPU = 1; +constexpr static int kDeviceVulkan = 2; + +c10::DeviceType deviceJniCodeToDeviceType(jint deviceJniCode); + class Trace { public: #if defined(TRACE_ENABLED) && defined(__ANDROID__) diff --git a/android/pytorch_android/src/main/cpp/pytorch_jni_jit.cpp b/android/pytorch_android/src/main/cpp/pytorch_jni_jit.cpp index b05c19665f20..e4bb4c083160 100644 --- a/android/pytorch_android/src/main/cpp/pytorch_jni_jit.cpp +++ b/android/pytorch_android/src/main/cpp/pytorch_jni_jit.cpp @@ -67,22 +67,25 @@ class PytorchJni : public facebook::jni::HybridClass { private: friend HybridBase; torch::jit::Module module_; + c10::DeviceType deviceType_; public: constexpr static auto kJavaDescriptor = "Lorg/pytorch/NativePeer;"; static facebook::jni::local_ref initHybrid( facebook::jni::alias_ref, - facebook::jni::alias_ref modelPath) { - return makeCxxInstance(modelPath); + facebook::jni::alias_ref modelPath, + jint device) { + return makeCxxInstance(modelPath, device); } #ifdef __ANDROID__ static facebook::jni::local_ref initHybridAndroidAsset( facebook::jni::alias_ref, facebook::jni::alias_ref assetName, - facebook::jni::alias_ref assetManager) { - return makeCxxInstance(assetName, assetManager); + facebook::jni::alias_ref assetManager, + jint device) { + return makeCxxInstance(assetName, assetManager, device); } #endif @@ -127,17 +130,19 @@ class PytorchJni : public facebook::jni::HybridClass { ((void)once); } - PytorchJni(facebook::jni::alias_ref modelPath) { + PytorchJni(facebook::jni::alias_ref modelPath, jint device) { preModuleLoadSetup(); JITCallGuard guard; module_ = torch::jit::load(std::move(modelPath->toStdString())); module_.eval(); + deviceType_ = deviceJniCodeToDeviceType(device); } #ifdef __ANDROID__ PytorchJni( facebook::jni::alias_ref assetName, - 
facebook::jni::alias_ref assetManager) { + facebook::jni::alias_ref assetManager, + jint device) { preModuleLoadSetup(); JNIEnv* env = facebook::jni::Environment::current(); AAssetManager* mgr = AAssetManager_fromJava(env, assetManager.get()); @@ -166,6 +171,7 @@ class PytorchJni : public facebook::jni::HybridClass { assetBuffer, AAsset_getLength(asset))); AAsset_close(asset); module_.eval(); + deviceType_ = deviceJniCodeToDeviceType(device); } #endif @@ -191,7 +197,14 @@ class PytorchJni : public facebook::jni::HybridClass { inputs.reserve(n); for (size_t i = 0; i < n; i++) { at::IValue atIValue = JIValue::JIValueToAtIValue(jinputs->getElement(i)); - inputs.push_back(std::move(atIValue)); + if (at::kVulkan == deviceType_) { + inputs.push_back( + atIValue.isTensor() ? at::IValue{atIValue.toTensor().vulkan()} + : std::move(atIValue)); + } else { + TORCH_CHECK(at::kCPU == deviceType_); + inputs.push_back(std::move(atIValue)); + } } auto output = [&]() { JITCallGuard guard; @@ -212,7 +225,14 @@ class PytorchJni : public facebook::jni::HybridClass { inputs.reserve(n); for (size_t i = 0; i < n; i++) { at::IValue atIValue = JIValue::JIValueToAtIValue(jinputs->getElement(i)); - inputs.push_back(std::move(atIValue)); + if (at::kVulkan == deviceType_) { + inputs.push_back( + atIValue.isTensor() ? at::IValue{atIValue.toTensor().vulkan()} + : std::move(atIValue)); + } else { + TORCH_CHECK(at::kCPU == deviceType_); + inputs.push_back(std::move(atIValue)); + } } if (auto method = module_.find_method(methodName)) { auto output = [&]() { diff --git a/android/pytorch_android/src/main/cpp/pytorch_jni_lite.cpp b/android/pytorch_android/src/main/cpp/pytorch_jni_lite.cpp index 061b85221fe9..8a96e395f267 100644 --- a/android/pytorch_android/src/main/cpp/pytorch_jni_lite.cpp +++ b/android/pytorch_android/src/main/cpp/pytorch_jni_lite.cpp @@ -30,9 +30,6 @@ struct LiteJITCallGuard { } // namespace class PytorchJni : public facebook::jni::HybridClass { - constexpr static int kDeviceCPU = 1; - constexpr static int kDeviceVulkan = 2; - private: friend HybridBase; torch::jit::mobile::Module module_; @@ -51,15 +48,7 @@ class PytorchJni : public facebook::jni::HybridClass { PytorchJni(facebook::jni::alias_ref modelPath, jint device) { LiteJITCallGuard guard; module_ = torch::jit::_load_for_mobile(std::move(modelPath->toStdString())); - if (device == kDeviceCPU) { - deviceType_ = at::kCPU; - } else if (device == kDeviceVulkan) { - deviceType_ = at::kVulkan; - } else { - facebook::jni::throwNewJavaException( - facebook::jni::gJavaLangIllegalArgumentException, - "Unknown device specified"); - } + deviceType_ = deviceJniCodeToDeviceType(device); } static void registerNatives() { @@ -108,7 +97,14 @@ class PytorchJni : public facebook::jni::HybridClass { inputs.reserve(n); for (size_t i = 0; i < n; i++) { at::IValue atIValue = JIValue::JIValueToAtIValue(jinputs->getElement(i)); - inputs.push_back(std::move(atIValue)); + if (at::kVulkan == deviceType_) { + inputs.push_back( + atIValue.isTensor() ? 
at::IValue{atIValue.toTensor().vulkan()} + : std::move(atIValue)); + } else { + TORCH_CHECK(at::kCPU == deviceType_); + inputs.push_back(std::move(atIValue)); + } } if (auto method = module_.find_method(methodName)) { auto output = [&]() { diff --git a/android/pytorch_android/src/main/java/org/pytorch/Module.java b/android/pytorch_android/src/main/java/org/pytorch/Module.java index 9dafc687f993..62db7042d57b 100644 --- a/android/pytorch_android/src/main/java/org/pytorch/Module.java +++ b/android/pytorch_android/src/main/java/org/pytorch/Module.java @@ -11,16 +11,28 @@ public class Module { private INativePeer mNativePeer; /** - * Loads a serialized TorchScript module from the specified path on the disk. + * Loads a serialized TorchScript module from the specified path on the disk to run on specified + * device. * * @param modelPath path to file that contains the serialized TorchScript module. + * @param device {@link org.pytorch.Device} to use for running specified module. * @return new {@link org.pytorch.Module} object which owns torch::jit::Module. */ - public static Module load(final String modelPath) { + public static Module load(final String modelPath, final Device device) { if (!NativeLoader.isInitialized()) { NativeLoader.init(new SystemDelegate()); } - return new Module(new NativePeer(modelPath)); + return new Module(new NativePeer(modelPath, device)); + } + + /** + * Loads a serialized TorchScript module from the specified path on the disk to run on CPU. + * + * @param modelPath path to file that contains the serialized TorchScript module. + * @return new {@link org.pytorch.Module} object which owns torch::jit::Module. + */ + public static Module load(final String modelPath) { + return load(modelPath, Device.CPU); } Module(INativePeer nativePeer) { diff --git a/android/pytorch_android/src/main/java/org/pytorch/NativePeer.java b/android/pytorch_android/src/main/java/org/pytorch/NativePeer.java index 5c6ef31061ae..76c0c6226755 100644 --- a/android/pytorch_android/src/main/java/org/pytorch/NativePeer.java +++ b/android/pytorch_android/src/main/java/org/pytorch/NativePeer.java @@ -13,18 +13,23 @@ class NativePeer implements INativePeer { private final HybridData mHybridData; @DoNotStrip - private static native HybridData initHybrid(String moduleAbsolutePath); + private static native HybridData initHybrid(String moduleAbsolutePath, int deviceJniCode); @DoNotStrip private static native HybridData initHybridAndroidAsset( - String assetName, /* android.content.res.AssetManager */ Object androidAssetManager); + String assetName, /* android.content.res.AssetManager */ + Object androidAssetManager, + int deviceJniCode); - NativePeer(String moduleAbsolutePath) { - mHybridData = initHybrid(moduleAbsolutePath); + NativePeer(String moduleAbsolutePath, Device device) { + mHybridData = initHybrid(moduleAbsolutePath, device.jniCode); } - NativePeer(String assetName, /* android.content.res.AssetManager */ Object androidAssetManager) { - mHybridData = initHybridAndroidAsset(assetName, androidAssetManager); + NativePeer( + String assetName, /* android.content.res.AssetManager */ + Object androidAssetManager, + Device device) { + mHybridData = initHybridAndroidAsset(assetName, androidAssetManager, device.jniCode); } public void resetNative() { diff --git a/android/pytorch_android/src/main/java/org/pytorch/PyTorchAndroid.java b/android/pytorch_android/src/main/java/org/pytorch/PyTorchAndroid.java index 15664dd040ea..b775c2bb2e2c 100644 --- 
a/android/pytorch_android/src/main/java/org/pytorch/PyTorchAndroid.java +++ b/android/pytorch_android/src/main/java/org/pytorch/PyTorchAndroid.java @@ -21,9 +21,14 @@ public final class PyTorchAndroid { * *

This method is meant to use in tests and demos. */ + public static Module loadModuleFromAsset( + final AssetManager assetManager, final String assetName, final Device device) { + return new Module(new NativePeer(assetName, assetManager, device)); + } + public static Module loadModuleFromAsset( final AssetManager assetManager, final String assetName) { - return new Module(new NativePeer(assetName, assetManager)); + return new Module(new NativePeer(assetName, assetManager, Device.CPU)); } /** diff --git a/android/test_app/app/build.gradle b/android/test_app/app/build.gradle index c592728ce9f4..37bdb35e2f19 100644 --- a/android/test_app/app/build.gradle +++ b/android/test_app/app/build.gradle @@ -40,6 +40,7 @@ android { buildConfigField("String", "LOGCAT_TAG", "@string/app_name") buildConfigField("long[]", "INPUT_TENSOR_SHAPE", "new long[]{1, 3, 224, 224}") buildConfigField("boolean", "NATIVE_BUILD", 'false') + buildConfigField("boolean", "USE_VULKAN_DEVICE", 'false') addManifestPlaceholders([APP_NAME: "@string/app_name", MAIN_ACTIVITY: "org.pytorch.testapp.MainActivity"]) } buildTypes { @@ -66,9 +67,17 @@ android { addManifestPlaceholders([APP_NAME: "MBQ"]) buildConfigField("String", "LOGCAT_TAG", "\"pytorch-mbq\"") } + mbvulkan { + dimension "model" + applicationIdSuffix ".mbvulkan" + buildConfigField("String", "MODULE_ASSET_NAME", "\"mobilenet2-vulkan.pt\"") + buildConfigField("boolean", "USE_VULKAN_DEVICE", 'true') + addManifestPlaceholders([APP_NAME: "MBQ"]) + buildConfigField("String", "LOGCAT_TAG", "\"pytorch-mbvulkan\"") + } resnet18 { dimension "model" - applicationIdSuffix ".resneti18" + applicationIdSuffix ".resnet18" buildConfigField("String", "MODULE_ASSET_NAME", "\"resnet18.pt\"") addManifestPlaceholders([APP_NAME: "RN18"]) buildConfigField("String", "LOGCAT_TAG", "\"pytorch-resnet18\"") @@ -122,7 +131,7 @@ android { tasks.all { task -> // Disable externalNativeBuild for all but nativeBuild variant - if (task.name.startsWith('externalNativeBuild') + if (task.name.startsWith('externalNativeBuild') && !task.name.contains('NativeBuild')) { task.enabled = false } @@ -140,8 +149,8 @@ dependencies { //nativeBuildImplementation(name: 'pytorch_android_torchvision-release', ext: 'aar') //extractForNativeBuild(name: 'pytorch_android-release', ext: 'aar') - nightlyImplementation 'org.pytorch:pytorch_android:1.7.0-SNAPSHOT' - nightlyImplementation 'org.pytorch:pytorch_android_torchvision:1.7.0-SNAPSHOT' + nightlyImplementation 'org.pytorch:pytorch_android:1.8.0-SNAPSHOT' + nightlyImplementation 'org.pytorch:pytorch_android_torchvision:1.8.0-SNAPSHOT' aarImplementation(name:'pytorch_android', ext:'aar') aarImplementation(name:'pytorch_android_torchvision', ext:'aar') diff --git a/android/test_app/app/src/main/java/org/pytorch/testapp/MainActivity.java b/android/test_app/app/src/main/java/org/pytorch/testapp/MainActivity.java index 5cc233011c8a..bd7469950f87 100644 --- a/android/test_app/app/src/main/java/org/pytorch/testapp/MainActivity.java +++ b/android/test_app/app/src/main/java/org/pytorch/testapp/MainActivity.java @@ -17,6 +17,7 @@ import java.io.InputStream; import java.io.OutputStream; import java.nio.FloatBuffer; +import org.pytorch.Device; import org.pytorch.IValue; import org.pytorch.Module; import org.pytorch.PyTorchAndroid; @@ -126,7 +127,11 @@ protected Result doModuleForward() { mInputTensorBuffer = Tensor.allocateFloatBuffer((int) numElements); mInputTensor = Tensor.fromBlob(mInputTensorBuffer, BuildConfig.INPUT_TENSOR_SHAPE); PyTorchAndroid.setNumThreads(1); - mModule = 
PyTorchAndroid.loadModuleFromAsset(getAssets(), BuildConfig.MODULE_ASSET_NAME); + mModule = + BuildConfig.USE_VULKAN_DEVICE + ? PyTorchAndroid.loadModuleFromAsset( + getAssets(), BuildConfig.MODULE_ASSET_NAME, Device.VULKAN) + : PyTorchAndroid.loadModuleFromAsset(getAssets(), BuildConfig.MODULE_ASSET_NAME); } final long startTime = SystemClock.elapsedRealtime(); diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt index 1bcbae8abeff..839964e33c59 100644 --- a/aten/src/ATen/CMakeLists.txt +++ b/aten/src/ATen/CMakeLists.txt @@ -51,6 +51,7 @@ file(GLOB cudnn_cpp "cudnn/*.cpp") file(GLOB hip_h "hip/*.h" "hip/detail/*.h" "hip/*.cuh" "hip/detail/*.cuh" "hip/impl/*.h") file(GLOB hip_cpp "hip/*.cpp" "hip/detail/*.cpp" "hip/impl/*.cpp") +list(REMOVE_ITEM hip_cpp "${CMAKE_CURRENT_SOURCE_DIR}/hip/detail/LazyNVRTC.cpp") file(GLOB hip_hip "hip/*.hip" "hip/detail/*.hip" "hip/impl/*.hip") file(GLOB hip_nvrtc_stub_h "hip/nvrtc_stub/*.h") file(GLOB hip_nvrtc_stub_cpp "hip/nvrtc_stub/*.cpp") @@ -78,6 +79,7 @@ file(GLOB native_cuda_cu "native/cuda/*.cu") exclude(native_cuda_cu "${native_cuda_cu}" ${native_cuda_cu_sp}) file(GLOB native_cuda_cpp "native/cuda/*.cpp") file(GLOB native_cuda_h "native/cuda/*.h" "native/cuda/*.cuh") +file(GLOB native_hip_h "native/hip/*.h" "native/hip/*.cuh") file(GLOB native_cudnn_cpp "native/cudnn/*.cpp") file(GLOB native_sparse_cuda_cu "native/sparse/cuda/*.cu") file(GLOB native_sparse_cuda_cpp "native/sparse/cuda/*.cpp") @@ -372,7 +374,7 @@ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/cmake-exports/ATenConfig.cmake" set(INSTALL_HEADERS ${base_h} ${ATen_CORE_HEADERS}) if(NOT INTERN_BUILD_MOBILE) - list(APPEND INSTALL_HEADERS ${native_h} ${native_cpu_h} ${native_quantized_h} ${cuda_h} ${native_cuda_h} ${cudnn_h} ${hip_h} ${miopen_h}) + list(APPEND INSTALL_HEADERS ${native_h} ${native_cpu_h} ${native_quantized_h} ${cuda_h} ${native_cuda_h} ${native_hip_h} ${cudnn_h} ${hip_h} ${miopen_h}) endif() # https://stackoverflow.com/questions/11096471/how-can-i-install-a-hierarchy-of-files-using-cmake diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp index 1496b6ee551d..1977f945a0fb 100644 --- a/aten/src/ATen/Context.cpp +++ b/aten/src/ATen/Context.cpp @@ -64,6 +64,11 @@ bool Context::deterministic() const { } void Context::setDeterministic(bool b) { + if (b) { + TORCH_WARN_ONCE("torch.set_deterministic is in beta, and its design and " + " functionality may change in the future."); + } + _deterministic = b; } @@ -230,4 +235,27 @@ Allocator* getCPUAllocator() { return getTHDefaultAllocator(); } +// override_allow_tf32_flag = true +// means the allow_tf32 flags are overrided and tf32 is force disabled +// override_allow_tf32_flag = false +// means the original allow_tf32 flags are followed +thread_local bool override_allow_tf32_flag = false; + +NoTF32Guard::NoTF32Guard() { + if (!override_allow_tf32_flag) { + changed = true; + override_allow_tf32_flag = true; + } +} + +NoTF32Guard::~NoTF32Guard() { + if (changed) { + override_allow_tf32_flag = false; + } +} + +bool NoTF32Guard::should_disable_tf32() { + return override_allow_tf32_flag; +} + } // namespace at diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h index b8782209def5..fed5e88e5314 100644 --- a/aten/src/ATen/Context.h +++ b/aten/src/ATen/Context.h @@ -327,4 +327,20 @@ static inline void manual_seed(uint64_t seed) { } } +// When the global flag `allow_tf32` is set to true, cuBLAS handles are +// automatically configured to use math mode CUBLAS_TF32_TENSOR_OP_MATH. 
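The `Context.cpp` hunk above implements `NoTF32Guard` as a thread-local override flag plus an RAII wrapper, so only the outermost guard flips the flag and nesting stays safe. Below is a minimal standalone sketch of that same pattern; the names (`OverrideGuard`, `g_override_flag`) are illustrative only and are not part of ATen.

```cpp
// Standalone sketch of the thread-local RAII override pattern used by
// NoTF32Guard in the patch above. Illustrative names, not ATen code.
#include <iostream>

thread_local bool g_override_flag = false;

struct OverrideGuard {
  OverrideGuard() {
    // Only the outermost guard sets the flag; nested guards change nothing.
    if (!g_override_flag) {
      changed_ = true;
      g_override_flag = true;
    }
  }
  ~OverrideGuard() {
    // Restore the flag only if this guard was the one that set it.
    if (changed_) {
      g_override_flag = false;
    }
  }
  static bool is_active() { return g_override_flag; }

 private:
  bool changed_ = false;
};

int main() {
  std::cout << OverrideGuard::is_active() << "\n";  // 0
  {
    OverrideGuard outer;
    OverrideGuard inner;  // nested guard: does not clear the flag early
    std::cout << OverrideGuard::is_active() << "\n";  // 1
  }
  std::cout << OverrideGuard::is_active() << "\n";  // 0
}
```

The `changed_` bookkeeping is what makes nested guards well-behaved: only the guard that actually flipped the flag restores it on destruction.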
+// For some operators, such as addmv, TF32 offers no performance improvement +// but causes precision loss. To help this case, this class implements +// a RAII guard that can be used to quickly disable TF32 within its scope. +// +// Usage: +// NoTF32Guard disable_tf32; +struct TORCH_API NoTF32Guard { + NoTF32Guard(); + ~NoTF32Guard(); + static bool should_disable_tf32(); +private: + bool changed = false; +}; + } // namespace at diff --git a/aten/src/ATen/DLConvertor.cpp b/aten/src/ATen/DLConvertor.cpp index 786fe6214dc3..fd045960b52c 100644 --- a/aten/src/ATen/DLConvertor.cpp +++ b/aten/src/ATen/DLConvertor.cpp @@ -43,13 +43,10 @@ DLDataType getDLDataType(const Tensor& t) { throw std::logic_error("BFloat16 is not supported by dlpack"); break; case ScalarType::QInt8: - throw std::logic_error("QInt8 is not supported by dlpack"); - break; case ScalarType::QUInt8: - throw std::logic_error("QUInt8 is not supported by dlpack"); - break; case ScalarType::QInt32: - throw std::logic_error("QInt32 is not supported by dlpack"); + case ScalarType::QUInt4x2: + throw std::logic_error("QUInt/QInt types are not supported by dlpack"); break; case ScalarType::ComplexHalf: throw std::logic_error("ComplexHalf is not supported by dlpack"); diff --git a/aten/src/ATen/Dispatch.h b/aten/src/ATen/Dispatch.h index 53a22db6ff9c..e0fc25c394d3 100644 --- a/aten/src/ATen/Dispatch.h +++ b/aten/src/ATen/Dispatch.h @@ -34,6 +34,21 @@ return __VA_ARGS__(); \ } +#define AT_QINT_SUB_BYTE_PRIVATE_CASE_TYPE( \ + enum_type, type, underlying_type, bitwidth, qmin, qmax, ...) \ + case enum_type: { \ + using scalar_t = type; \ + using underlying_t C10_UNUSED_DISPATCH_CUDA_WORKAROUND = \ + scalar_t::underlying; \ + const auto& SCALAR_TYPE C10_UNUSED_DISPATCH_CUDA_WORKAROUND = enum_type; \ + const auto& UNDERLYING_TYPE C10_UNUSED_DISPATCH_CUDA_WORKAROUND = \ + toUnderlying(enum_type); \ + int bit_width = bitwidth; \ + int64_t quant_min = qmin; \ + int64_t quant_max = qmax; \ + return __VA_ARGS__(); \ + } + // This macro should be used to skip bfloat16 dispatch on non-ROCm platforms and // should be removed once the bfloat16 bringup is complete on other platforms. // This is supposed to be used as a wrapper around the lambda function passed to @@ -346,6 +361,25 @@ inline void deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX() {} } \ }() +#define AT_DISPATCH_QINT_AND_SUB_BYTE_TYPES(TYPE, NAME, ...) \ + [&] { \ + const auto& the_type = TYPE; \ + /* don't use TYPE again in case it is an expensive or side-effect op */ \ + at::ScalarType _st = ::detail::scalar_type(the_type); \ + switch (_st) { \ + AT_QINT_SUB_BYTE_PRIVATE_CASE_TYPE( \ + at::kQInt8, at::qint8, int8_t, CHAR_BIT, SCHAR_MIN, SCHAR_MAX, __VA_ARGS__) \ + AT_QINT_SUB_BYTE_PRIVATE_CASE_TYPE( \ + at::kQUInt8, at::quint8, uint8_t, CHAR_BIT, 0, UCHAR_MAX, __VA_ARGS__) \ + AT_QINT_SUB_BYTE_PRIVATE_CASE_TYPE( \ + at::kQInt32, at::qint32, int, CHAR_BIT * sizeof(int), INT_MIN, INT_MAX, __VA_ARGS__) \ + AT_QINT_SUB_BYTE_PRIVATE_CASE_TYPE( \ + at::kQUInt4x2, at::quint4x2, uint8_t, 4, 0, 15, __VA_ARGS__) \ + default: \ + AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \ + } \ + }() + #define AT_DISPATCH_ALL_TYPES_AND_COMPLEX(TYPE, NAME, ...) 
\ [&] { \ const auto& the_type = TYPE; \ diff --git a/aten/src/ATen/LegacyTHFunctionsCPU.cpp b/aten/src/ATen/LegacyTHFunctionsCPU.cpp index 40b7ccafbd9a..f0a55470cc1c 100644 --- a/aten/src/ATen/LegacyTHFunctionsCPU.cpp +++ b/aten/src/ATen/LegacyTHFunctionsCPU.cpp @@ -39,7 +39,7 @@ namespace { Tensor & _th_masked_scatter_(Tensor & self, const Tensor & mask, const Tensor & source) { // DeviceGuard omitted auto dispatch_scalar_type = infer_scalar_type(self); - + switch (dispatch_scalar_type) { case ScalarType::Bool: { auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_", false, DeviceType::CPU, dispatch_scalar_type); @@ -112,7 +112,7 @@ Tensor & _th_masked_scatter_(Tensor & self, const Tensor & mask, const Tensor & Tensor & _th_masked_scatter_bool_(Tensor & self, const Tensor & mask, const Tensor & source) { // DeviceGuard omitted auto dispatch_scalar_type = infer_scalar_type(self); - + switch (dispatch_scalar_type) { case ScalarType::Bool: { auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_masked_scatter_bool_", false, DeviceType::CPU, dispatch_scalar_type); @@ -185,7 +185,7 @@ Tensor & _th_masked_scatter_bool_(Tensor & self, const Tensor & mask, const Tens Tensor & _th_nonzero_out(Tensor & result, const Tensor & self) { // DeviceGuard omitted auto dispatch_scalar_type = infer_scalar_type(self); - + switch (dispatch_scalar_type) { case ScalarType::Bool: { auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_nonzero_out", false, DeviceType::CPU, ScalarType::Long); @@ -316,7 +316,7 @@ Tensor _th_nonzero(const Tensor & self) { Tensor & _th_index_copy_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & source) { // DeviceGuard omitted auto dispatch_scalar_type = infer_scalar_type(self); - + switch (dispatch_scalar_type) { case ScalarType::Bool: { auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_copy_", false, DeviceType::CPU, dispatch_scalar_type); @@ -379,135 +379,10 @@ Tensor & _th_index_copy_(Tensor & self, int64_t dim, const Tensor & index, const } return self; } -Tensor & _th_take_out(Tensor & result, const Tensor & self, const Tensor & index) { - // DeviceGuard omitted - auto dispatch_scalar_type = infer_scalar_type(self); - - switch (dispatch_scalar_type) { - case ScalarType::Bool: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CPU, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CPU, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CPU, ScalarType::Long); - THBoolTensor_take(result_, self_, index_); - break; - } - case ScalarType::Byte: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CPU, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CPU, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CPU, ScalarType::Long); - THByteTensor_take(result_, self_, index_); - break; - } - case ScalarType::Char: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CPU, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CPU, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, 
"_th_take_out", false, DeviceType::CPU, ScalarType::Long); - THCharTensor_take(result_, self_, index_); - break; - } - case ScalarType::Double: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CPU, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CPU, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CPU, ScalarType::Long); - THDoubleTensor_take(result_, self_, index_); - break; - } - case ScalarType::Float: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CPU, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CPU, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CPU, ScalarType::Long); - THFloatTensor_take(result_, self_, index_); - break; - } - case ScalarType::Int: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CPU, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CPU, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CPU, ScalarType::Long); - THIntTensor_take(result_, self_, index_); - break; - } - case ScalarType::Long: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CPU, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CPU, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CPU, ScalarType::Long); - THLongTensor_take(result_, self_, index_); - break; - } - case ScalarType::Short: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CPU, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CPU, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CPU, ScalarType::Long); - THShortTensor_take(result_, self_, index_); - break; - } - default: - AT_ERROR("_th_take_out not supported on CPUType for ", dispatch_scalar_type); - } - return result; -} -Tensor _th_take(const Tensor & self, const Tensor & index) { - // DeviceGuard omitted - auto dispatch_scalar_type = infer_scalar_type(self); - auto result_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CPU, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); - auto result = Tensor(c10::intrusive_ptr::reclaim(result_)); - switch (dispatch_scalar_type) { - case ScalarType::Bool: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CPU, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CPU, ScalarType::Long); - THBoolTensor_take(result_, self_, index_); - break; - } - case ScalarType::Byte: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CPU, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CPU, ScalarType::Long); - 
THByteTensor_take(result_, self_, index_); - break; - } - case ScalarType::Char: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CPU, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CPU, ScalarType::Long); - THCharTensor_take(result_, self_, index_); - break; - } - case ScalarType::Double: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CPU, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CPU, ScalarType::Long); - THDoubleTensor_take(result_, self_, index_); - break; - } - case ScalarType::Float: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CPU, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CPU, ScalarType::Long); - THFloatTensor_take(result_, self_, index_); - break; - } - case ScalarType::Int: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CPU, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CPU, ScalarType::Long); - THIntTensor_take(result_, self_, index_); - break; - } - case ScalarType::Long: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CPU, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CPU, ScalarType::Long); - THLongTensor_take(result_, self_, index_); - break; - } - case ScalarType::Short: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CPU, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CPU, ScalarType::Long); - THShortTensor_take(result_, self_, index_); - break; - } - default: - AT_ERROR("_th_take not supported on CPUType for ", dispatch_scalar_type); - } - return result; -} Tensor & _th_put_(Tensor & self, const Tensor & index, const Tensor & source, bool accumulate) { // DeviceGuard omitted auto dispatch_scalar_type = infer_scalar_type(self); - + switch (dispatch_scalar_type) { case ScalarType::Bool: { auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_put_", false, DeviceType::CPU, dispatch_scalar_type); @@ -573,7 +448,7 @@ Tensor & _th_put_(Tensor & self, const Tensor & index, const Tensor & source, bo Tensor & _th_index_fill_(Tensor & self, int64_t dim, const Tensor & index, Scalar value) { // DeviceGuard omitted auto dispatch_scalar_type = infer_scalar_type(self); - + switch (dispatch_scalar_type) { case ScalarType::Bool: { auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_index_fill_", false, DeviceType::CPU, dispatch_scalar_type); @@ -639,7 +514,7 @@ Tensor & _th_index_fill_(Tensor & self, int64_t dim, const Tensor & index, Scala std::tuple _th_mode_out(Tensor & values, Tensor & indices, const Tensor & self, int64_t dim, bool keepdim) { // DeviceGuard omitted auto dispatch_scalar_type = infer_scalar_type(self); - + switch (dispatch_scalar_type) { case ScalarType::Byte: { auto values_ = checked_dense_tensor_unwrap(values, "values", 0, "_th_mode_out", false, DeviceType::CPU, dispatch_scalar_type); @@ -746,7 +621,7 @@ std::tuple _th_mode(const Tensor & self, int64_t dim, bool keepdi Tensor _th_var(const Tensor & self, bool unbiased) { // DeviceGuard omitted auto 
dispatch_scalar_type = infer_scalar_type(self); - + switch (dispatch_scalar_type) { case ScalarType::Double: { auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_var", false, DeviceType::CPU, dispatch_scalar_type); @@ -765,7 +640,7 @@ Tensor _th_var(const Tensor & self, bool unbiased) { Tensor _th_std(const Tensor & self, bool unbiased) { // DeviceGuard omitted auto dispatch_scalar_type = infer_scalar_type(self); - + switch (dispatch_scalar_type) { case ScalarType::Double: { auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_std", false, DeviceType::CPU, dispatch_scalar_type); @@ -784,7 +659,7 @@ Tensor _th_std(const Tensor & self, bool unbiased) { Tensor & _th_renorm_out(Tensor & result, const Tensor & self, Scalar p, int64_t dim, Scalar maxnorm) { // DeviceGuard omitted auto dispatch_scalar_type = infer_scalar_type(self); - + switch (dispatch_scalar_type) { case ScalarType::Double: { auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_renorm_out", false, DeviceType::CPU, dispatch_scalar_type); @@ -835,7 +710,7 @@ Tensor _th_renorm(const Tensor & self, Scalar p, int64_t dim, Scalar maxnorm) { Tensor & _th_renorm_(Tensor & self, Scalar p, int64_t dim, Scalar maxnorm) { // DeviceGuard omitted auto dispatch_scalar_type = infer_scalar_type(self); - + switch (dispatch_scalar_type) { case ScalarType::Double: { auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_renorm_", false, DeviceType::CPU, dispatch_scalar_type); @@ -859,7 +734,7 @@ Tensor & _th_renorm_(Tensor & self, Scalar p, int64_t dim, Scalar maxnorm) { Tensor & _th_histc_out(Tensor & result, const Tensor & self, int64_t bins, Scalar min, Scalar max) { // DeviceGuard omitted auto dispatch_scalar_type = infer_scalar_type(self); - + switch (dispatch_scalar_type) { case ScalarType::Double: { auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_histc_out", false, DeviceType::CPU, dispatch_scalar_type); @@ -910,7 +785,7 @@ Tensor _th_histc(const Tensor & self, int64_t bins, Scalar min, Scalar max) { Tensor _th_trace(const Tensor & self) { // DeviceGuard omitted auto dispatch_scalar_type = infer_scalar_type(self); - + switch (dispatch_scalar_type) { case ScalarType::Byte: { auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_trace", false, DeviceType::CPU, dispatch_scalar_type); @@ -951,265 +826,10 @@ Tensor _th_trace(const Tensor & self) { AT_ERROR("_th_trace not supported on CPUType for ", dispatch_scalar_type); } } -Tensor & _th_addr_out(Tensor & result, const Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta, Scalar alpha) { - // DeviceGuard omitted - auto dispatch_scalar_type = infer_scalar_type(self); - - switch (dispatch_scalar_type) { - case ScalarType::Byte: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); - auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 2, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); - auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 3, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); - auto beta_ = beta.toByte(); - auto alpha_ = alpha.toByte(); - THByteTensor_addr(result_, self_, vec1_, vec2_, beta_, alpha_); - break; - } - case ScalarType::Char: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_addr_out", false, DeviceType::CPU, 
dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); - auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 2, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); - auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 3, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); - auto beta_ = beta.toChar(); - auto alpha_ = alpha.toChar(); - THCharTensor_addr(result_, self_, vec1_, vec2_, beta_, alpha_); - break; - } - case ScalarType::Double: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); - auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 2, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); - auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 3, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); - auto beta_ = beta.toDouble(); - auto alpha_ = alpha.toDouble(); - THDoubleTensor_addr(result_, self_, vec1_, vec2_, beta_, alpha_); - break; - } - case ScalarType::Float: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); - auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 2, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); - auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 3, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); - auto beta_ = beta.toFloat(); - auto alpha_ = alpha.toFloat(); - THFloatTensor_addr(result_, self_, vec1_, vec2_, beta_, alpha_); - break; - } - case ScalarType::Int: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); - auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 2, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); - auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 3, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); - auto beta_ = beta.toInt(); - auto alpha_ = alpha.toInt(); - THIntTensor_addr(result_, self_, vec1_, vec2_, beta_, alpha_); - break; - } - case ScalarType::Long: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); - auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 2, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); - auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 3, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); - auto beta_ = beta.toLong(); - auto alpha_ = alpha.toLong(); - THLongTensor_addr(result_, self_, vec1_, vec2_, beta_, alpha_); - break; - } - case ScalarType::Short: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); - auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 2, 
"_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); - auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 3, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); - auto beta_ = beta.toShort(); - auto alpha_ = alpha.toShort(); - THShortTensor_addr(result_, self_, vec1_, vec2_, beta_, alpha_); - break; - } - case ScalarType::BFloat16: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); - auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 2, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); - auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 3, "_th_addr_out", false, DeviceType::CPU, dispatch_scalar_type); - auto beta_ = beta.toBFloat16(); - auto alpha_ = alpha.toBFloat16(); - THBFloat16Tensor_addr(result_, self_, vec1_, vec2_, beta_, alpha_); - break; - } - default: - AT_ERROR("_th_addr_out not supported on CPUType for ", dispatch_scalar_type); - } - return result; -} -Tensor _th_addr(const Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta, Scalar alpha) { - // DeviceGuard omitted - auto dispatch_scalar_type = infer_scalar_type(self); - auto result_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CPU, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); - auto result = Tensor(c10::intrusive_ptr::reclaim(result_)); - switch (dispatch_scalar_type) { - case ScalarType::Byte: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); - auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 2, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); - auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 3, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); - auto beta_ = beta.toByte(); - auto alpha_ = alpha.toByte(); - THByteTensor_addr(result_, self_, vec1_, vec2_, beta_, alpha_); - break; - } - case ScalarType::Char: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); - auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 2, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); - auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 3, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); - auto beta_ = beta.toChar(); - auto alpha_ = alpha.toChar(); - THCharTensor_addr(result_, self_, vec1_, vec2_, beta_, alpha_); - break; - } - case ScalarType::Double: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); - auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 2, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); - auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 3, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); - auto beta_ = beta.toDouble(); - auto alpha_ = alpha.toDouble(); - THDoubleTensor_addr(result_, self_, vec1_, vec2_, beta_, alpha_); - break; - } - case ScalarType::Float: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); - auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 2, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); - auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 3, "_th_addr", false, DeviceType::CPU, 
dispatch_scalar_type); - auto beta_ = beta.toFloat(); - auto alpha_ = alpha.toFloat(); - THFloatTensor_addr(result_, self_, vec1_, vec2_, beta_, alpha_); - break; - } - case ScalarType::Int: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); - auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 2, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); - auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 3, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); - auto beta_ = beta.toInt(); - auto alpha_ = alpha.toInt(); - THIntTensor_addr(result_, self_, vec1_, vec2_, beta_, alpha_); - break; - } - case ScalarType::Long: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); - auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 2, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); - auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 3, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); - auto beta_ = beta.toLong(); - auto alpha_ = alpha.toLong(); - THLongTensor_addr(result_, self_, vec1_, vec2_, beta_, alpha_); - break; - } - case ScalarType::Short: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); - auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 2, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); - auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 3, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); - auto beta_ = beta.toShort(); - auto alpha_ = alpha.toShort(); - THShortTensor_addr(result_, self_, vec1_, vec2_, beta_, alpha_); - break; - } - case ScalarType::BFloat16: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); - auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 2, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); - auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 3, "_th_addr", false, DeviceType::CPU, dispatch_scalar_type); - auto beta_ = beta.toBFloat16(); - auto alpha_ = alpha.toBFloat16(); - THBFloat16Tensor_addr(result_, self_, vec1_, vec2_, beta_, alpha_); - break; - } - default: - AT_ERROR("_th_addr not supported on CPUType for ", dispatch_scalar_type); - } - return result; -} -Tensor & _th_addr_(Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta, Scalar alpha) { - // DeviceGuard omitted - auto dispatch_scalar_type = infer_scalar_type(self); - - switch (dispatch_scalar_type) { - case ScalarType::Byte: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); - auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 3, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); - auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 4, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); - auto beta_ = beta.toByte(); - auto alpha_ = alpha.toByte(); - THByteTensor_addr(self_, self_, vec1_, vec2_, beta_, alpha_); - break; - } - case ScalarType::Char: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); - auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 3, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); - auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 4, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); - auto beta_ = 
beta.toChar(); - auto alpha_ = alpha.toChar(); - THCharTensor_addr(self_, self_, vec1_, vec2_, beta_, alpha_); - break; - } - case ScalarType::Double: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); - auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 3, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); - auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 4, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); - auto beta_ = beta.toDouble(); - auto alpha_ = alpha.toDouble(); - THDoubleTensor_addr(self_, self_, vec1_, vec2_, beta_, alpha_); - break; - } - case ScalarType::Float: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); - auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 3, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); - auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 4, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); - auto beta_ = beta.toFloat(); - auto alpha_ = alpha.toFloat(); - THFloatTensor_addr(self_, self_, vec1_, vec2_, beta_, alpha_); - break; - } - case ScalarType::Int: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); - auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 3, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); - auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 4, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); - auto beta_ = beta.toInt(); - auto alpha_ = alpha.toInt(); - THIntTensor_addr(self_, self_, vec1_, vec2_, beta_, alpha_); - break; - } - case ScalarType::Long: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); - auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 3, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); - auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 4, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); - auto beta_ = beta.toLong(); - auto alpha_ = alpha.toLong(); - THLongTensor_addr(self_, self_, vec1_, vec2_, beta_, alpha_); - break; - } - case ScalarType::Short: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); - auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 3, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); - auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 4, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); - auto beta_ = beta.toShort(); - auto alpha_ = alpha.toShort(); - THShortTensor_addr(self_, self_, vec1_, vec2_, beta_, alpha_); - break; - } - case ScalarType::BFloat16: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); - auto vec1_ = checked_dense_tensor_unwrap(vec1, "vec1", 3, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); - auto vec2_ = checked_dense_tensor_unwrap(vec2, "vec2", 4, "_th_addr_", false, DeviceType::CPU, dispatch_scalar_type); - auto beta_ = beta.toBFloat16(); - auto alpha_ = alpha.toBFloat16(); - THBFloat16Tensor_addr(self_, self_, vec1_, vec2_, beta_, alpha_); - break; - } - default: - AT_ERROR("_th_addr_ not supported on CPUType for ", dispatch_scalar_type); - } - return self; -} std::tuple _th_gels_out(Tensor & res1, Tensor & res2, const Tensor & self, const Tensor & A) { // DeviceGuard omitted auto 
dispatch_scalar_type = infer_scalar_type(self); - + switch (dispatch_scalar_type) { case ScalarType::Double: { auto res1_ = checked_dense_tensor_unwrap(res1, "res1", 0, "_th_gels_out", false, DeviceType::CPU, dispatch_scalar_type); @@ -1260,7 +880,7 @@ std::tuple _th_gels(const Tensor & self, const Tensor & A) { std::tuple _th_eig_out(Tensor & res1, Tensor & res2, const Tensor & self, bool eigenvectors) { // DeviceGuard omitted auto dispatch_scalar_type = infer_scalar_type(self); - + switch (dispatch_scalar_type) { case ScalarType::Double: { auto res1_ = checked_dense_tensor_unwrap(res1, "res1", 0, "_th_eig_out", false, DeviceType::CPU, dispatch_scalar_type); @@ -1307,7 +927,7 @@ std::tuple _th_eig(const Tensor & self, bool eigenvectors) { Tensor & _th_potri_out(Tensor & output, const Tensor & self, bool upper) { // DeviceGuard omitted auto dispatch_scalar_type = infer_scalar_type(self); - + switch (dispatch_scalar_type) { case ScalarType::Double: { auto output_ = checked_dense_tensor_unwrap(output, "output", 0, "_th_potri_out", false, DeviceType::CPU, dispatch_scalar_type); @@ -1350,7 +970,7 @@ Tensor _th_potri(const Tensor & self, bool upper) { std::tuple _th_geqrf_out(Tensor & res1, Tensor & res2, const Tensor & self) { // DeviceGuard omitted auto dispatch_scalar_type = infer_scalar_type(self); - + switch (dispatch_scalar_type) { case ScalarType::Double: { auto res1_ = checked_dense_tensor_unwrap(res1, "res1", 0, "_th_geqrf_out", false, DeviceType::CPU, dispatch_scalar_type); @@ -1397,7 +1017,7 @@ std::tuple _th_geqrf(const Tensor & self) { Tensor & _th_orgqr_out(Tensor & result, const Tensor & self, const Tensor & input2) { // DeviceGuard omitted auto dispatch_scalar_type = infer_scalar_type(self); - + switch (dispatch_scalar_type) { case ScalarType::Double: { auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_orgqr_out", false, DeviceType::CPU, dispatch_scalar_type); @@ -1444,7 +1064,7 @@ Tensor _th_orgqr(const Tensor & self, const Tensor & input2) { Tensor & _th_ormqr_out(Tensor & result, const Tensor & self, const Tensor & input2, const Tensor & input3, bool left, bool transpose) { // DeviceGuard omitted auto dispatch_scalar_type = infer_scalar_type(self); - + switch (dispatch_scalar_type) { case ScalarType::Double: { auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_ormqr_out", false, DeviceType::CPU, dispatch_scalar_type); @@ -1495,7 +1115,7 @@ Tensor _th_ormqr(const Tensor & self, const Tensor & input2, const Tensor & inpu std::tuple _th_multinomial_alias_setup_out(Tensor & J, Tensor & q, const Tensor & probs) { // DeviceGuard omitted auto dispatch_scalar_type = infer_scalar_type(J); - + switch (dispatch_scalar_type) { case ScalarType::Double: { auto probs_ = checked_dense_tensor_unwrap(probs, "probs", 1, "_th_multinomial_alias_setup_out", false, DeviceType::CPU, dispatch_scalar_type); @@ -1542,7 +1162,7 @@ std::tuple _th_multinomial_alias_setup(const Tensor & probs) { Tensor & _th_multinomial_alias_draw_out(Tensor & result, const Tensor & q, const Tensor & J, int64_t num_samples, c10::optional generator) { // DeviceGuard omitted auto dispatch_scalar_type = infer_scalar_type(result); - + switch (dispatch_scalar_type) { case ScalarType::Double: { auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_multinomial_alias_draw_out", false, DeviceType::CPU, ScalarType::Long); diff --git a/aten/src/ATen/LegacyTHFunctionsCPU.h b/aten/src/ATen/LegacyTHFunctionsCPU.h index e6e3fa0fb7e5..1bc9b66777bc 100644 --- 
a/aten/src/ATen/LegacyTHFunctionsCPU.h +++ b/aten/src/ATen/LegacyTHFunctionsCPU.h @@ -39,9 +39,6 @@ Tensor & _th_renorm_(Tensor & self, Scalar p, int64_t dim, Scalar maxnorm); Tensor & _th_histc_out(Tensor & result, const Tensor & self, int64_t bins, Scalar min, Scalar max); Tensor _th_histc(const Tensor & self, int64_t bins, Scalar min, Scalar max); Tensor _th_trace(const Tensor & self); -Tensor & _th_addr_out(Tensor & result, const Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta, Scalar alpha); -Tensor _th_addr(const Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta, Scalar alpha); -Tensor & _th_addr_(Tensor & self, const Tensor & vec1, const Tensor & vec2, Scalar beta, Scalar alpha); std::tuple _th_gels_out(Tensor & res1, Tensor & res2, const Tensor & self, const Tensor & A); std::tuple _th_gels(const Tensor & self, const Tensor & A); std::tuple _th_eig_out(Tensor & res1, Tensor & res2, const Tensor & self, bool eigenvectors); diff --git a/aten/src/ATen/NumericUtils.h b/aten/src/ATen/NumericUtils.h index 6cbd974f51dd..d691fec1aa34 100644 --- a/aten/src/ATen/NumericUtils.h +++ b/aten/src/ATen/NumericUtils.h @@ -42,12 +42,12 @@ inline bool _isnan(T val) { template ::value, int>::type = 0> inline C10_HOST_DEVICE bool _isnan(T val) { - return at::_isnan(float(val)); + return at::_isnan(static_cast(val)); } inline C10_HOST_DEVICE bool _isnan(at::BFloat16 val) { - return at::_isnan(float(val)); + return at::_isnan(static_cast(val)); } template diff --git a/aten/src/ATen/ThreadLocalState.cpp b/aten/src/ATen/ThreadLocalState.cpp index 7ed7f66e2522..6d74e2f47ce0 100644 --- a/aten/src/ATen/ThreadLocalState.cpp +++ b/aten/src/ATen/ThreadLocalState.cpp @@ -10,9 +10,8 @@ namespace at { ThreadLocalState::ThreadLocalState(bool keep_grad_mode) : dispatch_key_(c10::impl::tls_local_dispatch_key_set()), - debug_info_(c10::ThreadLocalDebugInfo::current()), - observers_enabled_(at::isRecordFunctionEnabled()) { - callbacks_ = _getTLSCallbacks(); + debug_info_(c10::ThreadLocalDebugInfo::current()) { + rf_tls_ = at::get_record_function_tls_(); #if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) keep_grad_mode_ = keep_grad_mode; @@ -31,9 +30,7 @@ void ThreadLocalState::setThreadLocalState( } #endif - _setTLSCallbacks(state.callbacks_); - - at::enableRecordFunction(state.observers_enabled_); + at::set_record_function_tls_(state.rf_tls_); c10::ThreadLocalDebugInfo::_forceCurrentDebugInfo(state.debug_info_); diff --git a/aten/src/ATen/ThreadLocalState.h b/aten/src/ATen/ThreadLocalState.h index 186e521f01bd..f0cb85f0ff84 100644 --- a/aten/src/ATen/ThreadLocalState.h +++ b/aten/src/ATen/ThreadLocalState.h @@ -30,10 +30,8 @@ class TORCH_API ThreadLocalState { // with DebugInfoGuard std::shared_ptr debug_info_; - // RecordFunction TLS callbacks - RecordFunctionCallbacks callbacks_; - - bool observers_enabled_ = false; + // RecordFunction TLS + RecordFunctionTLS rf_tls_; #if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) bool keep_grad_mode_ = true; diff --git a/aten/src/ATen/WrapDimUtils.h b/aten/src/ATen/WrapDimUtils.h index c248ea461116..2768efe6e683 100644 --- a/aten/src/ATen/WrapDimUtils.h +++ b/aten/src/ATen/WrapDimUtils.h @@ -30,14 +30,15 @@ static inline int64_t maybe_wrap_dim(int64_t dim, const std::vector& dims, int64_t dim_post_expr) { +// wrap each dim in the dims array, taking dim_post_expr as the true number of dimensions +static inline void maybe_wrap_dims_n(int64_t* dims, int64_t ndims, int64_t dim_post_expr) { if (dim_post_expr <= 0) { 
dim_post_expr = 1; // this will make range [-1, 0] } int64_t min = -dim_post_expr; int64_t max = dim_post_expr - 1; - for (auto& dim : dims) { + for (int64_t i = 0; i < ndims; ++i) { + auto &dim = dims[i]; if (dim < min || dim > max) { TORCH_CHECK_INDEX(false, "Dimension out of range (expected to be in range of [", @@ -47,6 +48,13 @@ static inline void maybe_wrap_dims(std::vector& dims, int64_t dim_post_ } } +// Wrap each dim in a contiguous container, taking dim_post_expr as the true number of dimensions +// E.g. could also be std::array or c10::SmallVector +template +inline void maybe_wrap_dims(Container& dims, int64_t dim_post_expr) { + return maybe_wrap_dims_n(dims.data(), dims.size(), dim_post_expr); +} + // previously, size [0] tensors were the only possible empty tensors; thus, it wasn't possible // to cat empty tensors unless all the other tensors were 1-dimensional, so we allowed these tensors // to be "skipped" (both for wrap dimension behavior and dimension size checking). diff --git a/aten/src/ATen/autocast_mode.cpp b/aten/src/ATen/autocast_mode.cpp index cb1ea44d2e7d..8f19cebb1f52 100644 --- a/aten/src/ATen/autocast_mode.cpp +++ b/aten/src/ATen/autocast_mode.cpp @@ -357,7 +357,7 @@ TORCH_LIBRARY_IMPL(aten, Autocast, m) { KERNEL(ADD_NS(hinge_embedding_loss), "hinge_embedding_loss", Tensor (const Tensor &, const Tensor &, double, int64_t), fp32) KERNEL(ADD_NS(kl_div), "kl_div", Tensor (const Tensor &, const Tensor &, int64_t, bool), fp32) KERNEL(ADD_NS(l1_loss), "l1_loss", Tensor (const Tensor &, const Tensor &, int64_t), fp32) - KERNEL(ADD_NS(smooth_l1_loss), "smooth_l1_loss", Tensor (const Tensor &, const Tensor &, int64_t), fp32) + KERNEL(ADD_NS(smooth_l1_loss), "smooth_l1_loss", Tensor (const Tensor &, const Tensor &, int64_t, double), fp32) KERNEL(ADD_NS(mse_loss), "mse_loss", Tensor (const Tensor &, const Tensor &, int64_t), fp32) KERNEL(ADD_NS(margin_ranking_loss), "margin_ranking_loss", Tensor (const Tensor &, const Tensor &, const Tensor &, double, int64_t), fp32) KERNEL(ADD_NS(multilabel_margin_loss), "multilabel_margin_loss", Tensor (const Tensor &, const Tensor &, int64_t), fp32) diff --git a/aten/src/ATen/core/aten_interned_strings.h b/aten/src/ATen/core/aten_interned_strings.h index c5e4b0ea3c01..54481814be5b 100644 --- a/aten/src/ATen/core/aten_interned_strings.h +++ b/aten/src/ATen/core/aten_interned_strings.h @@ -502,6 +502,7 @@ _(aten, multinomial) \ _(aten, mv) \ _(aten, mvlgamma) \ _(aten, nansum) \ +_(aten, nan_to_num) \ _(aten, narrow) \ _(aten, narrow_copy) \ _(aten, native_batch_norm) \ @@ -611,6 +612,7 @@ _(aten, sigmoid) \ _(aten, sign) \ _(aten, signbit) \ _(aten, silu) \ +_(aten, sgn) \ _(aten, sin) \ _(aten, sinh) \ _(aten, size) \ diff --git a/aten/src/ATen/core/boxing/KernelFunction.cpp b/aten/src/ATen/core/boxing/KernelFunction.cpp index b5d552e0e31c..f84352ebee1f 100644 --- a/aten/src/ATen/core/boxing/KernelFunction.cpp +++ b/aten/src/ATen/core/boxing/KernelFunction.cpp @@ -22,6 +22,7 @@ void fallthrough_kernel(OperatorKernel*, const OperatorHandle&, Stack*) { void ambiguous_autogradother_kernel(OperatorKernel*, const OperatorHandle& op, Stack*) { TORCH_INTERNAL_ASSERT(0, op.operator_name(), " has kernels registered to both Math and a backend mapped to AutogradOther. " + "This makes the backend kernel unreachable (see Note [Ambiguity in AutogradOther kernel]). 
" "If it's intended to override Math kernel behavior, please open an issue to request a dedicated " "Autograd dispatch key for the backend."); } diff --git a/aten/src/ATen/core/dispatch/OperatorEntry.cpp b/aten/src/ATen/core/dispatch/OperatorEntry.cpp index 5fa379e40710..0942659d2960 100644 --- a/aten/src/ATen/core/dispatch/OperatorEntry.cpp +++ b/aten/src/ATen/core/dispatch/OperatorEntry.cpp @@ -157,10 +157,9 @@ const KernelFunction& OperatorEntry::computeDispatchTableEntry(const c10::Dispat } bool OperatorEntry::hasKernelForDispatchKeySet(DispatchKeySet ks) const { - for (auto k : ks) { - if (kernels_.find(k) != kernels_.end()) { - return true; - } + TORCH_INTERNAL_ASSERT(kernels_.find(DispatchKey::Undefined) == kernels_.end()); + for (auto& kv : kernels_) { + if (ks.has(kv.first)) return true; } return false; } @@ -196,6 +195,9 @@ std::pair OperatorEntry::computeDispatchTab // In the past we directly call into backends(filled with catchAll) after BackendSelect. // Now that we first call Autograd backend keys after BackendSelect, we should fill those // with catchAll as well. + // The implementation of (2.1) & (2.3) relies on the invariant that for a given backend, + // `computeDispatchTableEntryWithDebug()` will be called for that backend's autograd key after the + // backend key. See Note [Refresh Runtime Autograd entries in dispatchTable_] // (3) Use fallthrough kernel that are registered as fallback. // (4) Use catchAll kernel if available // Alias Key Precedence: @@ -272,7 +274,8 @@ void OperatorEntry::updateDispatchTable_(const c10::Dispatcher& dispatcher, Disp for (auto k : c10::getRuntimeDispatchKeySet(dispatch_key)) { updateDispatchTableEntry_(dispatcher, k); } - // Registering to backend key might affect computed entry at its Autograd backend key due to 2.2. + // Note [Refresh Runtime Autograd entries in dispatchTable_] + // Registering to backend key might affect computed entry at its Autograd backend key due to (2.1) & (2.3). DispatchKey autograd_key = getAutogradKeyFromBackend(dispatch_key); updateDispatchTableEntry_(dispatcher, autograd_key); } diff --git a/aten/src/ATen/core/function_schema.h b/aten/src/ATen/core/function_schema.h index a9182787d2e6..a7b4e694d52e 100644 --- a/aten/src/ATen/core/function_schema.h +++ b/aten/src/ATen/core/function_schema.h @@ -156,18 +156,29 @@ struct FunctionSchema { checkSchema(); } - // check whether this schema is backward compatible with the old one. - // the following conditions are considered as this schema is backward - // compatible with old: - // 1) two schemas are equal - // 2) this schema has the same or more positional args than old, - // and any positional arg in this schema is backward compatible - // with the corresponding one in old schema, which could be an arg - // or a kwarg, if it has, or it must provide a default value - // 3) this schema has the same or more kwargs than old, and all the kwargs - // in old schema can find the corresponding kwarg in this schema which - // is backward compatible with the old kwarg, and the extra kwargs in - // this schema must provide default values. + // Checks whether this schema is backward compatible with the old one. + // The following conditions must be true: + // [Function structure] The new schema's name, overload-name, varargs, and + // return arity are the same. + // [Output Narrowing] The new schema's output type must be the same class + // or inherit from the old schema's output type. 
+ // [Argument count] The new schema must have at least as many arguments as + // the old schema (considering the list of positional and kwargs). + // [Arg Compatibility] Every argument in the old schema has a corresponding + // argument in the new schema that: + // * is at the same position. + // * has the same name. + // * is either positional, or kwarg and the old argument was kwarg. + // * has the same type, or the old argument's type inherits from the + // new argument's type. + // [Default Values] Every new argument must have a default value. + // E.g. + // OK f_new(a, b, c=1) => f_old(a, b) + // NOK f_new(a, c=1, *, b) => f_old(a, *, b) + // OK f_new(a, b, *, c) => f_old(a, *, b, c) + // NOK f_new(a, *, b, c) -> f_old(a, b, *, c) + // NOK f_new(a, *, c, b) => f_old(a, *, b, c) + // OK f_new(a, *, b, c, d=1) => f_old(a, *, b, c) bool isBackwardCompatibleWith( const FunctionSchema& old, std::ostream* why_not = nullptr) const; diff --git a/aten/src/ATen/core/function_schema_inl.h b/aten/src/ATen/core/function_schema_inl.h index bc9a68fbad3f..2185b35bc593 100644 --- a/aten/src/ATen/core/function_schema_inl.h +++ b/aten/src/ATen/core/function_schema_inl.h @@ -111,69 +111,35 @@ inline bool FunctionSchema::isBackwardCompatibleWith( return false; } for (size_t i = 0; i < returns().size(); ++i) { - // functions are covariant in arguments but contravariant in returns + // Backwards compatibility requires covariance on argument types + // (i.e. more generic), and contravariance on return types (i.e. + // more specific). if (!old.returns().at(i).isBackwardCompatibleWith( returns().at(i), why_not)) { return false; } } - std::vector args, old_args; - std::map kwargs, old_kwargs; - auto split_func = [](const std::vector& arguments, - std::vector* positionals, - std::map* nameds) { - for (const Argument& arg : arguments) { - if (!arg.kwarg_only()) { - positionals->emplace_back(&arg); - } - nameds->emplace(arg.name(), &arg); - } - }; - // we split args into positional and keyward parts, - split_func(arguments(), &args, &kwargs); - split_func(old.arguments(), &old_args, &old_kwargs); - if (old_args.size() > args.size()) { - return false; - } - // make sure that all the old positional args have their corresponding - // backward compatible positional args in this schema - for (size_t i = 0; i < old_args.size(); ++i) { - if (!args.at(i)->isBackwardCompatibleWith( - *old_args.at(i), - why_not)) { + + // Make sure that all the old arguments have their corresponding backward + // compatible arguments in this schema. + for (size_t i = 0; i < old.arguments().size(); ++i) { + if (!arguments().at(i).isBackwardCompatibleWith( + old.arguments().at(i), why_not)) { return false; } } - // check the extra positional args in this schema either has corresponding - // backward compatible keyward args since positional args also can be used as - // a keyward arg, or provided default values - for (size_t i = old_args.size(); i < args.size(); ++i) { - if (!args.at(i)->default_value()) { - auto it = old_kwargs.find(args.at(i)->name()); - if (it == old_kwargs.end() || - !args.at(i)->isBackwardCompatibleWith( - *it->second, - why_not)) { - return false; + + // Validate that all new arguments provided a default value. 
+ for (size_t i = old.arguments().size(); i < arguments().size(); ++i) { + if (!arguments().at(i).default_value()) { + if (why_not) { + *why_not + << "Function schema not backward compatible since the new argument '" + << arguments().at(i).name() << "' of type " + << arguments().at(i).type()->str() + << " did not provide a default value."; } - } - } - // make sure that all the keyword args in the old schema have their - // corresponding backward compatible keyward args in this schema - for (auto& kv : old_kwargs) { - auto it = kwargs.find(kv.first); - if (it == kwargs.end() || - !it->second->isBackwardCompatibleWith( - *kv.second, - why_not)) { - return false; - } - kwargs.erase(it); - } - // check all the extra keyword args in this schema provide default values - for (auto& kv : kwargs) { - if (!kv.second->default_value()) { return false; } } @@ -186,7 +152,6 @@ inline void FunctionSchema::checkArg( const Argument& argument, optional pos) const { if (!value.type()->isSubtypeOf(argument.type())) { - std::string position = pos ? ::c10::str(" in position ", *pos) : ""; TORCH_CHECK( false, formatTypeMismatchMsg( diff --git a/aten/src/ATen/core/interned_strings.h b/aten/src/ATen/core/interned_strings.h index bce5b27e37b1..69aaf167acee 100644 --- a/aten/src/ATen/core/interned_strings.h +++ b/aten/src/ATen/core/interned_strings.h @@ -59,6 +59,8 @@ namespace c10 { _(prim, Store) \ _(prim, AutogradZero) \ _(prim, AutogradAnyNonZero) \ + _(prim, AutogradAllNonZero) \ + _(prim, AutogradAllZero) \ _(prim, Starred) \ _(prim, TupleConstruct) \ _(prim, TupleUnpack) \ @@ -270,6 +272,7 @@ namespace c10 { _(prim, grad) \ _(aten, zero_) \ _(aten, fill_) \ + _(aten, masked_fill_) \ FORALL_ATEN_BASE_SYMBOLS(_) \ _(onnx, Add) \ _(onnx, Concat) \ @@ -357,7 +360,8 @@ namespace c10 { _(attr, scope) \ _(attr, keepdims) \ _(attr, cache_id) \ - _(attr, new_axis) + _(attr, new_axis) \ + _(attr, warn_id) #else #define FORALL_NS_SYMBOLS(_) \ _(namespaces, prim) \ diff --git a/aten/src/ATen/core/jit_type.h b/aten/src/ATen/core/jit_type.h index 74eaa7012ac1..1f8cfbd242b9 100644 --- a/aten/src/ATen/core/jit_type.h +++ b/aten/src/ATen/core/jit_type.h @@ -263,7 +263,12 @@ struct SingleElementType : public Type { } protected: - SingleElementType(TypePtr elem) : Type(Kind), elem(std::move(elem)) {} + SingleElementType(TypePtr elem) : Type(Kind), elem(std::move(elem)) { + if (!this->elem) { + throw std::runtime_error(c10::str( + "Can not create ", typeKindToString(Kind), " with None type")); + } + } private: TypePtr elem; @@ -483,6 +488,13 @@ struct CAFFE2_API SymbolicShape { dims_ = shape_symbols; } + ShapeSymbol operator[](size_t i) const { + if (!dims_) { + throw std::runtime_error("Rank isn't fixed"); + } + return (*dims_).at(i); + } + // Returns rank or nullopt in case of unranked shape. 
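[Editor's note] The isBackwardCompatibleWith rewrite above reduces to positional-prefix compatibility plus a default-value requirement for any extra arguments. A toy illustration of that last rule only; Arg and extra_args_have_defaults are stand-ins invented for this sketch, not the FunctionSchema API.

#include <iostream>
#include <optional>
#include <string>
#include <vector>

struct Arg {
  std::string name;
  std::optional<std::string> default_value;  // unset = no default
};

// Toy version of the [Default Values] rule only: arguments the new schema
// adds beyond the old one must all carry defaults.
bool extra_args_have_defaults(const std::vector<Arg>& old_args,
                              const std::vector<Arg>& new_args) {
  if (new_args.size() < old_args.size()) return false;
  for (size_t i = old_args.size(); i < new_args.size(); ++i) {
    if (!new_args[i].default_value) return false;
  }
  return true;
}

int main() {
  std::vector<Arg> f_old = {{"a", {}}, {"b", {}}};
  std::vector<Arg> ok    = {{"a", {}}, {"b", {}}, {"c", "1"}};  // f_new(a, b, c=1)
  std::vector<Arg> nok   = {{"a", {}}, {"b", {}}, {"c", {}}};   // f_new(a, b, c)
  std::cout << extra_args_have_defaults(f_old, ok) << ' '
            << extra_args_have_defaults(f_old, nok) << '\n';    // prints: 1 0
}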
c10::optional rank() const { if(!dims_) { @@ -543,7 +555,7 @@ struct VaryingShape { return dims_ == other.dims_; } - const c10::optional& operator[](int i) const { + const c10::optional &operator[](size_t i) const { if (!dims_) { throw std::runtime_error("Rank isn't fixed"); } diff --git a/aten/src/ATen/core/type.cpp b/aten/src/ATen/core/type.cpp index 475c59759f78..13e82d434647 100644 --- a/aten/src/ATen/core/type.cpp +++ b/aten/src/ATen/core/type.cpp @@ -716,6 +716,9 @@ TupleType::TupleType( schema_(std::move(schema)) { has_free_variables_ = std::any_of(elements_.begin(), elements_.end(), [](TypePtr v) { + if (!v) { + throw std::runtime_error("Can not create tuple with None type"); + } return v->hasFreeVariables(); }); if (schema_) { diff --git a/aten/src/ATen/cpu/vec256/vec256_base.h b/aten/src/ATen/cpu/vec256/vec256_base.h index 0e66cb357965..b6cc1db24028 100644 --- a/aten/src/ATen/cpu/vec256/vec256_base.h +++ b/aten/src/ATen/cpu/vec256/vec256_base.h @@ -239,6 +239,13 @@ struct Vec256 { // Specifically map() does not perform the type conversion needed by abs. return map([](T x) { return static_cast(std::abs(x)); }); } + + template ::value, int>::type = 0> + Vec256 sgn() const { + return map(at::native::sgn_impl); + } + template ::value, int>::type = 0> Vec256 angle() const { @@ -729,6 +736,14 @@ inline Vec256 operator^(const Vec256& a, const Vec256& b) { #endif +template>::value, int> = 0> +inline Vec256 operator~(const Vec256& a) { + Vec256 ones; // All bits are 1 + memset((T*) ones, 0xFF, 32); + return a ^ ones; +} + + template inline Vec256& operator += (Vec256& a, const Vec256& b) { a = a + b; diff --git a/aten/src/ATen/cpu/vec256/vec256_complex_double.h b/aten/src/ATen/cpu/vec256/vec256_complex_double.h index fbc7a480a4c0..0827b33a3122 100644 --- a/aten/src/ATen/cpu/vec256/vec256_complex_double.h +++ b/aten/src/ATen/cpu/vec256/vec256_complex_double.h @@ -134,6 +134,16 @@ template <> class Vec256> { auto angle = _mm256_permute_pd(angle_(), 0x05); // angle 90-angle return _mm256_and_pd(angle, real_mask); // angle 0 } + Vec256> sgn() const { + auto abs = abs_(); + auto zero = _mm256_setzero_pd(); + auto mask = _mm256_cmp_pd(abs, zero, _CMP_EQ_OQ); + auto abs_val = Vec256(abs); + + auto div = values / abs_val.values; // x / abs(x) + + return blendv(div, zero, mask); + } __m256d real_() const { const __m256d real_mask = _mm256_castsi256_pd(_mm256_setr_epi64x(0xFFFFFFFFFFFFFFFF, 0x0000000000000000, 0xFFFFFFFFFFFFFFFF, 0x0000000000000000)); diff --git a/aten/src/ATen/cpu/vec256/vec256_complex_float.h b/aten/src/ATen/cpu/vec256/vec256_complex_float.h index 892345e9d5c5..ea931acc494b 100644 --- a/aten/src/ATen/cpu/vec256/vec256_complex_float.h +++ b/aten/src/ATen/cpu/vec256/vec256_complex_float.h @@ -171,6 +171,16 @@ template <> class Vec256> { auto angle = _mm256_permute_ps(angle_(), 0xB1); // angle 90-angle return _mm256_and_ps(angle, real_mask); // angle 0 } + Vec256> sgn() const { + auto abs = abs_(); + auto zero = _mm256_setzero_ps(); + auto mask = _mm256_cmp_ps(abs, zero, _CMP_EQ_OQ); + auto abs_val = Vec256(abs); + + auto div = values / abs_val.values; // x / abs(x) + + return _mm256_blendv_ps(div, zero, mask); + } __m256 real_() const { const __m256 real_mask = _mm256_castsi256_ps(_mm256_setr_epi32(0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000)); diff --git a/aten/src/ATen/cpu/vec256/vec256_int.h b/aten/src/ATen/cpu/vec256/vec256_int.h index 98afd8bdd33c..30bf6421adb3 100644 --- a/aten/src/ATen/cpu/vec256/vec256_int.h +++ 
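[Editor's note] The vectorized sgn() kernels above implement, lane by lane, the scalar rule sgn(z) = z / |z| with sgn(0) = 0. A scalar sketch of the same semantics using std::complex; it mirrors the blendv-with-zero-mask trick, not the AVX code itself.

#include <complex>
#include <iostream>

// sgn(z) = z / |z| for z != 0, and 0 for z == 0 — the rule the
// zero-mask/blendv sequence applies per SIMD lane.
template <typename T>
std::complex<T> sgn(std::complex<T> z) {
  T a = std::abs(z);
  return a == T(0) ? std::complex<T>(0, 0) : z / a;
}

int main() {
  std::cout << sgn(std::complex<double>(3.0, 4.0)) << '\n';  // prints: (0.6,0.8)
  std::cout << sgn(std::complex<double>(0.0, 0.0)) << '\n';  // prints: (0,0)
}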
b/aten/src/ATen/cpu/vec256/vec256_int.h @@ -104,6 +104,8 @@ class Vec256 : public Vec256i { } void store(void* ptr, int count = size()) const { if (count == size()) { + // ptr need not to be aligned here. See + // https://software.intel.com/content/www/us/en/develop/documentation/cpp-compiler-developer-guide-and-reference/top/compiler-reference/intrinsics/intrinsics-for-intel-advanced-vector-extensions/intrinsics-for-load-and-store-operations-1/mm256-storeu-si256.html _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values); } else if (count > 0) { __at_align32__ int64_t tmp_values[size()]; @@ -228,6 +230,8 @@ class Vec256 : public Vec256i { } void store(void* ptr, int count = size()) const { if (count == size()) { + // ptr need not to be aligned here. See + // https://software.intel.com/content/www/us/en/develop/documentation/cpp-compiler-developer-guide-and-reference/top/compiler-reference/intrinsics/intrinsics-for-intel-advanced-vector-extensions/intrinsics-for-load-and-store-operations-1/mm256-storeu-si256.html _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values); } else if (count > 0) { __at_align32__ int32_t tmp_values[size()]; @@ -449,6 +453,8 @@ class Vec256 : public Vec256i { } void store(void* ptr, int count = size()) const { if (count == size()) { + // ptr need not to be aligned here. See + // https://software.intel.com/content/www/us/en/develop/documentation/cpp-compiler-developer-guide-and-reference/top/compiler-reference/intrinsics/intrinsics-for-intel-advanced-vector-extensions/intrinsics-for-load-and-store-operations-1/mm256-storeu-si256.html _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values); } else if (count > 0) { __at_align32__ int16_t tmp_values[size()]; @@ -699,6 +705,8 @@ class Vec256 : public Vec256i { } void store(void* ptr, int count = size()) const { if (count == size()) { + // ptr need not to be aligned here. See + // https://software.intel.com/content/www/us/en/develop/documentation/cpp-compiler-developer-guide-and-reference/top/compiler-reference/intrinsics/intrinsics-for-intel-advanced-vector-extensions/intrinsics-for-load-and-store-operations-1/mm256-storeu-si256.html _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values); } else if (count > 0) { __at_align32__ int8_t tmp_values[size()]; @@ -879,8 +887,8 @@ Vec256 inline operator*(const Vec256& a, const Vec256 template Vec256 inline int_elementwise_binary_256(const Vec256& a, const Vec256& b, Op op) { - __at_align32__ T values_a[Vec256::size()]; - __at_align32__ T values_b[Vec256::size()]; + T values_a[Vec256::size()]; + T values_b[Vec256::size()]; a.store(values_a); b.store(values_b); for (int i = 0; i != Vec256::size(); i++) { @@ -1039,6 +1047,10 @@ template>: inline Vec256 operator^(const Vec256& a, const Vec256& b) { return _mm256_xor_si256(a, b); } +template>::value, int> = 0> +inline Vec256 operator~(const Vec256& a) { + return _mm256_xor_si256(a, _mm256_set1_epi32(-1)); +} Vec256 Vec256::eq(const Vec256& other) const { return (*this == other) & Vec256(1); diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp index 0311399649e7..26423889caa4 100644 --- a/aten/src/ATen/cuda/CUDABlas.cpp +++ b/aten/src/ATen/cuda/CUDABlas.cpp @@ -374,7 +374,7 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) { // manually to be able to use tensor cores for FP16. On CUDA 11, this is no longer required. 
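[Editor's note] Both operator~ overloads added above rely on the identity ~x == x XOR all-ones (for integers, -1 in two's complement). A quick scalar check of that identity:

#include <cstdint>
#include <iostream>

int main() {
  // ~x == x ^ 0xFF...FF, which the AVX2 path expresses as
  // _mm256_xor_si256(a, _mm256_set1_epi32(-1)) and the generic path as
  // XOR with a vector memset to 0xFF.
  for (int32_t x : {0, 1, -7, 123456}) {
    std::cout << ((~x) == (x ^ int32_t(-1))) << ' ';  // prints: 1 1 1 1
  }
  std::cout << '\n';
}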
TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH)); } else { - AT_ERROR("BFloat16 gemm in CUDA requires Ampere or later GPU"); + TORCH_CHECK(false, "BFloat16 gemm in CUDA requires Ampere or later GPU"); } } #endif @@ -407,19 +407,22 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) { #endif #if !defined(__HIP_PLATFORM_HCC__) || (defined(__HIP_PLATFORM_HCC__) && HIP_VERSION >= 210) - template <> - void gemv>(CUDABLAS_GEMV_ARGTYPES(c10::complex)) { - // See Note [Writing Nondeterministic Operations] - globalContext().alertCuBLASConfigNotDeterministic(); - cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); - cublasOperation_t op = _cublasOpFromChar(trans); - _cublasAdjustLdLevel2(m, n, &lda); - GEMV_CHECK_ARGVALUES(c10::complex); - TORCH_CUDABLAS_CHECK( - cublasCgemv(handle, op, m, n, reinterpret_cast(&alpha), reinterpret_cast(a), - lda, reinterpret_cast(x), incx, reinterpret_cast(&beta), - reinterpret_cast(y), incy)); - } +template <> +void gemv>(CUDABLAS_GEMV_ARGTYPES(c10::complex)) { + // gemv is bw bound, and does not benefit from TF32. But the precision + // loss still happens on TF32. So we disable it here. + NoTF32Guard disable_tf32; + // See Note [Writing Nondeterministic Operations] + globalContext().alertCuBLASConfigNotDeterministic(); + cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); + cublasOperation_t op = _cublasOpFromChar(trans); + _cublasAdjustLdLevel2(m, n, &lda); + GEMV_CHECK_ARGVALUES(c10::complex); + TORCH_CUDABLAS_CHECK( + cublasCgemv(handle, op, m, n, reinterpret_cast(&alpha), reinterpret_cast(a), + lda, reinterpret_cast(x), incx, reinterpret_cast(&beta), + reinterpret_cast(y), incy)); +} #endif template <> @@ -436,6 +439,9 @@ void gemv(CUDABLAS_GEMV_ARGTYPES(double)) { template <> void gemv(CUDABLAS_GEMV_ARGTYPES(float)) { + // gemv is bw bound, and does not benefit from TF32. But the precision + // loss still happens on TF32. So we disable it here. 
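[Editor's note] The gemv paths above wrap the call in a NoTF32Guard so the handle-pool code (a later hunk) can skip CUBLAS_TF32_TENSOR_OP_MATH. As a rough sketch of the RAII pattern such a guard typically follows (a thread-local flag held for the guard's lifetime); this is an illustration only, not ATen's actual implementation.

#include <iostream>

// Illustrative RAII guard: while an instance is alive on this thread,
// should_disable_tf32() reports true and callers fall back to default math mode.
class NoTF32GuardSketch {
 public:
  NoTF32GuardSketch() { ++depth_; }
  ~NoTF32GuardSketch() { --depth_; }
  NoTF32GuardSketch(const NoTF32GuardSketch&) = delete;
  NoTF32GuardSketch& operator=(const NoTF32GuardSketch&) = delete;
  static bool should_disable_tf32() { return depth_ > 0; }

 private:
  static thread_local int depth_;
};

thread_local int NoTF32GuardSketch::depth_ = 0;

int main() {
  std::cout << NoTF32GuardSketch::should_disable_tf32() << '\n';  // 0
  {
    NoTF32GuardSketch guard;
    std::cout << NoTF32GuardSketch::should_disable_tf32() << '\n';  // 1
  }
  std::cout << NoTF32GuardSketch::should_disable_tf32() << '\n';  // 0
}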
+ NoTF32Guard disable_tf32; // See Note [Writing Nondeterministic Operations] globalContext().alertCuBLASConfigNotDeterministic(); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); @@ -492,46 +498,6 @@ void gemv(CUDABLAS_GEMV_ARGTYPES(at::BFloat16)) { } #endif -namespace { -template -cublasStatus_t cublasGer(const cublasHandle_t &handle, int64_t m, int64_t n, scalar_t *alpha, scalar_t *x, int64_t incx, scalar_t *y, int64_t incy, scalar_t *a, int64_t lda) { - TORCH_CHECK(false, "cublas ger is defined only for float and double"); - return {}; -} -template<> -cublasStatus_t cublasGer(const cublasHandle_t &handle, int64_t m, int64_t n, float *alpha, float *x, int64_t incx, float *y, int64_t incy, float *a, int64_t lda) { - return cublasSger(handle, m, n, alpha, x, incx, y, incy, a, lda); -} -template<> -cublasStatus_t cublasGer(const cublasHandle_t &handle, int64_t m, int64_t n, double *alpha, double *x, int64_t incx, double *y, int64_t incy, double *a, int64_t lda) { - return cublasDger(handle, m, n, alpha, x, incx, y, incy, a, lda); -} -} // anonymous namespace - -template -void ger(int64_t m, int64_t n, scalar_t alpha, scalar_t *x, int64_t incx, scalar_t *y, int64_t incy, scalar_t *a, int64_t lda) -{ - _cublasAdjustLdLevel2(m, n, &lda); - TORCH_CHECK((m <= INT_MAX) && - (n <= INT_MAX) && - (lda <= INT_MAX) && - (incx <= INT_MAX) && - (incy <= INT_MAX), - "cublasSger/cublasDger only supports m, n, lda, incx, incy with " - "the bound [val] <= %d", INT_MAX); - int i_m = (int)m; - int i_n = (int)n; - int i_lda = (int)lda; - int i_incx = (int)incx; - int i_incy = (int)incy; - - cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); - TORCH_CUDABLAS_CHECK(cublasGer( - handle, i_m, i_n, &alpha, x, i_incx, y, i_incy, a, i_lda)); -} -template void ger(int64_t m, int64_t n, float alpha, float *x, int64_t incx, float *y, int64_t incy, float *a, int64_t lda); -template void ger(int64_t m, int64_t n, double alpha, double *x, int64_t incx, double *y, int64_t incy, double *a, int64_t lda); - /* LEVEL 1 BLAS FUNCTIONS */ template <> diff --git a/aten/src/ATen/cuda/CublasHandlePool.cpp b/aten/src/ATen/cuda/CublasHandlePool.cpp index 404f322545f8..0165c53ac60d 100644 --- a/aten/src/ATen/cuda/CublasHandlePool.cpp +++ b/aten/src/ATen/cuda/CublasHandlePool.cpp @@ -45,7 +45,7 @@ cublasHandle_t getCurrentCUDABlasHandle() { // On CUDA >= 11, and architecture >= Ampere, cuBLAS can use TF32 to speedup // FP32 data type calculations based on the value of the allow_tf32 flag. // To enable TF32, set the math mode of the handle to CUBLAS_TF32_TENSOR_OP_MATH. - if (at::globalContext().allowTF32CuBLAS()) { + if (!NoTF32Guard::should_disable_tf32() && at::globalContext().allowTF32CuBLAS()) { TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TF32_TENSOR_OP_MATH)); } else { TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH)); diff --git a/aten/src/ATen/cuda/Exceptions.h b/aten/src/ATen/cuda/Exceptions.h index 615ba3e92b71..80e39c6bc6bc 100644 --- a/aten/src/ATen/cuda/Exceptions.h +++ b/aten/src/ATen/cuda/Exceptions.h @@ -19,20 +19,23 @@ class CuDNNError : public c10::Error { } // namespace c10 +#define AT_CUDNN_CHECK_WITH_SHAPES(EXPR, ...) AT_CUDNN_CHECK(EXPR, "\n", ##__VA_ARGS__) + // See Note [CHECK macro] -#define AT_CUDNN_CHECK(EXPR) \ - do { \ - cudnnStatus_t status = EXPR; \ - if (status != CUDNN_STATUS_SUCCESS) { \ - if (status == CUDNN_STATUS_NOT_SUPPORTED) { \ - TORCH_CHECK_WITH(CuDNNError, false, \ - "cuDNN error: ", \ - cudnnGetErrorString(status), \ - ". 
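[Editor's note] The reworked AT_CUDNN_CHECK above threads extra context through ##__VA_ARGS__ so call sites (and the AT_CUDNN_CHECK_WITH_SHAPES wrapper) can append descriptor dumps to the error message. A minimal standalone macro following the same pattern; MY_CHECK and append_all are purely illustrative names.

#include <iostream>
#include <sstream>
#include <stdexcept>

// Fold any extra arguments into the error message. With no extra arguments,
// ##__VA_ARGS__ swallows the trailing comma (a GCC/Clang/MSVC extension,
// used the same way in the diff above).
#define MY_CHECK(cond, ...)                          \
  do {                                               \
    if (!(cond)) {                                   \
      std::ostringstream oss_;                       \
      oss_ << "check failed: " #cond;                \
      append_all(oss_, ##__VA_ARGS__);               \
      throw std::runtime_error(oss_.str());          \
    }                                                \
  } while (0)

template <typename... Args>
void append_all(std::ostringstream& oss, const Args&... args) {
  ((oss << ' ' << args), ...);  // C++17 fold; no-op when the pack is empty
}

int main() {
  try {
    int status = 3;
    MY_CHECK(status == 0, "status =", status, "(extra context)");
  } catch (const std::exception& e) {
    std::cout << e.what() << '\n';  // check failed: status == 0 status = 3 (extra context)
  }
}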
This error may appear if you passed in a non-contiguous input."); \ - } else { \ - TORCH_CHECK_WITH(CuDNNError, false, "cuDNN error: ", cudnnGetErrorString(status)); \ - } \ - } \ +#define AT_CUDNN_CHECK(EXPR, ...) \ + do { \ + cudnnStatus_t status = EXPR; \ + if (status != CUDNN_STATUS_SUCCESS) { \ + if (status == CUDNN_STATUS_NOT_SUPPORTED) { \ + TORCH_CHECK_WITH(CuDNNError, false, \ + "cuDNN error: ", \ + cudnnGetErrorString(status), \ + ". This error may appear if you passed in a non-contiguous input.", ##__VA_ARGS__); \ + } else { \ + TORCH_CHECK_WITH(CuDNNError, false, \ + "cuDNN error: ", cudnnGetErrorString(status), ##__VA_ARGS__); \ + } \ + } \ } while (0) namespace at { namespace cuda { namespace blas { diff --git a/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp b/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp index c43d53751aee..b2d8df49f51b 100644 --- a/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp +++ b/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp @@ -435,144 +435,6 @@ Tensor & _th_index_copy_(Tensor & self, int64_t dim, const Tensor & index, const } return self; } -Tensor & _th_take_out(Tensor & result, const Tensor & self, const Tensor & index) { - // DeviceGuard omitted - auto dispatch_scalar_type = infer_scalar_type(self); - - switch (dispatch_scalar_type) { - case ScalarType::Bool: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CUDA, ScalarType::Long); - THCudaBoolTensor_take(globalContext().getTHCState(), result_, self_, index_); - break; - } - case ScalarType::Byte: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CUDA, ScalarType::Long); - THCudaByteTensor_take(globalContext().getTHCState(), result_, self_, index_); - break; - } - case ScalarType::Char: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CUDA, ScalarType::Long); - THCudaCharTensor_take(globalContext().getTHCState(), result_, self_, index_); - break; - } - case ScalarType::Double: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CUDA, ScalarType::Long); - THCudaDoubleTensor_take(globalContext().getTHCState(), result_, self_, index_); - break; - } - case ScalarType::Float: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CUDA, 
dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CUDA, ScalarType::Long); - THCudaTensor_take(globalContext().getTHCState(), result_, self_, index_); - break; - } - case ScalarType::Int: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CUDA, ScalarType::Long); - THCudaIntTensor_take(globalContext().getTHCState(), result_, self_, index_); - break; - } - case ScalarType::Long: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CUDA, ScalarType::Long); - THCudaLongTensor_take(globalContext().getTHCState(), result_, self_, index_); - break; - } - case ScalarType::Short: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CUDA, ScalarType::Long); - THCudaShortTensor_take(globalContext().getTHCState(), result_, self_, index_); - break; - } - case ScalarType::Half: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take_out", false, DeviceType::CUDA, ScalarType::Long); - THCudaHalfTensor_take(globalContext().getTHCState(), result_, self_, index_); - break; - } - default: - AT_ERROR("_th_take_out not supported on CUDAType for ", dispatch_scalar_type); - } - return result; -} -Tensor _th_take(const Tensor & self, const Tensor & index) { - // DeviceGuard omitted - auto dispatch_scalar_type = infer_scalar_type(self); - auto result_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); - auto result = Tensor(c10::intrusive_ptr::reclaim(result_)); - switch (dispatch_scalar_type) { - case ScalarType::Bool: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CUDA, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CUDA, ScalarType::Long); - THCudaBoolTensor_take(globalContext().getTHCState(), result_, self_, index_); - break; - } - case ScalarType::Byte: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CUDA, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CUDA, ScalarType::Long); - THCudaByteTensor_take(globalContext().getTHCState(), result_, self_, index_); - break; - } - case ScalarType::Char: { - auto self_ = checked_dense_tensor_unwrap(self, 
"self", 1, "_th_take", false, DeviceType::CUDA, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CUDA, ScalarType::Long); - THCudaCharTensor_take(globalContext().getTHCState(), result_, self_, index_); - break; - } - case ScalarType::Double: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CUDA, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CUDA, ScalarType::Long); - THCudaDoubleTensor_take(globalContext().getTHCState(), result_, self_, index_); - break; - } - case ScalarType::Float: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CUDA, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CUDA, ScalarType::Long); - THCudaTensor_take(globalContext().getTHCState(), result_, self_, index_); - break; - } - case ScalarType::Int: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CUDA, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CUDA, ScalarType::Long); - THCudaIntTensor_take(globalContext().getTHCState(), result_, self_, index_); - break; - } - case ScalarType::Long: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CUDA, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CUDA, ScalarType::Long); - THCudaLongTensor_take(globalContext().getTHCState(), result_, self_, index_); - break; - } - case ScalarType::Short: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CUDA, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CUDA, ScalarType::Long); - THCudaShortTensor_take(globalContext().getTHCState(), result_, self_, index_); - break; - } - case ScalarType::Half: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_take", false, DeviceType::CUDA, dispatch_scalar_type); - auto index_ = checked_dense_tensor_unwrap(index, "index", 2, "_th_take", false, DeviceType::CUDA, ScalarType::Long); - THCudaHalfTensor_take(globalContext().getTHCState(), result_, self_, index_); - break; - } - default: - AT_ERROR("_th_take not supported on CUDAType for ", dispatch_scalar_type); - } - return result; -} Tensor & _th_put_(Tensor & self, const Tensor & index, const Tensor & source, bool accumulate) { // DeviceGuard omitted auto dispatch_scalar_type = infer_scalar_type(self); diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.cpp b/aten/src/ATen/cuda/detail/CUDAHooks.cpp index 58f6a8d53e92..28b9738034e7 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.cpp +++ b/aten/src/ATen/cuda/detail/CUDAHooks.cpp @@ -28,6 +28,10 @@ #include #endif +#ifndef USE_ROCM +#include +#endif + #include #include @@ -116,10 +120,14 @@ bool CUDAHooks::hasCuDNN() const { return AT_CUDNN_ENABLED(); } -#ifdef USE_DIRECT_NVRTC +#if defined(USE_DIRECT_NVRTC) static std::pair, at::cuda::NVRTC*> load_nvrtc() { return std::make_pair(nullptr, at::cuda::load_nvrtc()); } +#elif !defined(USE_ROCM) +static std::pair, at::cuda::NVRTC*> load_nvrtc() { + return std::make_pair(nullptr, &at::cuda::detail::lazyNVRTC); +} #else static std::pair, at::cuda::NVRTC*> load_nvrtc() { #if defined(_WIN32) diff --git 
a/aten/src/ATen/cuda/detail/LazyNVRTC.cpp b/aten/src/ATen/cuda/detail/LazyNVRTC.cpp new file mode 100644 index 000000000000..fae48c08b61f --- /dev/null +++ b/aten/src/ATen/cuda/detail/LazyNVRTC.cpp @@ -0,0 +1,171 @@ +#include + +#include +#include +#include + +namespace at { +namespace cuda { +namespace detail { +namespace _stubs { + +at::DynamicLibrary& getCUDALibrary() { +#if defined(_WIN32) + static at::DynamicLibrary lib("nvcuda.dll"); +#else + static at::DynamicLibrary lib("libcuda.so.1"); +#endif + return lib; +} + +at::DynamicLibrary& getNVRTCLibrary() { + constexpr auto major = CUDA_VERSION / 1000; + constexpr auto minor = ( CUDA_VERSION / 10 ) % 10; +#if defined(_WIN32) + auto libname = std::string("nvrtc64_") + std::to_string(major) + std::to_string(minor) + "_0.dll"; +#else + static auto libname = std::string("libnvrtc.so.") + std::to_string(major) + "." + std::to_string(minor); +#endif + static at::DynamicLibrary lib(libname.c_str()); + return lib; +} + +#define _STUB_1(LIB, NAME, RETTYPE, ARG1) \ +RETTYPE NAME(ARG1 a1) { \ + auto fn = reinterpret_cast(get## LIB ## Library().sym(__func__)); \ + if (!fn) \ + throw std::runtime_error("Can't get " C10_STRINGIZE(NAME) ); \ + lazyNVRTC.NAME = fn; \ + return fn(a1); \ +} + +#define _STUB_2(LIB, NAME, RETTYPE, ARG1, ARG2) \ +RETTYPE NAME(ARG1 a1, ARG2 a2) { \ + auto fn = reinterpret_cast(get## LIB ## Library().sym(__func__)); \ + if (!fn) \ + throw std::runtime_error("Can't get " C10_STRINGIZE(NAME) ); \ + lazyNVRTC.NAME = fn; \ + return fn(a1, a2); \ +} + +#define _STUB_3(LIB, NAME, RETTYPE, ARG1, ARG2, ARG3) \ +RETTYPE NAME(ARG1 a1, ARG2 a2, ARG3 a3) { \ + auto fn = reinterpret_cast(get## LIB ## Library().sym(__func__)); \ + if (!fn) \ + throw std::runtime_error("Can't get " C10_STRINGIZE(NAME) ); \ + lazyNVRTC.NAME = fn; \ + return fn(a1, a2, a3); \ +} + +#define _STUB_4(LIB, NAME, RETTYPE, ARG1, ARG2, ARG3, ARG4) \ +RETTYPE NAME(ARG1 a1, ARG2 a2, ARG3 a3, ARG4 a4) { \ + auto fn = reinterpret_cast(get## LIB ## Library().sym(__func__)); \ + if (!fn) \ + throw std::runtime_error("Can't get " C10_STRINGIZE(NAME) ); \ + lazyNVRTC.NAME = fn; \ + return fn(a1, a2, a3, a4); \ +} + +#define CUDA_STUB1(NAME, A1) _STUB_1(CUDA, NAME, CUresult CUDAAPI, A1) +#define CUDA_STUB2(NAME, A1, A2) _STUB_2(CUDA, NAME, CUresult CUDAAPI, A1, A2) +#define CUDA_STUB3(NAME, A1, A2, A3) _STUB_3(CUDA, NAME, CUresult CUDAAPI, A1, A2, A3) +#define CUDA_STUB4(NAME, A1, A2, A3, A4) _STUB_4(CUDA, NAME, CUresult CUDAAPI, A1, A2, A3, A4) + +#define NVRTC_STUB1(NAME, A1) _STUB_1(NVRTC, NAME, nvrtcResult, A1) +#define NVRTC_STUB2(NAME, A1, A2) _STUB_2(NVRTC, NAME, nvrtcResult, A1, A2) +#define NVRTC_STUB3(NAME, A1, A2, A3) _STUB_3(NVRTC, NAME, nvrtcResult, A1, A2, A3) + +NVRTC_STUB2(nvrtcVersion, int*, int*); +NVRTC_STUB2(nvrtcAddNameExpression, nvrtcProgram, const char * const); + +nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog, + const char *src, + const char *name, + int numHeaders, + const char * const *headers, + const char * const *includeNames) { + auto fn = reinterpret_cast(getNVRTCLibrary().sym(__func__)); + if (!fn) + throw std::runtime_error("Can't get nvrtcCreateProgram"); + lazyNVRTC.nvrtcCreateProgram = fn; + return fn(prog, src, name, numHeaders, headers, includeNames); +} + +NVRTC_STUB1(nvrtcDestroyProgram, nvrtcProgram *); +NVRTC_STUB2(nvrtcGetPTXSize, nvrtcProgram, size_t *); +NVRTC_STUB2(nvrtcGetPTX, nvrtcProgram, char *); +NVRTC_STUB3(nvrtcCompileProgram, nvrtcProgram, int, const char * const *); +_STUB_1(NVRTC, nvrtcGetErrorString, const char 
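[Editor's note] The _STUB_N macros above all follow the same lazy-binding pattern: on first call, look the symbol up by name, cache the raw function pointer in the dispatch table, and forward the arguments. A self-contained, POSIX-only sketch of that pattern against libm (dlopen/dlsym are the real POSIX calls; the table, soname, and function names are illustrative). Build with something like: g++ -std=c++17 sketch.cpp -ldl.

#include <dlfcn.h>
#include <iostream>
#include <stdexcept>

// Dispatch table holding raw function pointers, analogous to the NVRTC struct.
struct MathTable {
  double (*cos)(double);
};

MathTable lazy_math;  // filled in on first use

void* get_libm() {
  // A real loader dlopens a specific soname, as getNVRTCLibrary() does above.
  static void* handle = dlopen("libm.so.6", RTLD_LAZY);
  if (!handle) throw std::runtime_error("can't load libm");
  return handle;
}

// Stub: resolves the symbol once, caches it, then forwards the call.
double cos_stub(double x) {
  auto fn = reinterpret_cast<double (*)(double)>(dlsym(get_libm(), "cos"));
  if (!fn) throw std::runtime_error("can't resolve cos");
  lazy_math.cos = fn;  // subsequent calls go straight to libm
  return fn(x);
}

int main() {
  lazy_math.cos = &cos_stub;                // table starts out pointing at the stubs
  std::cout << lazy_math.cos(0.0) << '\n';  // resolves, then prints 1
  std::cout << lazy_math.cos(0.0) << '\n';  // now calls libm's cos directly
}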
*, nvrtcResult); +NVRTC_STUB2(nvrtcGetProgramLogSize,nvrtcProgram, size_t*); +NVRTC_STUB2(nvrtcGetProgramLog, nvrtcProgram, char *); +NVRTC_STUB3(nvrtcGetLoweredName, nvrtcProgram, const char *, const char **); + +CUDA_STUB2(cuModuleLoadData, CUmodule *, const void *); +CUDA_STUB3(cuModuleGetFunction, CUfunction *, CUmodule, const char *); +CUDA_STUB4(cuOccupancyMaxActiveBlocksPerMultiprocessor, int *, CUfunction, int, size_t); +CUDA_STUB2(cuGetErrorString, CUresult, const char **); +CUDA_STUB1(cuCtxGetCurrent, CUcontext *); +CUDA_STUB1(cuModuleUnload, CUmodule); +CUDA_STUB3(cuDevicePrimaryCtxGetState, CUdevice, unsigned int *, int *); +CUDA_STUB4(cuLinkCreate, unsigned int, CUjit_option *, void **, CUlinkState *); +CUDA_STUB3(cuLinkComplete, CUlinkState, void **, size_t *); + +// Irregularly shaped functions +CUresult CUDAAPI cuLaunchKernel(CUfunction f, + unsigned int gridDimX, + unsigned int gridDimY, + unsigned int gridDimZ, + unsigned int blockDimX, + unsigned int blockDimY, + unsigned int blockDimZ, + unsigned int sharedMemBytes, + CUstream hStream, + void **kernelParams, + void **extra) { + auto fn = reinterpret_cast(getCUDALibrary().sym(__func__)); + if (!fn) + throw std::runtime_error("Can't get cuLaunchKernel"); + lazyNVRTC.cuLaunchKernel = fn; + return fn(f, + gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, + sharedMemBytes, hStream, kernelParams, extra); +} + +CUresult CUDAAPI cuModuleLoadDataEx(CUmodule *module, + const void *image, + unsigned int numOptions, + CUjit_option *options, + void **optionValues) { + auto fn = reinterpret_cast(getCUDALibrary().sym(__func__)); + if (!fn) + throw std::runtime_error("Can't get cuModuleLoadDataEx"); + lazyNVRTC.cuModuleLoadDataEx = fn; + return fn(module, image, numOptions, options, optionValues); +} + +CUresult CUDAAPI +cuLinkAddData(CUlinkState state, + CUjitInputType type, + void *data, + size_t size, + const char *name, + unsigned int numOptions, + CUjit_option *options, + void **optionValues) { + auto fn = reinterpret_cast(getCUDALibrary().sym(__func__)); + if (!fn) + throw std::runtime_error("Can't get cuLinkAddData"); + lazyNVRTC.cuLinkAddData = fn; + return fn(state, type, data, size, name, numOptions, options, optionValues); +} + +} // namespace _stubs + +NVRTC lazyNVRTC = { +#define _REFERENCE_MEMBER(name) _stubs::name, + AT_FORALL_NVRTC(_REFERENCE_MEMBER) +#undef _REFERENCE_MEMBER +}; +} // namespace detail +} // namespace cuda +} // namespace at diff --git a/aten/src/ATen/cuda/detail/LazyNVRTC.h b/aten/src/ATen/cuda/detail/LazyNVRTC.h new file mode 100644 index 000000000000..810e1c322dbd --- /dev/null +++ b/aten/src/ATen/cuda/detail/LazyNVRTC.h @@ -0,0 +1,11 @@ +#pragma once +#include +namespace at { namespace cuda { +// Forward-declares at::cuda::NVRTC +struct NVRTC; + +namespace detail { +extern NVRTC lazyNVRTC; +} + +}} // at::cuda::detail diff --git a/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h b/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h index 4630465115c7..00e57ca63520 100644 --- a/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h +++ b/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h @@ -42,6 +42,7 @@ namespace at { namespace cuda { _(nvrtcGetProgramLog) \ _(nvrtcGetLoweredName) \ _(cuModuleLoadData) \ + _(cuModuleLoadDataEx) \ _(cuModuleGetFunction) \ _(cuOccupancyMaxActiveBlocksPerMultiprocessor) \ _(cuGetErrorString) \ diff --git a/aten/src/ATen/cudnn/Descriptors.cpp b/aten/src/ATen/cudnn/Descriptors.cpp index 2863212a03a8..aba7b407162f 100644 --- a/aten/src/ATen/cudnn/Descriptors.cpp +++ 
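[Editor's note] The lazyNVRTC initializer at the end of the new file uses the usual X-macro trick: one FORALL macro enumerates the API names, and a per-use macro expands each name into both the struct member and its initial stub entry. A tiny self-contained version of the same trick; all names here are illustrative.

#include <iostream>

// One list of API names, reused for the struct layout and its initializer.
#define FORALL_API(_) \
  _(open)             \
  _(close)

int stub_open()  { std::cout << "open stub\n";  return 0; }
int stub_close() { std::cout << "close stub\n"; return 0; }

struct Api {
#define DECLARE_MEMBER(name) int (*name)();
  FORALL_API(DECLARE_MEMBER)
#undef DECLARE_MEMBER
};

// Each entry initially points at its stub, mirroring _REFERENCE_MEMBER above.
Api lazy_api = {
#define REFERENCE_MEMBER(name) stub_##name,
  FORALL_API(REFERENCE_MEMBER)
#undef REFERENCE_MEMBER
};

int main() {
  lazy_api.open();   // prints: open stub
  lazy_api.close();  // prints: close stub
}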
b/aten/src/ATen/cudnn/Descriptors.cpp @@ -4,7 +4,6 @@ #include #include -#include namespace at { namespace native { @@ -144,4 +143,38 @@ void FilterDescriptor::set(const at::Tensor &t, int64_t pad, bool force_nhwc) { set(getDataType(t), (int) dim, size, filter_format); } +std::string cudnnMemoryFormatToString(cudnnTensorFormat_t tformat) { + switch (tformat) { + case CUDNN_TENSOR_NCHW: + return "CUDNN_TENSOR_NCHW"; + case CUDNN_TENSOR_NHWC: + return "CUDNN_TENSOR_NHWC"; + default: + std::ostringstream oss; + oss << "(unknown cudnn tensor format " << static_cast(tformat) << ")"; + return oss.str(); + } +} + +std::ostream& operator<<(std::ostream & out, const FilterDescriptor& d) { + out << "FilterDescriptor " << static_cast(d.desc()) << "\n"; + int nbDims; + int dimA[CUDNN_DIM_MAX]; + cudnnDataType_t dtype; + cudnnTensorFormat_t tformat; + cudnnGetFilterNdDescriptor(d.desc(), CUDNN_DIM_MAX, &dtype, &tformat, &nbDims, dimA); + out << " type = " << cudnnTypeToString(dtype) << "\n"; + out << " tensor_format = " << cudnnMemoryFormatToString(tformat) << "\n"; + out << " nbDims = " << nbDims << "\n"; + // Read out only nbDims of the arrays! + out << " dimA = "; + for (auto i : ArrayRef{dimA, static_cast(nbDims)}) { + out << i << ", "; + } + out << "\n"; + return out; +} + +void FilterDescriptor::print() { std::cout << *this; } + }} diff --git a/aten/src/ATen/cudnn/Descriptors.h b/aten/src/ATen/cudnn/Descriptors.h index 04e027491709..2aed3f66632f 100644 --- a/aten/src/ATen/cudnn/Descriptors.h +++ b/aten/src/ATen/cudnn/Descriptors.h @@ -1,5 +1,7 @@ #pragma once +#include + #include #include @@ -12,6 +14,8 @@ namespace at { namespace native { +std::string cudnnTypeToString(cudnnDataType_t dtype); + // TODO: Add constructors for all of the descriptors inline int dataSize(cudnnDataType_t dataType) @@ -153,12 +157,15 @@ class TORCH_CUDA_API FilterDescriptor public: void set(const at::Tensor &t, int64_t pad = 0, bool force_nhwc = false); + void print(); private: void set(cudnnDataType_t dataType, int dim, int* size, cudnnTensorFormat_t filter_format) { AT_CUDNN_CHECK(cudnnSetFilterNdDescriptor(mut_desc(), dataType, filter_format, dim, size)); } }; +std::ostream& operator<<(std::ostream & out, const FilterDescriptor& d); + struct TORCH_CUDA_API ConvolutionDescriptor : public Descriptor *a, int *lda, double *w, std::complex *work, int *lwork, double *rwork, int *info); +extern "C" void cheev_(char *jobz, char *uplo, int *n, std::complex *a, int *lda, float *w, std::complex *work, int *lwork, float *rwork, int *info); extern "C" void dsyev_(char *jobz, char *uplo, int *n, double *a, int *lda, double *w, double *work, int *lwork, int *info); extern "C" void ssyev_(char *jobz, char *uplo, int *n, float *a, int *lda, float *w, float *work, int *lwork, int *info); // gesdd extern "C" void zgesdd_(char *jobz, int *m, int *n, std::complex *a, int *lda, - double *s, std::complex *u, int *ldu, std::complex *vt, int *ldvt, std::complex *work, int *lwork, int *rwork, int *iwork, int *info); + double *s, std::complex *u, int *ldu, std::complex *vt, int *ldvt, std::complex *work, int *lwork, double *rwork, int *iwork, int *info); extern "C" void cgesdd_(char *jobz, int *m, int *n, std::complex *a, int *lda, - float *s, std::complex *u, int *ldu, std::complex *vt, int *ldvt, std::complex *work, int *lwork, int *rwork, int *iwork, int *info); + float *s, std::complex *u, int *ldu, std::complex *vt, int *ldvt, std::complex *work, int *lwork, float *rwork, int *iwork, int *info); extern "C" void dgesdd_(char *jobz, int 
*m, int *n, double *a, int *lda, double *s, double *u, int *ldu, double *vt, int *ldvt, double *work, int *lwork, int *iwork, int *info); extern "C" void sgesdd_(char *jobz, int *m, int *n, float *a, int *lda, @@ -116,12 +118,12 @@ void lapackGeqrf(int m, int n, scalar_t *a, int lda, scalar_t *tau, scalar_t *wo template void lapackOrgqr(int m, int n, int k, scalar_t *a, int lda, scalar_t *tau, scalar_t *work, int lwork, int *info); -template -void lapackSymeig(char jobz, char uplo, int n, scalar_t *a, int lda, scalar_t *w, scalar_t *work, int lwork, int *info); +template +void lapackSymeig(char jobz, char uplo, int n, scalar_t *a, int lda, value_t *w, scalar_t *work, int lwork, value_t *rwork, int *info); template void lapackSvd(char jobz, int m, int n, scalar_t *a, int lda, - value_t *s, scalar_t *u, int ldu, scalar_t *vt, int ldvt, scalar_t *work, int lwork, int *rwork, int *iwork, int *info); + value_t *s, scalar_t *u, int ldu, scalar_t *vt, int ldvt, scalar_t *work, int lwork, value_t *rwork, int *iwork, int *info); template void lapackLuSolve(char trans, int n, int nrhs, scalar_t *a, int lda, int *ipiv, scalar_t *b, int ldb, int *info); @@ -255,33 +257,43 @@ template<> void lapackOrgqr(int m, int n, int k, float *a, int lda, float sorgqr_(&m, &n, &k, a, &lda, tau, work, &lwork, info); } -template<> void lapackSymeig(char jobz, char uplo, int n, double *a, int lda, double *w, double *work, int lwork, int *info) { +template<> void lapackSymeig, double>(char jobz, char uplo, int n, c10::complex *a, int lda, double *w, c10::complex *work, int lwork, double *rwork, int *info) { + zheev_(&jobz, &uplo, &n, reinterpret_cast*>(a), &lda, w, reinterpret_cast*>(work), &lwork, rwork, info); +} + +template<> void lapackSymeig, float>(char jobz, char uplo, int n, c10::complex *a, int lda, float *w, c10::complex *work, int lwork, float *rwork, int *info) { + cheev_(&jobz, &uplo, &n, reinterpret_cast*>(a), &lda, w, reinterpret_cast*>(work), &lwork, rwork, info); +} + +template<> void lapackSymeig(char jobz, char uplo, int n, double *a, int lda, double *w, double *work, int lwork, double* rwork, int *info) { + (void)rwork; // unused dsyev_(&jobz, &uplo, &n, a, &lda, w, work, &lwork, info); } -template<> void lapackSymeig(char jobz, char uplo, int n, float *a, int lda, float *w, float *work, int lwork, int *info) { +template<> void lapackSymeig(char jobz, char uplo, int n, float *a, int lda, float *w, float *work, int lwork, float* rwork, int *info) { + (void)rwork; // unused ssyev_(&jobz, &uplo, &n, a, &lda, w, work, &lwork, info); } template<> void lapackSvd, double>(char jobz, int m, int n, c10::complex *a, int lda, - double *s, c10::complex *u, int ldu, c10::complex *vt, int ldvt, c10::complex *work, int lwork, int *rwork, int *iwork, int *info) { + double *s, c10::complex *u, int ldu, c10::complex *vt, int ldvt, c10::complex *work, int lwork, double *rwork, int *iwork, int *info) { zgesdd_(&jobz, &m, &n, reinterpret_cast*>(a), &lda, s, reinterpret_cast*>(u), &ldu, reinterpret_cast*>(vt), &ldvt, reinterpret_cast*>(work), &lwork, rwork, iwork, info); } template<> void lapackSvd, float>(char jobz, int m, int n, c10::complex *a, int lda, - float *s, c10::complex *u, int ldu, c10::complex *vt, int ldvt, c10::complex *work, int lwork, int *rwork, int *iwork, int *info) { + float *s, c10::complex *u, int ldu, c10::complex *vt, int ldvt, c10::complex *work, int lwork, float *rwork, int *iwork, int *info) { cgesdd_(&jobz, &m, &n, reinterpret_cast*>(a), &lda, s, reinterpret_cast*>(u), &ldu, 
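[Editor's note] For the complex paths, apply_symeig above now allocates the real-valued rwork buffer that cheev_/zheev_ require, sized max(1, 3n - 2). A small helper mirroring just that sizing rule; symeig_lrwork is an illustrative name.

#include <algorithm>
#include <cstdint>
#include <iostream>

// rwork length used by LAPACK's cheev/zheev: at least 1, and 3n - 2 otherwise.
int64_t symeig_lrwork(int64_t n) {
  return std::max<int64_t>(1, 3 * n - 2);
}

int main() {
  for (int64_t n : {0, 1, 4, 100}) {
    std::cout << "n=" << n << " lrwork=" << symeig_lrwork(n) << '\n';  // 1, 1, 10, 298
  }
}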
reinterpret_cast*>(vt), &ldvt, reinterpret_cast*>(work), &lwork, rwork, iwork, info); } template<> void lapackSvd(char jobz, int m, int n, double *a, int lda, - double *s, double *u, int ldu, double *vt, int ldvt, double *work, int lwork, int *rwork, int *iwork, int *info) { + double *s, double *u, int ldu, double *vt, int ldvt, double *work, int lwork, double *rwork, int *iwork, int *info) { dgesdd_(&jobz, &m, &n, a, &lda, s, u, &ldu, vt, &ldvt, work, &lwork, iwork, info); } template<> void lapackSvd(char jobz, int m, int n, float *a, int lda, - float *s, float *u, int ldu, float *vt, int ldvt, float *work, int lwork, int *rwork, int *iwork, int *info) { + float *s, float *u, int ldu, float *vt, int ldvt, float *work, int lwork, float *rwork, int *iwork, int *info) { sgesdd_(&jobz, &m, &n, a, &lda, s, u, &ldu, vt, &ldvt, work, &lwork, iwork, info); } @@ -859,7 +871,7 @@ static void apply_symeig(Tensor& self, Tensor& eigvals, bool eigenvectors, bool #else using value_t = typename c10::scalar_value_type::type; auto self_data = self.data_ptr(); - auto eigvals_data = eigvals.data_ptr(); + auto eigvals_data = eigvals.data_ptr(); auto self_matrix_stride = matrixStride(self); auto eigvals_stride = eigvals.size(-1); auto batch_size = batchCount(self); @@ -875,16 +887,26 @@ static void apply_symeig(Tensor& self, Tensor& eigvals, bool eigenvectors, bool // and (batch_size - 1) calls to allocate and deallocate workspace using at::empty() int lwork = -1; scalar_t wkopt; - lapackSymeig(jobz, uplo, n, self_data, n, eigvals_data, &wkopt, lwork, &info); + + Tensor rwork; + value_t* rwork_data = nullptr; + if (isComplexType(at::typeMetaToScalarType(self.dtype()))) { + int64_t lrwork = std::max(int64_t(1), 3 * n - 2); + ScalarType dtype = toValueType(typeMetaToScalarType(self.dtype())); + rwork = at::empty({lrwork}, self.options().dtype(dtype)); + rwork_data = rwork.data_ptr(); + } + + lapackSymeig(jobz, uplo, n, self_data, n, eigvals_data, &wkopt, lwork, rwork_data, &info); lwork = static_cast(real_impl(wkopt)); Tensor work = at::empty({lwork}, self.options()); for (int64_t i = 0; i < batch_size; i++) { scalar_t* self_working_ptr = &self_data[i * self_matrix_stride]; - scalar_t* eigvals_working_ptr = &eigvals_data[i * eigvals_stride]; + value_t* eigvals_working_ptr = &eigvals_data[i * eigvals_stride]; // now compute the eigenvalues and the eigenvectors (optionally) - lapackSymeig(jobz, uplo, n, self_working_ptr, n, eigvals_working_ptr, work.data_ptr(), lwork, &info); + lapackSymeig(jobz, uplo, n, self_working_ptr, n, eigvals_working_ptr, work.data_ptr(), lwork, rwork_data, &info); infos[i] = info; if (info != 0) { return; @@ -898,14 +920,15 @@ std::tuple _symeig_helper_cpu(const Tensor& self, bool eigenvect auto self_sizes = self.sizes().vec(); self_sizes.pop_back(); - auto eigvals = at::empty(self_sizes, self.options()); + ScalarType dtype = toValueType(typeMetaToScalarType(self.dtype())); + auto eigvals = at::empty(self_sizes, self.options().dtype(dtype)); if (self.numel() == 0) { return std::tuple(eigvals, at::empty_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT)); } auto self_working_copy = cloneBatchedColumnMajor(self); - AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "symeig_cpu", [&]{ + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(self.scalar_type(), "symeig_cpu", [&]{ apply_symeig(self_working_copy, eigvals, eigenvectors, upper, infos); }); @@ -958,22 +981,15 @@ static void apply_svd(Tensor& self, Tensor& U, Tensor& S, Tensor& VT, auto m = self.size(-2); auto n = self.size(-1); auto mn = std::min(m, n); - 
Tensor iwork = at::empty({8*mn}, at::kInt); + Tensor iwork = at::empty({8 * mn}, at::kInt); auto iwork_data = iwork.data_ptr(); Tensor rwork; - int* rwork_data = nullptr; + value_t* rwork_data = nullptr; if (isComplexType(at::typeMetaToScalarType(self.dtype()))) { - auto mx = std::max(m, n); - int64_t lrwork; // These settings are valid for on LAPACK 3.6+ - if (jobz == 'N'){ - lrwork = 7 * mn; - }else if (mx > 10 * mn){ - lrwork = 7 * mn * mn + 7 * mn; - } else { - lrwork = std::max(7 * mn * mn + 7 * mn, 2 * mx * mn + 2 *mn * mn + mn); - } - rwork = at::empty({std::max(int64_t(1), lrwork)}, at::kInt); - rwork_data = rwork.data_ptr(); + auto lrwork = computeLRWorkDim(jobz, m, n); + // rwork is an array of floats or doubles depending on the type + rwork = at::empty({std::max(int64_t(1), lrwork)}, at::typeMetaToScalarType(S.dtype())); + rwork_data = rwork.data_ptr(); } // Run once, first to get the optimum work size. @@ -992,7 +1008,7 @@ static void apply_svd(Tensor& self, Tensor& U, Tensor& S, Tensor& VT, value_t* S_working_ptr = &S_data[i * S_stride]; scalar_t* U_working_ptr = &U_data[i * U_stride]; scalar_t* VT_working_ptr = &VT_data[i * VT_stride]; - + // Compute S, U (optionally) and VT (optionally) lapackSvd(jobz, m, n, self_working_ptr, m, S_working_ptr, U_working_ptr, m, VT_working_ptr, n, work_data, lwork, rwork_data, iwork_data, &info); @@ -1008,7 +1024,7 @@ std::tuple _svd_helper_cpu(const Tensor& self, bool some std::vector infos(batchCount(self), 0); int64_t m = self.size(-2), n = self.size(-1); int64_t k = std::min(m, n); - + char jobz = compute_uv ? (some ? 'S' : 'A') : 'N'; Tensor U_working_copy, S_working_copy, VT_working_copy; diff --git a/aten/src/ATen/native/BinaryOps.cpp b/aten/src/ATen/native/BinaryOps.cpp index fc55379578ff..f8af756773c9 100644 --- a/aten/src/ATen/native/BinaryOps.cpp +++ b/aten/src/ATen/native/BinaryOps.cpp @@ -175,7 +175,7 @@ Tensor& divide_(Tensor& self, Scalar other) { // true_divide, an alias for div Tensor& true_divide_out(Tensor& result, const Tensor& self, const Tensor& divisor) { - return native::div_out(result, self, divisor); + return at::div_out(result, self, divisor); } Tensor true_divide(const Tensor& self, const Tensor& divisor) { @@ -390,14 +390,16 @@ Tensor rsub(const Tensor& self, const Tensor& other, Scalar alpha) { } Tensor& atan2_out(Tensor& result, const Tensor& self, const Tensor& other) { - auto iter = TensorIterator::binary_op(result, self, other); + auto iter = TensorIterator::binary_float_op(result, self, other); atan2_stub(iter.device_type(), iter); return result; } Tensor atan2(const Tensor& self, const Tensor& other) { - Tensor result = at::empty({0}, self.options()); - return native::atan2_out(result, self, other); + Tensor result; + auto iter = TensorIterator::binary_float_op(result, self, other); + atan2_stub(iter.device_type(), iter); + return iter.output(); } Tensor& atan2_(Tensor& self, const Tensor& other) { diff --git a/aten/src/ATen/native/BinaryOps.h b/aten/src/ATen/native/BinaryOps.h index e2dad35eb7ec..7640c8bd84ac 100644 --- a/aten/src/ATen/native/BinaryOps.h +++ b/aten/src/ATen/native/BinaryOps.h @@ -10,7 +10,8 @@ namespace at { namespace native { inline void alpha_check(const ScalarType dtype, Scalar alpha) { TORCH_CHECK(! 
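[Editor's note] apply_svd above replaces the inline rwork sizing (and its int-typed buffer) with a call to computeLRWorkDim and a correctly typed float/double buffer. A sketch of a sizing helper that reproduces the branches visible in the removed code (valid for LAPACK 3.6+ gesdd); the actual helper elsewhere in ATen may differ in detail.

#include <algorithm>
#include <cstdint>
#include <iostream>

// rwork length for c/z gesdd, following the branches the removed code used:
// jobz == 'N' needs 7*mn; otherwise the size depends on how rectangular the
// matrix is.
int64_t lrwork_gesdd(char jobz, int64_t m, int64_t n) {
  const int64_t mn = std::min(m, n);
  const int64_t mx = std::max(m, n);
  if (jobz == 'N') {
    return 7 * mn;
  }
  if (mx > 10 * mn) {
    return 7 * mn * mn + 7 * mn;
  }
  return std::max(7 * mn * mn + 7 * mn, 2 * mx * mn + 2 * mn * mn + mn);
}

int main() {
  std::cout << lrwork_gesdd('N', 5, 3) << '\n';    // 21
  std::cout << lrwork_gesdd('S', 100, 3) << '\n';  // 84
  std::cout << lrwork_gesdd('S', 5, 3) << '\n';    // max(84, 51) = 84
}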
alpha.isBoolean() || dtype == ScalarType::Bool, "Boolean alpha only supported for Boolean results."); - TORCH_CHECK(isFloatingType(dtype) || alpha.isIntegral(true), + TORCH_CHECK(isFloatingType(dtype) || isComplexType(dtype) + || alpha.isIntegral(true), "For integral input tensors, argument alpha must not be a floating point number."); } @@ -25,6 +26,7 @@ inline void sub_check(const Tensor& self, const Tensor& other) { } using binary_fn_alpha = void(*)(TensorIterator&, Scalar alpha); +using binary_fn_beta = void(*)(TensorIterator&, double beta); using binary_fn = void(*)(TensorIterator&); using binary_clamp_fn_alpha = void(*)(TensorIterator&, Scalar alpha, Scalar min_val, Scalar max_val); @@ -54,7 +56,7 @@ DECLARE_DISPATCH(binary_fn, max_elementwise_stub); DECLARE_DISPATCH(binary_fn, min_elementwise_stub); DECLARE_DISPATCH(binary_fn, maximum_stub); DECLARE_DISPATCH(binary_fn, minimum_stub); -DECLARE_DISPATCH(binary_fn, smooth_l1_stub); +DECLARE_DISPATCH(binary_fn_beta, smooth_l1_stub); DECLARE_DISPATCH(binary_fn, sigmoid_backward_stub); DECLARE_DISPATCH(binary_fn_alpha, logit_backward_stub); DECLARE_DISPATCH(binary_fn, tanh_backward_stub); diff --git a/aten/src/ATen/native/ComplexHelper.h b/aten/src/ATen/native/ComplexHelper.h index b8830691f47a..3fde6dbb77e1 100644 --- a/aten/src/ATen/native/ComplexHelper.h +++ b/aten/src/ATen/native/ComplexHelper.h @@ -4,12 +4,25 @@ namespace at { namespace native { -inline std::vector computeStrideForViewAsReal(IntArrayRef oldstride) { - auto res = oldstride.vec(); - for(size_t i = 0; i < res.size(); i++) { - res[i] = res[i] * 2; +// View tensor with new dtype, storage offset, sizes and strides +inline Tensor view_tensor( + const Tensor &tensor, ScalarType dtype, + int64_t offset, IntArrayRef sizes, IntArrayRef strides) { + Storage storage = tensor.storage(); + auto new_tensor = detail::make_tensor( + std::move(storage), tensor.key_set(), scalarTypeToTypeMeta(dtype)); + auto * impl = new_tensor.unsafeGetTensorImpl(); + impl->set_storage_offset(offset); + impl->set_sizes_and_strides(sizes, strides); + return new_tensor; +} + +inline DimVector computeStrideForViewAsReal(IntArrayRef oldstride) { + DimVector res(oldstride.size() + 1); + for(size_t i = 0; i < oldstride.size(); i++) { + res[i] = oldstride[i] * 2; } - res.emplace_back(1); + res.back() = 1; return res; } @@ -18,25 +31,25 @@ inline std::vector computeStrideForViewAsReal(IntArrayRef oldstride) { // in the last two dimensions Tensor view_as_real(const Tensor& self) { TORCH_CHECK(self.is_complex(), "view_as_real is only supported for complex tensors"); - auto new_sizes = self.sizes().vec(); + auto old_sizes = self.sizes(); + DimVector new_sizes(old_sizes.size() + 1); + std::copy(old_sizes.begin(), old_sizes.end(), new_sizes.begin()); // last dimension will always have two elements containing the real and imag vals - new_sizes.emplace_back(2); + new_sizes.back() = 2; auto new_strides = computeStrideForViewAsReal(self.strides()); auto new_storage_offset = 2 * self.storage_offset(); const auto float_type = c10::toValueType(self.scalar_type()); - return at::empty({0}, self.options().dtype(float_type)).set_(self.storage(), new_storage_offset, new_sizes, new_strides); + return view_tensor(self, float_type, new_storage_offset, new_sizes, new_strides); } -inline std::vector computeStrideForViewAsComplex(IntArrayRef oldstride) { - auto res = oldstride.vec(); - int dim = res.size(); - - TORCH_CHECK(res[dim-1] == 1, "Tensor must have a last dimension with stride 1"); - res.pop_back(); +inline DimVector 
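[Editor's note] view_as_real above reinterprets a complex tensor as a real one with a trailing dimension of size 2: every stride doubles, a unit stride is appended, and the storage offset doubles. A small sketch of just that size/stride arithmetic; view_as_real_geometry is an illustrative name.

#include <cstdint>
#include <iostream>
#include <vector>

// Given complex sizes/strides, produce the sizes/strides of the real view
// with a trailing dimension of 2, as computeStrideForViewAsReal does.
void view_as_real_geometry(const std::vector<int64_t>& sizes,
                           const std::vector<int64_t>& strides,
                           std::vector<int64_t>& new_sizes,
                           std::vector<int64_t>& new_strides) {
  new_sizes = sizes;
  new_sizes.push_back(2);          // real and imaginary parts
  new_strides.clear();
  for (int64_t s : strides) {
    new_strides.push_back(s * 2);  // each complex element spans two reals
  }
  new_strides.push_back(1);
}

int main() {
  std::vector<int64_t> sizes{2, 3}, strides{3, 1}, ns, nst;
  view_as_real_geometry(sizes, strides, ns, nst);
  for (auto v : ns) std::cout << v << ' ';   // 2 3 2
  std::cout << "| ";
  for (auto v : nst) std::cout << v << ' ';  // 6 2 1
  std::cout << '\n';
}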
computeStrideForViewAsComplex(IntArrayRef oldstride) { + const int64_t dim = oldstride.size(); + TORCH_CHECK(oldstride[dim-1] == 1, "Tensor must have a last dimension with stride 1"); - for (auto i = decltype(res.size()){0}; i < res.size(); i++) { - TORCH_CHECK(res[i] % 2 == 0, "Tensor must have a stride divisible by 2 for all but last dimension"); - res[i] = res[i] / 2; + DimVector res(dim - 1); + for (int64_t i = 0; i < res.size(); i++) { + TORCH_CHECK(oldstride[i] % 2 == 0, "Tensor must have a stride divisible by 2 for all but last dimension"); + res[i] = oldstride[i] / 2; } return res; } @@ -48,10 +61,10 @@ Tensor view_as_complex(const Tensor& self) { self.scalar_type() == kFloat || self.scalar_type() == kDouble || self.scalar_type() == kHalf, "view_as_complex is only supported for half, float and double tensors, but got a tensor of scalar type: ", self.scalar_type()); - TORCH_CHECK(self.dim() != 0, "Input tensor must have one or more dimensions"); - auto new_sizes = self.sizes().vec(); - TORCH_CHECK(new_sizes[self.dim()-1] == 2, "Tensor must have a last dimension of size 2"); - new_sizes.pop_back(); + auto old_sizes = self.sizes(); + TORCH_CHECK(old_sizes.size() != 0, "Input tensor must have one or more dimensions"); + TORCH_CHECK(old_sizes[old_sizes.size()-1] == 2, "Tensor must have a last dimension of size 2"); + DimVector new_sizes(old_sizes.begin(), old_sizes.end() - 1); const auto new_strides = computeStrideForViewAsComplex(self.strides()); const auto complex_type = c10::toComplexType(self.scalar_type()); @@ -59,7 +72,7 @@ Tensor view_as_complex(const Tensor& self) { TORCH_CHECK(self.storage_offset() % 2 == 0, "Tensor must have a storage_offset divisible by 2"); const auto new_storage_offset = self.storage_offset() / 2; - return at::empty({0}, self.options().dtype(complex_type)).set_(self.storage(), new_storage_offset, new_sizes, new_strides); + return view_tensor(self, complex_type, new_storage_offset, new_sizes, new_strides); } }} // namespace at::native diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp index ea7903369e93..aa3a2debfe0a 100644 --- a/aten/src/ATen/native/Convolution.cpp +++ b/aten/src/ATen/native/Convolution.cpp @@ -62,6 +62,7 @@ std::ostream& operator<<(std::ostream & out, const ConvParams& params) { << " benchmark = " << params.benchmark << " deterministic = " << params.deterministic << " cudnn_enabled = " << params.cudnn_enabled + << " allow_tf32 = " << params.allow_tf32 << "}"; return out; } @@ -198,6 +199,9 @@ auto ConvParams::use_cudnn(const at::Tensor& input, const at::Tensor& weight) co if (!input.is_cuda() || !cudnn_enabled) { return false; } + if (input.scalar_type() == at::kBFloat16 || weight.scalar_type() == at::kBFloat16) { + return false; + } if (!cudnn_conv_use_channels_last(input, weight)) { // bypass dilation checks for channels-last convolution if (deterministic && is_dilated()) { // cudnn doesn't support deterministic dilated convolution fully yet diff --git a/aten/src/ATen/native/ConvolutionMM2d.cpp b/aten/src/ATen/native/ConvolutionMM2d.cpp index 9bc6b476e221..6a0ca1e67900 100644 --- a/aten/src/ATen/native/ConvolutionMM2d.cpp +++ b/aten/src/ATen/native/ConvolutionMM2d.cpp @@ -191,11 +191,10 @@ static void slow_conv2d_update_output_frame( output.reshape({n_output_plane, output_height * output_width}); if (bias.defined()) { output.copy_(bias.unsqueeze(-1).unsqueeze(-1)); + output2d.addmm_(weight, finput, 1, 1); } else { - output.zero_(); + output2d.addmm_(weight, finput, 0, 1); } - - 
output2d.addmm_(weight, finput, 1, 1); } void slow_conv2d_backward_update_grad_input_frame( @@ -434,16 +433,23 @@ std::tuple slow_conv2d_forward_out_cpu( const int64_t batch_size = input.size(0); - finput.resize_({batch_size, + if ((input.ndimension() == 4) && (kernel_height == 1) && (stride_height == 1) && (pad_height == 0) && + (kernel_width == 1) && (stride_width == 1) && (pad_width == 0)) { + finput = + input.view({batch_size, n_input_plane, output_height * output_width}) + .detach(); + } else { + finput.resize_({batch_size, n_input_plane * kernel_height * kernel_width, output_height * output_width}); + } output.resize_({batch_size, n_output_plane, output_height, output_width}); at::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) { NoGradGuard no_grad; AutoNonVariableTypeMode non_variable_type_mode; for (int64_t t = start; t < end; t++) { - Tensor input_t = input[t]; + Tensor input_t = input[t].unsqueeze(0); Tensor output_t = output[t]; Tensor finput_t = finput[t]; slow_conv2d_update_output_frame( diff --git a/aten/src/ATen/native/ConvolutionMM3d.cpp b/aten/src/ATen/native/ConvolutionMM3d.cpp index d5a29a3abbe1..95263617e2a8 100644 --- a/aten/src/ATen/native/ConvolutionMM3d.cpp +++ b/aten/src/ATen/native/ConvolutionMM3d.cpp @@ -581,9 +581,15 @@ std::tuple slow_conv3d_forward_out_cpu( (input_width + 2 * pad_width - kernel_width) / stride_width + 1; const int64_t batch_size = input.size(0); - finput.resize_({batch_size, - n_input_plane * kernel_depth * kernel_height * kernel_width, - output_depth * output_height * output_width}); + if ((kernel_depth == 1) && (kernel_height == 1) && (kernel_width == 1) && + (pad_depth == 0) && (pad_height == 0) && (pad_width == 0) && + (stride_depth == 1) && (stride_height == 1) && (stride_width == 1) && (groups == 1)) { + finput = input.view({batch_size, n_input_plane, output_height * output_width * output_depth}).detach(); + } else { + finput.resize_({batch_size, + n_input_plane * kernel_depth * kernel_height * kernel_width, + output_depth * output_height * output_width}); + } output.resize_( {batch_size, n_output_plane, output_depth, output_height, output_width}); diff --git a/aten/src/ATen/native/Copy.cpp b/aten/src/ATen/native/Copy.cpp index 79fb0a11fba4..23b81a655507 100644 --- a/aten/src/ATen/native/Copy.cpp +++ b/aten/src/ATen/native/Copy.cpp @@ -34,7 +34,7 @@ void copy_same_type_transpose_(Tensor& self, const Tensor& src) { } Tensor buf = empty({BLOCK_SZ, BLOCK_SZ}, self.options()); - AT_DISPATCH_ALL_TYPES_AND3(kHalf, kBool, kBFloat16, self.scalar_type(), "copy_", [&] { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kHalf, kBool, kBFloat16, self.scalar_type(), "copy_", [&] { scalar_t* sp = src.data_ptr(); scalar_t* rp = self.data_ptr(); scalar_t* bp = buf.data_ptr(); diff --git a/aten/src/ATen/native/ForeachOpsKernels.cpp b/aten/src/ATen/native/ForeachOpsKernels.cpp index 912b5116c4cc..73eb2070c07d 100644 --- a/aten/src/ATen/native/ForeachOpsKernels.cpp +++ b/aten/src/ATen/native/ForeachOpsKernels.cpp @@ -24,6 +24,26 @@ std::vector foreach_tensor_##NAME##_scalar_kernel_slow(TensorList tensor return result; \ } +#define FOREACH_BINARY_OP_SCALARLIST(NAME) \ +void foreach_tensor_##NAME##_scalarlist_kernel_slow_(TensorList tensors, at::ArrayRef scalars) { \ + check_foreach_api_restrictions(tensors, scalars); \ + \ + for (int i = 0; i < tensors.size(); i++) { \ + tensors[i].NAME##_(scalars[i]); \ + } \ +} \ + \ +std::vector foreach_tensor_##NAME##_scalarlist_kernel_slow(TensorList tensors, at::ArrayRef scalars) { \ + 
check_foreach_api_restrictions(tensors, scalars); \ + std::vector result; \ + result.reserve(tensors.size()); \ + for (int i = 0; i < tensors.size(); i++) { \ + result.emplace_back(tensors[i].NAME(scalars[i])); \ + } \ + \ + return result; \ +} + #define FOREACH_BINARY_OP_LIST(NAME) \ std::vector foreach_tensor_##NAME##_list_kernel_slow(TensorList tensors1, TensorList tensors2) { \ check_foreach_api_restrictions(tensors1, tensors2); \ @@ -117,6 +137,10 @@ FOREACH_BINARY_OP_SCALAR(add); FOREACH_BINARY_OP_SCALAR(sub); FOREACH_BINARY_OP_SCALAR(mul); FOREACH_BINARY_OP_SCALAR(div); +FOREACH_BINARY_OP_SCALARLIST(add); +FOREACH_BINARY_OP_SCALARLIST(sub); +FOREACH_BINARY_OP_SCALARLIST(mul); +FOREACH_BINARY_OP_SCALARLIST(div); FOREACH_BINARY_OP_LIST(mul); FOREACH_BINARY_OP_LIST(div); FOREACH_UNARY_OP(sqrt); diff --git a/aten/src/ATen/native/ForeachUtils.h b/aten/src/ATen/native/ForeachUtils.h index 5a7aced74702..f634d4804a6d 100644 --- a/aten/src/ATen/native/ForeachUtils.h +++ b/aten/src/ATen/native/ForeachUtils.h @@ -31,6 +31,12 @@ void check_foreach_api_restrictions(TensorList tensors1, TensorList tensors2) { } } +void check_foreach_api_restrictions(TensorList tensors, ArrayRef scalars) { + TORCH_CHECK(tensors.size() > 0, "Tensor list must have at least one tensor."); + TORCH_CHECK(scalars.size() > 0, "Scalars list must have at least one value."); + TORCH_CHECK(tensors.size() == scalars.size(), "Tensor list must have same number of elements as scalar list."); +} + // To go via 'fast' path, several conditions must be satisfied // - All tensors must be on the same device // - All tensors must have strided layout @@ -68,7 +74,7 @@ bool can_use_fast_route(TensorList tensors, Scalar scalar) { return false; } - // integral scalar + boolean tensor will result in integral tensor + // integral scalar + boolean tensor will result in integral tensor if (scalar.isIntegral(/*includeBool*/ false) && t.dtype() == at::kBool) { return false; } @@ -83,17 +89,17 @@ bool can_use_fast_route(TensorList tensors1, TensorList tensors2) { for (int64_t i = 0; i < tensors1.size(); i++) { TORCH_CHECK(tensors1[i].sizes() == tensors2[i].sizes(), "Corresponding tensors from tensor lists have different size."); - if (tensors1[i].device() != expected_device || + if (tensors1[i].device() != expected_device || tensors2[i].device() != expected_device) { return false; } - if (tensors1[i].layout() != at::kStrided || + if (tensors1[i].layout() != at::kStrided || tensors2[i].layout() != at::kStrided) { return false; } - if (tensors1[i].device() != expected_device || + if (tensors1[i].device() != expected_device || tensors2[i].device() != expected_device) { return false; } @@ -102,7 +108,7 @@ bool can_use_fast_route(TensorList tensors1, TensorList tensors2) { return false; } - if (!tensors1[i].is_non_overlapping_and_dense() || + if (!tensors1[i].is_non_overlapping_and_dense() || !tensors2[i].is_non_overlapping_and_dense()) { return false; } @@ -132,5 +138,13 @@ bool can_use_fast_route(TensorList tensors) { return true; } +bool can_use_fast_route(TensorList tensors, ArrayRef scalars) { + TORCH_CHECK(tensors.size() > 0, "Tensor list must have at least one tensor."); + TORCH_CHECK(scalars.size() > 0, "Scalars list must have at least one value."); + TORCH_CHECK(tensors.size() == scalars.size(), "Tensor list must have same number of elements as scalar list."); + + return can_use_fast_route(tensors); +} + } }} // at::native diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index 
e93eb11f642c..9c3742c129de 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -143,50 +143,61 @@ static void check_1d(const Tensor& t, const char* arg, const char* fn) { } Tensor addr(const Tensor& self, const Tensor& vec1, const Tensor& vec2, Scalar beta, Scalar alpha) { - check_1d(vec1, "vec1", "addr"); - check_1d(vec2, "vec2", "addr"); - Tensor b_self; - std::tie(b_self) = expand_size(self, {vec1.size(0), vec2.size(0)}, "addr"); - return at::_addr(b_self, vec1, vec2, beta, alpha); + TORCH_WARN( + "torch.addr is deprecated and may be removed in a future PyTorch release. " + "This function can be implemented using torch.outer as " + "alpha * torch.outer(vec1, vec2) + beta * input when beta is not zero, " + "alpha * torch.outer(vec1, vec2) when beta is zero."); + + Tensor outer_result = at::outer(vec1, vec2) * alpha; + if (beta.to() == 0.0) { + return outer_result; + } + return outer_result + (self * beta); } Tensor& addr_(Tensor& self, const Tensor& vec1, const Tensor& vec2, Scalar beta, Scalar alpha) { - check_1d(vec1, "vec1", "addr"); - check_1d(vec2, "vec2", "addr"); - return at::_addr_(self, vec1, vec2, beta, alpha); + return at::addr_out(self, self, vec1, vec2, beta, alpha); } Tensor& addr_out(Tensor &result, const Tensor& self, const Tensor& vec1, const Tensor& vec2, Scalar beta, Scalar alpha) { - check_1d(vec1, "vec1", "addr"); - check_1d(vec2, "vec2", "addr"); - Tensor b_self; - std::tie(b_self) = expand_size(self, {vec1.size(0), vec2.size(0)}, "addr_out"); - return at::_addr_out(result, b_self, vec1, vec2, beta, alpha); + auto addr_result = at::addr(self, vec1, vec2, beta, alpha); + // Validates safe casting + const auto result_dtype = addr_result.scalar_type(); + TORCH_CHECK(canCast(result_dtype, result.scalar_type()), + "result type ", result_dtype, + " can't be cast to the desired output type ", result.scalar_type()); + + at::native::resize_output(result, addr_result.sizes().vec()); + result.copy_(addr_result); + return result; } +// torch.ger, alias for torch.outer Tensor& ger_out(Tensor &result, const Tensor& self, const Tensor& vec2) { - check_1d(self, "self", "ger"); - check_1d(vec2, "vec2", "ger"); - if (result.dim() != 2 || result.size(0) != self.size(0) || result.size(1) != vec2.size(0)) { - result.resize_({ self.size(0), vec2.size(0) }); - } - // resize_ does the "broadcasting", don't need to broadcast again. - return at::_addr_out(result, result, self, vec2, Scalar(0), Scalar(1)); + TORCH_WARN("torch.ger is deprecated and will be removed in a future PyTorch release. 
" + "Use torch.outer instead."); + return at::outer_out(result, self, vec2); } Tensor ger(const Tensor& self, const Tensor& vec2) { - Tensor result = at::empty({0}, self.options()); - at::ger_out(result, self, vec2); - return result; + return self.outer(vec2); } -// torch.outer, alias for torch.ger Tensor& outer_out(Tensor &result, const Tensor& self, const Tensor& vec2) { - return at::ger_out(result, self, vec2); + check_1d(self, "self", "outer"); + check_1d(vec2, "vec2", "outer"); + + // torch.outer is implemented as a composite op using reshape and mul + at::mul_out(result, self.reshape({self.size(0), 1}), vec2); + return result; } Tensor outer(const Tensor& self, const Tensor& vec2) { - return self.ger(vec2); + check_1d(self, "self", "outer"); + check_1d(vec2, "vec2", "outer"); + + return self.reshape({self.size(0), 1}) * vec2; } static void addmm_impl_cpu_( @@ -1223,6 +1234,8 @@ Tensor matrix_exp(const Tensor& a) { "matrix_exp(", a.scalar_type(), "{", a.sizes(), "}): expected a tensor " "of squared matrices"); + NoTF32Guard disable_tf32; + if (a.size(-1) == 1) { return a.exp(); } @@ -1231,6 +1244,7 @@ Tensor matrix_exp(const Tensor& a) { } Tensor matrix_exp_backward(const Tensor& self, const Tensor& grad) { + NoTF32Guard disable_tf32; return backward_analytic_function_of_a_matrix( self, grad, [](const Tensor& a) { diff --git a/aten/src/ATen/native/LinearAlgebraUtils.h b/aten/src/ATen/native/LinearAlgebraUtils.h index 5c07700f1e85..4a6af18a5a96 100644 --- a/aten/src/ATen/native/LinearAlgebraUtils.h +++ b/aten/src/ATen/native/LinearAlgebraUtils.h @@ -318,4 +318,19 @@ static inline std::vector create_reverse_permutation(std::vector 10 * mn) { + return 5 * mn * mn + 5 * mn; + } + return std::max(5 * mn * mn + 5 * mn, 2 * mx * mn + 2 * mn * mn + mn); +} + }} // namespace at::native diff --git a/aten/src/ATen/native/Loss.cpp b/aten/src/ATen/native/Loss.cpp index 8dc5432d8a8c..2a3e97cf5dd8 100644 --- a/aten/src/ATen/native/Loss.cpp +++ b/aten/src/ATen/native/Loss.cpp @@ -295,24 +295,41 @@ Tensor soft_margin_loss( return output; } -Tensor smooth_l1_loss(const Tensor& input, const Tensor& target, const int64_t reduction) { +Tensor smooth_l1_loss(const Tensor& input, const Tensor& target, const int64_t reduction, double beta) { + TORCH_CHECK(beta >= 0, "smooth_l1_loss does not support negative values for beta.") + if (beta == 0) { + return at::native::l1_loss(input, target, reduction); + } Tensor loss; auto iter = TensorIterator::binary_op(loss, input, target); - smooth_l1_stub(iter.device_type(), iter); + smooth_l1_stub(iter.device_type(), iter, beta); return apply_loss_reduction(iter.output(), reduction); } -Tensor& smooth_l1_loss_out(Tensor& result, const Tensor& input, const Tensor& target, int64_t reduction) { +Tensor& smooth_l1_loss_out(Tensor& result, const Tensor& input, const Tensor& target, int64_t reduction, double beta) { + TORCH_CHECK(beta >= 0, "smooth_l1_loss does not support negative values for beta.") + if (beta == 0) { + return at::native::l1_loss_out(result, input, target, reduction); + } if (reduction != Reduction::None) { - result = at::smooth_l1_loss(input, target, reduction); + Tensor loss; + auto iter = TensorIterator::binary_op(loss, input, target); + smooth_l1_stub(iter.device_type(), iter, beta); + if (reduction == Reduction::Mean) { + at::mean_out(result, iter.output(), 0); + } else { + at::sum_out(result, iter.output(), 0); + } } else { auto iter = TensorIterator::binary_op(result, input, target); - smooth_l1_stub(iter.device_type(), iter); + 
smooth_l1_stub(iter.device_type(), iter, beta); } return result; } -Tensor& smooth_l1_loss_backward_out(Tensor& grad_input, const Tensor& grad_output, const Tensor& input, const Tensor& target, int64_t reduction) { +Tensor& smooth_l1_loss_backward_out(Tensor& grad_input, const Tensor& grad_output, const Tensor& input, const Tensor& target, int64_t reduction, double beta) { + if (beta <= 0) + return at::native::l1_loss_backward_out(grad_input, grad_output, input, target, reduction); auto norm = reduction == Reduction::Mean ? 1. / input.numel() : 1.; auto iter = at::TensorIteratorConfig() .add_output(grad_input) @@ -320,13 +337,15 @@ Tensor& smooth_l1_loss_backward_out(Tensor& grad_input, const Tensor& grad_outpu .add_input(target) .add_input(grad_output) .build(); - smooth_l1_backward_stub(iter.device_type(), iter, norm); + smooth_l1_backward_stub(iter.device_type(), iter, norm, beta); return grad_input; } -Tensor smooth_l1_loss_backward(const Tensor& grad_output, const Tensor& input, const Tensor& target, int64_t reduction) { +Tensor smooth_l1_loss_backward(const Tensor& grad_output, const Tensor& input, const Tensor& target, int64_t reduction, double beta) { + if (beta <= 0) + return at::native::l1_loss_backward(grad_output, input, target, reduction); auto grad_input = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); - return at::smooth_l1_loss_backward_out(grad_input, grad_output, input, target, reduction); + return at::smooth_l1_loss_backward_out(grad_input, grad_output, input, target, reduction, beta); } Tensor mse_loss(const Tensor& input, const Tensor& target, int64_t reduction) { diff --git a/aten/src/ATen/native/MaxPooling.cpp b/aten/src/ATen/native/MaxPooling.cpp index a0298ea937de..645822f55065 100644 --- a/aten/src/ATen/native/MaxPooling.cpp +++ b/aten/src/ATen/native/MaxPooling.cpp @@ -97,6 +97,10 @@ Tensor max_pool1d( IntArrayRef padding, IntArrayRef dilation, bool ceil_mode) { + if (self.is_quantized()) { + return at::quantized_max_pool1d(self, kernel_size, stride, padding, + dilation, ceil_mode); + } if (self.requires_grad() || !self.device().is_cpu()) { // Needs indices for grad and with_indices defines CUDA dispatch return std::get<0>(at::max_pool1d_with_indices( diff --git a/aten/src/ATen/native/MetaTensor.cpp b/aten/src/ATen/native/MetaTensor.cpp index 2ae5fb0f9d59..f8f0231b181c 100644 --- a/aten/src/ATen/native/MetaTensor.cpp +++ b/aten/src/ATen/native/MetaTensor.cpp @@ -25,7 +25,7 @@ Tensor empty_meta( // participate in dispatch, but so that tests like is_sparse/is_cuda // give the correct result (a CUDA meta tensor "is cuda"). 
If we don't
  // like this, remove the computeDispatchKey line
-      DispatchKeySet{DispatchKey::Meta, computeDispatchKey(options)},
+      DispatchKeySet{DispatchKey::Meta, options.computeDispatchKey()},
       dtype,
       device
   );
diff --git a/aten/src/ATen/native/PointwiseOps.h b/aten/src/ATen/native/PointwiseOps.h
index e81a89454905..98df21121ba3 100644
--- a/aten/src/ATen/native/PointwiseOps.h
+++ b/aten/src/ATen/native/PointwiseOps.h
@@ -11,10 +11,11 @@ struct TensorIterator;
 namespace native {

 using pointwise_fn = void (*)(TensorIterator&, Scalar scalar);
+using pointwise_fn_beta = void (*)(TensorIterator&, Scalar scalar, double beta);

 DECLARE_DISPATCH(pointwise_fn, addcmul_stub);
 DECLARE_DISPATCH(pointwise_fn, addcdiv_stub);
-DECLARE_DISPATCH(pointwise_fn, smooth_l1_backward_stub);
+DECLARE_DISPATCH(pointwise_fn_beta, smooth_l1_backward_stub);
 DECLARE_DISPATCH(pointwise_fn, mse_backward_stub);

 } // namespace native
diff --git a/aten/src/ATen/native/Pow.cpp b/aten/src/ATen/native/Pow.cpp
index 414c8a6f6390..c10a617a5928 100644
--- a/aten/src/ATen/native/Pow.cpp
+++ b/aten/src/ATen/native/Pow.cpp
@@ -43,7 +43,9 @@ Tensor& pow_out(Tensor& result, const Tensor& base, Scalar exp) {
 }

 Tensor& pow_out(Tensor& result, Scalar base, const Tensor& exp) {
-  if (base.toDouble() == 1.0) {
+  if (base.isComplex() && base.toComplexDouble() == 1.0) {
+    result.resize_as_(exp).fill_(1);
+  } else if (!base.isComplex() && base.toDouble() == 1.0) {
     result.resize_as_(exp).fill_(1);
   } else {
     native::pow_out(result, c10::scalar_to_tensor(base, exp.device()), exp);
diff --git a/aten/src/ATen/native/README.md b/aten/src/ATen/native/README.md
index 861901521a3b..f18114e73246 100644
--- a/aten/src/ATen/native/README.md
+++ b/aten/src/ATen/native/README.md
@@ -277,6 +277,18 @@ them the same thing!)
 If two backends have the same dispatch function, you can write `CPU, CUDA: func`
 to reuse the same function name in both cases.

+Available backend options can be found at
+https://github.com/pytorch/pytorch/blob/master/tools/codegen/gen.py#L970.
+In addition to the backends above, we also support the keyword `Math`, an alias that maps
+to all backend and autograd backend keys. In other words, a function registered to the `Math`
+key should be a plain mathematical composition of other `at::` functions and should work for any backend.
+
+If you add a `dispatch` section to any API that didn't have one before, you **have to** move
+the old implementation to the `Math` field so that it stays available for other backends to use.
+
+This work is currently in progress; you can find the design proposal at
+https://github.com/pytorch/pytorch/issues/44680.
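The `Math` alias described above is exactly the style used by the reworked `addr`/`outer` code earlier in this patch: the operator is expressed as a plain composition of other `at::` calls. As a rough Python-level sketch of that same composition (illustrative only; `addr_reference` is a made-up name, and the reshape-and-multiply step mirrors the new `at::outer` implementation):

```python
import torch

def addr_reference(input, vec1, vec2, beta=1, alpha=1):
    # Mirrors the torch.addr deprecation message in this patch:
    # alpha * outer(vec1, vec2) + beta * input, dropping the input term when beta == 0.
    outer = alpha * vec1.reshape(-1, 1) * vec2   # same reshape+mul composition as the new at::outer
    return outer if beta == 0 else outer + beta * input

v1, v2 = torch.arange(3.), torch.arange(4.)
M = torch.ones(3, 4)
print(torch.allclose(addr_reference(M, v1, v2, beta=2, alpha=3),
                     torch.addr(M, v1, v2, beta=2, alpha=3)))  # expected: True
```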
+ ### `device_guard` ``` diff --git a/aten/src/ATen/native/Sorting.cpp b/aten/src/ATen/native/Sorting.cpp index ffddddfd2ba5..7394365903ed 100644 --- a/aten/src/ATen/native/Sorting.cpp +++ b/aten/src/ATen/native/Sorting.cpp @@ -274,33 +274,6 @@ std::tuple kthvalue_out_impl_cpu( } // namespace -std::tuple kthvalue_out_cpu( - Tensor& values, - Tensor& indices, - const Tensor& self, - int64_t k, - int64_t dim, - bool keepdim) { - auto result = [&]() { - NoNamesGuard guard; - return kthvalue_out_impl_cpu(values, indices, self, k, dim, keepdim); - }(); - namedinference::propagate_names_for_reduction(values, self, dim, keepdim); - namedinference::propagate_names_for_reduction(indices, self, dim, keepdim); - return result; -} - -std::tuple kthvalue( - const Tensor& self, - int64_t k, - int64_t dim, - bool keepdim) { - Tensor values = at::empty({0}, self.options()); - Tensor indices = at::empty({0}, self.options().dtype(kLong)); - at::kthvalue_out(values, indices, self, k, dim, keepdim); - return std::make_tuple(values, indices); -} - Tensor& quantile_out( Tensor& out, const Tensor& self, @@ -395,6 +368,52 @@ Tensor nanquantile( self, at::scalar_tensor(q, self.options()), std::move(_dim), keepdim); } +std::tuple kthvalue_out_cpu( + Tensor& values, + Tensor& indices, + const Tensor& self, + int64_t k, + int64_t dim, + bool keepdim) { + auto result = [&]() { + NoNamesGuard guard; + return kthvalue_out_impl_cpu(values, indices, self, k, dim, keepdim); + }(); + namedinference::propagate_names_for_reduction(values, self, dim, keepdim); + namedinference::propagate_names_for_reduction(indices, self, dim, keepdim); + return result; +} + +std::tuple kthvalue_out( + Tensor& values, + Tensor& indices, + const Tensor& self, + int64_t k, + Dimname dim, + bool keepdim) { + return at::kthvalue_out( + values, indices, self, k, dimname_to_position(self, dim), keepdim); +} + +std::tuple kthvalue( + const Tensor& self, + int64_t k, + int64_t dim, + bool keepdim) { + Tensor values = at::empty({0}, self.options()); + Tensor indices = at::empty({0}, self.options().dtype(kLong)); + at::kthvalue_out(values, indices, self, k, dim, keepdim); + return std::make_tuple(values, indices); +} + +std::tuple kthvalue( + const Tensor& self, + int64_t k, + Dimname dim, + bool keepdim) { + return at::kthvalue(self, k, dimname_to_position(self, dim), keepdim); +} + std::tuple topk_out_cpu( Tensor& values, Tensor& indices, @@ -432,6 +451,33 @@ std::tuple topk( return std::make_tuple(values, indices); } +// this does not reduce to median with dim because we don't want to copy twice +Tensor median_cpu(const Tensor& self) { + NoNamesGuard guard; + TORCH_CHECK(self.numel() > 0, "median cannot be called with empty tensor"); + if (self.dim() == 0 && self.numel() == 1) { + return self.clone(at::MemoryFormat::Contiguous); + } + auto tmp_values = self.clone(at::MemoryFormat::Contiguous).view(-1); + auto result = at::empty({1}, self.options()); + AT_DISPATCH_ALL_TYPES(self.scalar_type(), "median", [&] { + // note, quick_select is 0 based while kthvalue is not + int64_t k = (tmp_values.size(0) - 1) / 2; + auto val_accessor = tmp_values.accessor(); + quick_select_template( + val_accessor, + k, + [](scalar_t x, scalar_t y) -> bool { + return ((_isnan(x) && !_isnan(y)) || (x > y)); + }, + [&](int64_t i, int64_t j) { + std::swap(val_accessor[i], val_accessor[j]); + }); + result.fill_(tmp_values[k]); + }); + return result.view({}); +} + std::tuple median_out( Tensor& values, Tensor& indices, @@ -444,16 +490,6 @@ std::tuple median_out( return 
std::forward_as_tuple(values, indices); } -std::tuple median( - const Tensor& self, - int64_t dim, - bool keepdim) { - Tensor values = at::empty({0}, self.options()); - Tensor indices = at::empty({0}, self.options().dtype(kLong)); - at::median_out(values, indices, self, dim, keepdim); - return std::make_tuple(values, indices); -} - std::tuple median_out( Tensor& values, Tensor& indices, @@ -466,55 +502,19 @@ std::tuple median_out( std::tuple median( const Tensor& self, - Dimname dim, - bool keepdim) { - return at::median(self, dimname_to_position(self, dim), keepdim); -} - -std::tuple kthvalue_out( - Tensor& values, - Tensor& indices, - const Tensor& self, - int64_t k, - Dimname dim, + int64_t dim, bool keepdim) { - return at::kthvalue_out( - values, indices, self, k, dimname_to_position(self, dim), keepdim); + Tensor values = at::empty({0}, self.options()); + Tensor indices = at::empty({0}, self.options().dtype(kLong)); + at::median_out(values, indices, self, dim, keepdim); + return std::make_tuple(values, indices); } -std::tuple kthvalue( +std::tuple median( const Tensor& self, - int64_t k, Dimname dim, bool keepdim) { - return at::kthvalue(self, k, dimname_to_position(self, dim), keepdim); -} - -// this does not reduce to median with dim because we don't want to copy twice -Tensor median_cpu(const Tensor& self) { - NoNamesGuard guard; - TORCH_CHECK(self.numel() > 0, "median cannot be called with empty tensor"); - if (self.dim() == 0 && self.numel() == 1) { - return self.clone(at::MemoryFormat::Contiguous); - } - auto tmp_values = self.clone(at::MemoryFormat::Contiguous).view(-1); - auto result = at::empty({1}, self.options()); - AT_DISPATCH_ALL_TYPES(self.scalar_type(), "median", [&] { - // note, quick_select is 0 based while kthvalue is not - int64_t k = (tmp_values.size(0) - 1) / 2; - auto val_accessor = tmp_values.accessor(); - quick_select_template( - val_accessor, - k, - [](scalar_t x, scalar_t y) -> bool { - return ((_isnan(x) && !_isnan(y)) || (x > y)); - }, - [&](int64_t i, int64_t j) { - std::swap(val_accessor[i], val_accessor[j]); - }); - result.fill_(tmp_values[k]); - }); - return result.view({}); + return at::median(self, dimname_to_position(self, dim), keepdim); } std::tuple sort_out_cpu( diff --git a/aten/src/ATen/native/SpectralOps.cpp b/aten/src/ATen/native/SpectralOps.cpp index e467c21a4a30..21e4d63b163b 100644 --- a/aten/src/ATen/native/SpectralOps.cpp +++ b/aten/src/ATen/native/SpectralOps.cpp @@ -203,19 +203,129 @@ Tensor fft_c2c(Tensor input, c10::optional n_opt, return out; } +// Dimensions to transform, and the signal shape in those dimensions +struct ShapeAndDims { + DimVector shape, dim; +}; + +// Pre-process n-dimensional fft's `s` and `dim` arguments. +// Wraps dimensions and applies defaulting behavior. +// Also checks transform dims are unique and transform shape is non-empty. 
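The defaulting behaviour summarised in the comment above can be restated at the Python level roughly as follows. This is a simplified, illustrative helper (made-up name, plain Python, no handling of mismatched `s`/`dim` lengths), not the ATen implementation:

```python
def canonicalize_shape_and_dim(input_sizes, s=None, dim=None):
    # Rough restatement of the s/dim defaulting rules for the n-dimensional FFTs.
    ndim = len(input_sizes)
    if dim is not None:
        dim = [d % ndim for d in dim]                      # wrap negative dims
        assert len(set(dim)) == len(dim), "FFT dims must be unique"
    if s is not None:
        if dim is None:                                    # default: the last len(s) dims
            dim = list(range(ndim - len(s), ndim))
        s = [input_sizes[d] if n == -1 else n for n, d in zip(s, dim)]
    elif dim is None:                                      # no s, no dim: transform everything
        dim = list(range(ndim))
        s = list(input_sizes)
    else:                                                  # no s, has dim: full size of each dim
        s = [input_sizes[d] for d in dim]
    assert all(n > 0 for n in s), "Invalid number of data points"
    return s, dim

print(canonicalize_shape_and_dim([2, 3, 4], s=[-1, 8]))    # expected: ([3, 8], [1, 2])
```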
+ShapeAndDims canonicalize_fft_shape_and_dim_args( + Tensor input, c10::optional shape, c10::optional dim) { + const int64_t input_dim = input.dim(); + const IntArrayRef input_sizes = input.sizes(); + ShapeAndDims ret; + + if (dim) { + ret.dim.resize(dim->size()); + std::copy(dim->begin(), dim->end(), ret.dim.begin()); + maybe_wrap_dims(ret.dim, input_dim); + + // Check dims are unique + DimVector copy = ret.dim; + std::sort(copy.begin(), copy.end()); + auto duplicate = std::adjacent_find(copy.begin(), copy.end()); + TORCH_CHECK(duplicate == copy.end(), "FFT dims must be unique"); + } + + if (shape) { + // Has shape, may have dim + TORCH_CHECK(!dim || dim->size() == shape->size(), + "When given, dim and shape arguments must have the same length"); + TORCH_CHECK(shape->size() <= input_dim, + "Got shape with ", shape->size(), " values but input tensor " + "only has ", input_dim, " dimensions."); + const int64_t transform_ndim = shape->size(); + // If shape is given, dims defaults to the last shape.size() dimensions + if (!dim) { + ret.dim.resize(transform_ndim); + std::iota(ret.dim.begin(), ret.dim.end(), input_dim - transform_ndim); + } + + // Translate shape of -1 to the default length + ret.shape.resize(transform_ndim); + for (int64_t i = 0; i < transform_ndim; ++i) { + const auto n = (*shape)[i]; + ret.shape[i] = n == -1 ? input_sizes[ret.dim[i]] : n; + } + } else if (!dim) { + // No shape, no dim + ret.dim.resize(input_dim); + std::iota(ret.dim.begin(), ret.dim.end(), int64_t{0}); + ret.shape.resize(input_dim); + std::copy(input_sizes.begin(), input_sizes.end(), ret.shape.begin()); + } else { + // No shape, has dim + ret.shape.resize(ret.dim.size()); + for (int64_t i = 0; i < ret.dim.size(); ++i) { + ret.shape[i] = input_sizes[ret.dim[i]]; + } + } + + for (int64_t i = 0; i < ret.shape.size(); ++i) { + TORCH_CHECK(ret.shape[i] > 0, + "Invalid number of data points (", ret.shape[i], ") specified"); + } + + return ret; +} + +// Complex to complex n-dimensional fft +Tensor fftn_c2c( + const Tensor& input, IntArrayRef shape, IntArrayRef dim, + c10::optional norm_str, bool forward) { + TORCH_CHECK(input.is_complex(), "Expected a complex input tensor to FFT"); + const auto input_dim = input.dim(); + + Tensor x = resize_fft_input(input, dim, shape); + x = at::view_as_real(x); + + const int64_t transform_ndim = dim.size(); + const auto norm = norm_from_string(norm_str, forward); + // _fft_with_size only supports 3 dimensions being transformed at a time. + // This limit is inherited from cuFFT. + constexpr int64_t max_signal_ndim = 3; + + // Transform n dimensions, up to 3 at a time + // TODO: rewrite _fft_with_size to transform more than 3 dimensions at once. 
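To make the three-at-a-time chunking concrete, here is a tiny plain-Python illustration (made-up dims) of how transform dimensions are grouped and mapped to the trailing positions, mirroring the `source_dim`/`dest_dim` bookkeeping in the loop that follows:

```python
# cuFFT-inherited limit: _fft_with_size transforms at most 3 dims per call.
input_ndim = 6
dims = [0, 2, 3, 5]                 # hypothetical dims to transform
MAX_SIGNAL_NDIM = 3

for i in range(0, len(dims), MAX_SIGNAL_NDIM):
    group = dims[i:i + MAX_SIGNAL_NDIM]
    dest = list(range(input_ndim - len(group), input_ndim))
    print(group, "->", dest)        # each group is moved to the end, transformed, moved back
# [0, 2, 3] -> [3, 4, 5]
# [5] -> [5]
```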
+ for (int64_t i = 0; i < transform_ndim; i += max_signal_ndim) { + const int64_t signal_ndim = std::min(transform_ndim - i, max_signal_ndim); + DimVector source_dim(signal_ndim); + DimVector dest_dim(signal_ndim); + + for (int64_t j = 0; j < signal_ndim; ++j) { + source_dim[j] = dim[i + j]; + dest_dim[j] = j + (input_dim - signal_ndim); + } + + // _fft operates on up-to the last 3 dims, so move selected dims to the end + x = at::movedim(x, source_dim, dest_dim); + + x = _fft(x, signal_ndim, /*complex_input=*/true, /*complex_output=*/true, + /*inverse=*/!forward, /*signal_sizes=*/{}, /*normalization=*/norm, + /*onesided=*/false); + + // Move transform dims back to their original order + x = at::movedim(x, dest_dim, source_dim); + } + + return at::view_as_complex(x); +} + } // torch.fft.fft, analogous to NumPy's numpy.fft.fft Tensor fft_fft(const Tensor& self, c10::optional n, int64_t dim, c10::optional norm) { - return self.is_complex() ? + return self.is_complex() ? fft_c2c(self, n, dim, norm, /*forward=*/true) : fft_r2c(self, n, dim, norm, /*forward=*/true, /*onesided=*/false); } Tensor fft_ifft(const Tensor& self, c10::optional n, int64_t dim, c10::optional norm) { - return self.is_complex() ? + return self.is_complex() ? fft_c2c(self, n, dim, norm, /*forward=*/false) : fft_r2c(self, n, dim, norm, /*forward=*/false, /*onesided=*/false); } @@ -240,6 +350,128 @@ Tensor fft_ihfft(const Tensor& self, c10::optional n, int64_t dim, return fft_r2c(self, n, dim, norm, /*forward=*/false, /*onesided=*/true); } +Tensor fft_fftn(const Tensor& self, c10::optional s, + c10::optional dim, + c10::optional norm) { + auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim); + // TODO: For real input, perform rfftn then mirror with conjugate symmetry + Tensor input = promote_tensor_fft(self, /*require_complex=*/true); + return fftn_c2c(input, desc.shape, desc.dim, norm, /*forward=*/true); +} + +Tensor fft_ifftn(const Tensor& self, c10::optional s, + c10::optional dim, + c10::optional norm) { + auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim); + Tensor input = promote_tensor_fft(self, /*require_complex=*/true); + return fftn_c2c(input, desc.shape, desc.dim, norm, /*forward=*/false); +} + +Tensor fft_rfftn(const Tensor& self, c10::optional s, + c10::optional dim, + c10::optional norm) { + auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim); + TORCH_CHECK(desc.shape.size() > 0, "rfftn must transform at least one axis"); + + const auto last_dim = desc.dim.back(); + const auto last_shape = desc.shape.back(); + desc.shape.pop_back(); + desc.dim.pop_back(); + + // rfft on last dim to get hermitian complex shape + auto x = native::fft_rfft(self, last_shape, last_dim, norm); + // Normal fft on remaining dims + return fftn_c2c(x, desc.shape, desc.dim, norm, /*forward=*/true); +} + +Tensor fft_irfftn(const Tensor& self, c10::optional s, + c10::optional dim, + c10::optional norm) { + auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim); + TORCH_CHECK(desc.shape.size() > 0, "irfftn must transform at least one axis"); + + const auto last_dim = desc.dim.back(); + const auto last_shape = [&]() -> c10::optional { + // If shape is defaulted in the last dimension, + // pass nullopt to irfft and let it calculate the default size + if (!s.has_value() || (s->back() == -1)) { + return c10::nullopt; + } + return desc.shape.back(); + }(); + desc.shape.pop_back(); + desc.dim.pop_back(); + + // Normal ifft for all but last dim + Tensor x = promote_tensor_fft(self, /*require_complex=*/true); + x 
= fftn_c2c(x, desc.shape, desc.dim, norm, /*forward=*/false); + // Then 1d irfft on last dim to get real output + return native::fft_irfft(x, last_shape, last_dim, norm); +} + +Tensor fft_fftfreq(int64_t n, double d, const TensorOptions& options) { + ScalarType dtype = typeMetaToScalarType(options.dtype()); + TORCH_CHECK(at::isFloatingType(dtype) || at::isComplexType(dtype), + "fftfreq requires a floating point or complex dtype"); + // TODO: arange doesn't have complex support + Tensor result = native::arange(n, options); + auto right_slice = result.slice(0, (n + 1) / 2, 0); + at::arange_out(right_slice, -(n/2), 0, 1); + result.mul_(1.0 / (n * d)); // Slightly faster than div_(n*d) + return result; +} + +Tensor fft_rfftfreq(int64_t n, double d, const TensorOptions& options) { + ScalarType dtype = typeMetaToScalarType(options.dtype()); + TORCH_CHECK(at::isFloatingType(dtype) || at::isComplexType(dtype), + "rfftfreq requires a floating point or complex dtype"); + // TODO: arange doesn't have complex support + Tensor result = native::arange(n/2 + 1, options); + result.mul_(1.0 / (n * d)); // Slightly faster than div_(n*d) + return result; +} + +// If an array dim is specified, wraps them according to self.dim(). +// Otherwise returns a vector of all dims. +DimVector default_alldims(const Tensor& self, c10::optional dim_opt) { + DimVector dim; + if (dim_opt) { + IntArrayRef dim_unwrapped = *dim_opt; + dim.resize(dim_unwrapped.size()); + for (int64_t i = 0; i < dim.size(); ++i) { + dim[i] = maybe_wrap_dim(dim_unwrapped[i], self.dim()); + } + } else { + dim.resize(self.dim()); + std::iota(dim.begin(), dim.end(), 0); + } + return dim; +} + +Tensor fft_fftshift(const Tensor& x, c10::optional dim_opt) { + auto dim = default_alldims(x, dim_opt); + + IntArrayRef x_sizes = x.sizes(); + DimVector shift(dim.size()); + for (int64_t i = 0; i < dim.size(); ++i) { + shift[i] = x_sizes[dim[i]] / 2; + } + + return at::roll(x, shift, dim); +} + +Tensor fft_ifftshift(const Tensor& x, c10::optional dim_opt) { + auto dim = default_alldims(x, dim_opt); + + IntArrayRef x_sizes = x.sizes(); + DimVector shift(dim.size()); + for (int64_t i = 0; i < dim.size(); ++i) { + shift[i] = (x_sizes[dim[i]] + 1) / 2; + } + + return at::roll(x, shift, dim); +} + // This is a pass-through wrapper function that does the size check and // inferences. The actual forward implementation function is called @@ -393,6 +625,10 @@ void _cufft_clear_plan_cache(int64_t device_index) { } Tensor fft(const Tensor& self, const int64_t signal_ndim, const bool normalized) { + TORCH_WARN_ONCE( + "The function torch.fft is deprecated and will be removed in PyTorch 1.8. " + "Use the new torch.fft module functions, instead, by importing torch.fft " + "and calling torch.fft.fft or torch.fft.fftn."); return _fft(self, signal_ndim, /* complex_input */ true, /* complex_output */ true, /* inverse */ false, {}, normalized ? fft_norm_mode::by_root_n : fft_norm_mode::none, @@ -400,6 +636,10 @@ Tensor fft(const Tensor& self, const int64_t signal_ndim, const bool normalized) } Tensor ifft(const Tensor& self, const int64_t signal_ndim, const bool normalized) { + TORCH_WARN_ONCE( + "The function torch.ifft is deprecated and will be removed in a future " + "PyTorch release. Use the new torch.fft module functions, instead, by " + "importing torch.fft and calling torch.fft.ifft or torch.fft.ifftn."); return _fft(self, signal_ndim, /* complex_input */ true, /* complex_output */ true, /* inverse */ true, {}, normalized ? 
fft_norm_mode::by_root_n : fft_norm_mode::by_n, @@ -408,6 +648,10 @@ Tensor ifft(const Tensor& self, const int64_t signal_ndim, const bool normalized Tensor rfft(const Tensor& self, const int64_t signal_ndim, const bool normalized, const bool onesided) { + TORCH_WARN_ONCE( + "The function torch.rfft is deprecated and will be removed in a future " + "PyTorch release. Use the new torch.fft module functions, instead, by " + "importing torch.fft and calling torch.fft.fft or torch.fft.rfft."); return _fft(self, signal_ndim, /* complex_input */ false, /* complex_output */ true, /* inverse */ false, {}, normalized ? fft_norm_mode::by_root_n : fft_norm_mode::none, @@ -416,6 +660,10 @@ Tensor rfft(const Tensor& self, const int64_t signal_ndim, const bool normalized Tensor irfft(const Tensor& self, const int64_t signal_ndim, const bool normalized, const bool onesided, IntArrayRef signal_sizes) { + TORCH_WARN_ONCE( + "The function torch.irfft is deprecated and will be removed in a future " + "PyTorch release. Use the new torch.fft module functions, instead, by " + "importing torch.fft and calling torch.fft.ifft or torch.fft.irfft."); return _fft(self, signal_ndim, /* complex_input */ true, /* complex_output */ false, /* inverse */ true, signal_sizes, normalized ? fft_norm_mode::by_root_n : fft_norm_mode::by_n, @@ -463,8 +711,10 @@ Tensor stft(const Tensor& self, const int64_t n_fft, const optional hop const bool return_complex = return_complexOpt.value_or( self.is_complex() || (window.defined() && window.is_complex())); if (!return_complexOpt && !return_complex) { - TORCH_WARN("stft will return complex tensors by default in future, use" - " return_complex=False to preserve the current output format."); + TORCH_WARN_ONCE("stft will require the return_complex parameter be explicitly " + " specified in a future PyTorch release. Use return_complex=False " + " to preserve the current behavior or return_complex=True to return " + " a complex output."); } if (!at::isFloatingType(self.scalar_type()) && !at::isComplexType(self.scalar_type())) { diff --git a/aten/src/ATen/native/TensorAdvancedIndexing.cpp b/aten/src/ATen/native/TensorAdvancedIndexing.cpp index ad6625308ff5..bc58ba8e6eec 100644 --- a/aten/src/ATen/native/TensorAdvancedIndexing.cpp +++ b/aten/src/ATen/native/TensorAdvancedIndexing.cpp @@ -135,6 +135,26 @@ static Tensor reshape_indexer(const Tensor& index, int64_t dims_before, int64_t return index.reshape(shape); } +static ptrdiff_t dataOffset(const Tensor& tensor, ptrdiff_t linearIndex) { + auto size = tensor.sizes(); + auto stride = tensor.strides(); + int nDim = tensor.dim(); + ptrdiff_t dataOffset = 0; + for (int i = nDim - 1; i >= 0; i--) { + dataOffset += (linearIndex % size[i]) * stride[i]; + linearIndex /= size[i]; + } + return dataOffset; +} + +static inline int64_t wrapLinearIndex(int64_t linearIndex, int64_t numel) { + return linearIndex < 0 ? 
linearIndex + numel : linearIndex; +} + +static inline void checkLinearIndex(int64_t linearIndex, int64_t numel) { + TORCH_CHECK(linearIndex < numel && linearIndex >= -numel, "out of range: ", linearIndex, " out of ", numel); +} + AdvancedIndex::AdvancedIndex(const Tensor& src, TensorList indices_list) { int64_t element_size_bytes = src.element_size(); @@ -815,6 +835,77 @@ Tensor masked_select_backward(const Tensor& grad, const Tensor& input, const Ten return result.masked_scatter_(mask, grad); } +void take_out_cpu_template( + Tensor& output, + Tensor const& input, + Tensor const& index) +{ + TORCH_CHECK(output.device().type() == at::kCPU, "device type of output (", output.device().type(), ") is not CPU"); + TORCH_CHECK(input.device().type() == at::kCPU, "device type of input (", input.device().type(), ") is not CPU"); + TORCH_CHECK(index.device().type() == at::kCPU, "device type of index (", index.device().type(), ") is not CPU"); + + TORCH_CHECK(output.layout() == Layout::Strided, "take() only supports strided layout, got layout: ", + output.layout(), " on output tensor"); + TORCH_CHECK(input.layout() == Layout::Strided, "take() only supports strided layout, got layout: ", + input.layout(), " on input tensor"); + TORCH_CHECK(index.layout() == Layout::Strided, "take() only supports strided layout, got layout: ", + index.layout(), " on index tensor"); + + TORCH_CHECK(output.scalar_type() == input.scalar_type(), "output and input scalar type must match.", + "But got different types: ", output.scalar_type(), " and ", input.scalar_type()); + TORCH_CHECK(index.scalar_type() == kLong, "index must be an int64 tensor"); + + output.resize_(index.sizes()); + auto output_contiguous = output.contiguous(); + auto index_continuous = index.contiguous(); + bool is_contiguous = input.is_contiguous(); + auto input_size = input.numel(); + + AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Bool, at::ScalarType::Half, input.scalar_type(), "take_cpu", [&] { + auto output_data = output_contiguous.data_ptr(); + auto input_data = input.data_ptr(); + auto index_data = index.data_ptr(); + + // Exceptions must not be thrown across parallel sections, so we + // record the position of the invalid index and throw the exception after the + // loop. 
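For orientation, the user-visible semantics this CPU path implements (linear indexing into the flattened input, with negative indices wrapped by `wrapLinearIndex` and out-of-range values rejected by `checkLinearIndex` only after the parallel loop has finished) look roughly like this at the Python level. A small illustrative example, not part of the patch:

```python
import torch

x = torch.arange(12.).reshape(3, 4)
idx = torch.tensor([0, 5, -1])          # linear indices into the flattened input; -1 wraps to 11
print(torch.take(x, idx))               # expected: tensor([ 0.,  5., 11.])

try:
    torch.take(x, torch.tensor([12]))   # out of range for a 12-element input
except (IndexError, RuntimeError) as err:
    print("rejected:", err)
```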
+ std::atomic invalidIdxPos(-1); + + at::parallel_for(0, index.numel(), at::internal::GRAIN_SIZE, + [&](int64_t start, int64_t end) { + for (auto i = start; i < end; i++) { + int64_t idx = index_data[i]; + if (idx < input_size && idx >= -input_size) { + idx = wrapLinearIndex(idx, input_size); + if (is_contiguous) { + output_data[i] = input_data[idx]; + } else { + output_data[i] = input_data[dataOffset(input, idx)]; + } + } else { + int64_t tmp = -1; + invalidIdxPos.compare_exchange_strong(tmp, i); + } + } + }); + + if (invalidIdxPos >= 0) { + checkLinearIndex(index_data[invalidIdxPos], input_size); + } + }); +} + +Tensor take_cpu(const Tensor& self, const Tensor& index) { + auto output = at::empty(index.sizes(), self.options()); + take_out_cpu_template(output, self, index); + return output; +} + +Tensor& take_out_cpu(Tensor& out, const Tensor& self, const Tensor& index) { + take_out_cpu_template(out, self, index); + return out; +} + Tensor take_backward(const Tensor& grad, const Tensor& input, const Tensor& index) { return at::zeros_like(input).put_(index, grad, true); } diff --git a/aten/src/ATen/native/UnaryOps.cpp b/aten/src/ATen/native/UnaryOps.cpp index 2764490f6d48..e2b5639f8dc9 100644 --- a/aten/src/ATen/native/UnaryOps.cpp +++ b/aten/src/ATen/native/UnaryOps.cpp @@ -162,7 +162,10 @@ Tensor& abs_out(Tensor& result, const Tensor& self) { Tensor abs(const Tensor& self) { return unary_op_impl_with_complex_to_float(self, at::abs_out); } -Tensor& abs_(Tensor& self) { return unary_op_impl_(self, at::abs_out); } +Tensor& abs_(Tensor& self) { + TORCH_CHECK(!self.is_complex(), "In-place abs is not supported for complex tensors."); + return unary_op_impl_(self, at::abs_out); +} // Absolute, alias for abs Tensor& absolute_out(Tensor& result, const Tensor& self) { @@ -301,6 +304,17 @@ Tensor& sign_out(Tensor& result, const Tensor& self) { return unary_op_impl_out( Tensor sign(const Tensor& self) { return unary_op_impl(self, at::sign_out); } Tensor& sign_(Tensor& self) { return unary_op_impl_(self, at::sign_out); } +Tensor& sgn_out(Tensor& result, const Tensor& self) { + if (self.is_complex()) { + return unary_op_impl_out(result, self, sgn_stub); + } else { + return unary_op_impl_out(result, self, sign_stub); + } +} + +Tensor sgn(const Tensor& self) { return unary_op_impl(self, at::sgn_out); } +Tensor& sgn_(Tensor& self) { return unary_op_impl_(self, at::sgn_out); } + Tensor& sin_out(Tensor& result, const Tensor& self) { return unary_op_impl_out(result, self, sin_stub); } Tensor sin(const Tensor& self) { return unary_op_impl(self, at::sin_out); } Tensor& sin_(Tensor& self) { return unary_op_impl_(self, at::sin_out); } @@ -373,6 +387,41 @@ Tensor& logit_(Tensor& self, c10::optional eps) { return at::logit_out(self, self, eps); } +Tensor& nan_to_num_out( + Tensor& result, + const Tensor& self, + c10::optional nan, + c10::optional pos_inf, + c10::optional neg_inf) { + + if (c10::isIntegralType(self.scalar_type())) { + result.resize_as_(self); + result.copy_(self); + return result; + } + + auto iter = TensorIterator::unary_op(result, self); + nan_to_num_stub(iter.device_type(), iter, nan, pos_inf, neg_inf); + return result; +} + +Tensor nan_to_num( + const Tensor& self, + c10::optional nan, + c10::optional pos_inf, + c10::optional neg_inf) { + auto result = at::empty_like(self); + return at::nan_to_num_out(result, self, nan, pos_inf, neg_inf); +} + +Tensor& nan_to_num_( + Tensor& self, + c10::optional nan, + c10::optional pos_inf, + c10::optional neg_inf) { + return at::nan_to_num_out(self, self, 
nan, pos_inf, neg_inf); +} + Tensor& tanh_out(Tensor& result, const Tensor& self) { return unary_op_impl_out(result, self, tanh_stub); } Tensor tanh(const Tensor& self) { return unary_op_impl(self, at::tanh_out); } Tensor& tanh_(Tensor& self) { return unary_op_impl_(self, at::tanh_out); } @@ -405,9 +454,9 @@ Tensor& neg_out(Tensor& result, const Tensor& self) { Tensor neg(const Tensor& self) { return unary_op_impl(self, at::neg_out); } Tensor& neg_(Tensor& self) { return unary_op_impl_(self, at::neg_out); } -Tensor& negative_out(Tensor& result, const Tensor& self) { return at::native::neg_out(result, self); } -Tensor negative(const Tensor& self) { return at::native::neg(self); } -Tensor& negative_(Tensor& self) { return at::native::neg_(self); } +Tensor& negative_out(Tensor& result, const Tensor& self) { return at::neg_out(result, self); } +Tensor negative(const Tensor& self) { return self.neg(); } +Tensor& negative_(Tensor& self) { return self.neg_(); } Tensor logical_not(const Tensor& self) { Tensor result = at::empty({0}, self.options().dtype(kBool)); @@ -631,6 +680,7 @@ DEFINE_DISPATCH(log1p_stub); DEFINE_DISPATCH(log2_stub); DEFINE_DISPATCH(logical_not_stub); DEFINE_DISPATCH(neg_stub); +DEFINE_DISPATCH(nan_to_num_stub); DEFINE_DISPATCH(polygamma_stub); DEFINE_DISPATCH(reciprocal_stub); DEFINE_DISPATCH(round_stub); @@ -639,6 +689,7 @@ DEFINE_DISPATCH(sigmoid_stub); DEFINE_DISPATCH(logit_stub); DEFINE_DISPATCH(sign_stub); DEFINE_DISPATCH(signbit_stub); +DEFINE_DISPATCH(sgn_stub); DEFINE_DISPATCH(sin_stub); DEFINE_DISPATCH(sinh_stub); DEFINE_DISPATCH(sqrt_stub); diff --git a/aten/src/ATen/native/UnaryOps.h b/aten/src/ATen/native/UnaryOps.h index fa172cb58b38..a6db47f17153 100644 --- a/aten/src/ATen/native/UnaryOps.h +++ b/aten/src/ATen/native/UnaryOps.h @@ -53,6 +53,7 @@ DECLARE_DISPATCH(unary_fn, sigmoid_stub); DECLARE_DISPATCH(unary_fn_with_scalar, logit_stub); DECLARE_DISPATCH(unary_fn, sign_stub); DECLARE_DISPATCH(unary_fn, signbit_stub); +DECLARE_DISPATCH(unary_fn, sgn_stub); DECLARE_DISPATCH(unary_fn, sin_stub); DECLARE_DISPATCH(unary_fn, sinh_stub); DECLARE_DISPATCH(unary_fn, sqrt_stub); @@ -76,6 +77,13 @@ DECLARE_DISPATCH(void(*)(TensorIterator&, c10::optional), random_stub DECLARE_DISPATCH(void(*)(TensorIterator&, const int64_t), polygamma_stub); DECLARE_DISPATCH(void(*)(TensorIterator&, Scalar a, Scalar b), clamp_stub); DECLARE_DISPATCH(void(*)(Tensor&, const Tensor&, int64_t, bool, c10::optional), multinomial_stub); +DECLARE_DISPATCH( + void (*)( + TensorIterator&, + c10::optional, + c10::optional, + c10::optional), + nan_to_num_stub); // Missing unary functions // digamma diff --git a/aten/src/ATen/native/VariableMethodStubs.cpp b/aten/src/ATen/native/VariableMethodStubs.cpp index d06c27f69e3d..7d5cea725cf1 100644 --- a/aten/src/ATen/native/VariableMethodStubs.cpp +++ b/aten/src/ATen/native/VariableMethodStubs.cpp @@ -12,7 +12,7 @@ void backward(const Tensor& self, const Tensor& gradient, c10::optional ke AT_ERROR("backward is not implemented for Tensor"); } -void set_data(const Tensor& self, const Tensor& new_data) { +void set_data(Tensor& self, const Tensor& new_data) { AT_ERROR("set_data is not implemented for Tensor"); } @@ -36,7 +36,7 @@ Tensor& requires_grad_(Tensor& self, bool _requires_grad) { AT_ERROR("requires_grad_ is not implemented for Tensor"); } -void retain_grad(const Tensor& self) { +void retain_grad(Tensor& self) { AT_ERROR("retain_grad is not implemented for Tensor"); } diff --git a/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp 
b/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp index 09847a010ee3..fce8c348919b 100644 --- a/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp @@ -237,14 +237,14 @@ void logical_and_kernel(TensorIterator& iter) { // We use if-else here specifically for bool instead of using iter.common_dtype() like the CUDA implementation because // common_dtype() is unavailable for bfloat16. if (iter.dtype() == ScalarType::Bool) { - AT_DISPATCH_ALL_TYPES_AND3(kBool, kBFloat16, kHalf, iter.input_dtype(), "logical_and_cpu", [&]() { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kBFloat16, kHalf, iter.input_dtype(), "logical_and_cpu", [&]() { cpu_kernel(iter, [](scalar_t a, scalar_t b) -> bool { return a && b; }); }); } else { - AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, kHalf, iter.dtype(), "logical_and_cpu", [&]() { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kHalf, iter.dtype(), "logical_and_cpu", [&]() { cpu_kernel(iter, [](scalar_t a, scalar_t b) -> scalar_t { return static_cast(a && b); @@ -257,14 +257,14 @@ void logical_or_kernel(TensorIterator& iter) { // We use if-else here specifically for bool instead of using iter.common_dtype() like the CUDA implementation because // common_dtype() is unavailable for bfloat16. if (iter.dtype() == ScalarType::Bool) { - AT_DISPATCH_ALL_TYPES_AND3(kBool, kBFloat16, kHalf, iter.input_dtype(), "logical_or_cpu", [&]() { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kBFloat16, kHalf, iter.input_dtype(), "logical_or_cpu", [&]() { cpu_kernel(iter, [](scalar_t a, scalar_t b) -> bool { return a || b; }); }); } else { - AT_DISPATCH_ALL_TYPES_AND3(kBool, kBFloat16, kHalf, iter.dtype(), "logical_or_cpu", [&]() { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kBFloat16, kHalf, iter.dtype(), "logical_or_cpu", [&]() { cpu_kernel(iter, [](scalar_t a, scalar_t b) -> scalar_t { return static_cast(a || b); @@ -277,14 +277,14 @@ void logical_xor_kernel(TensorIterator& iter) { // We use if-else here specifically for bool instead of using iter.common_dtype() like the CUDA implementation because // common_dtype() is unavailable for bfloat16. if (iter.dtype() == ScalarType::Bool) { - AT_DISPATCH_ALL_TYPES_AND3(kBool, kBFloat16, kHalf, iter.input_dtype(), "logical_xor_cpu", [&]() { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kBFloat16, kHalf, iter.input_dtype(), "logical_xor_cpu", [&]() { cpu_kernel(iter, [](scalar_t a, scalar_t b) -> bool { return bool(a) != bool(b); }); }); } else { - AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, kHalf, iter.dtype(), "logical_xor_cpu", [&]() { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kHalf, iter.dtype(), "logical_xor_cpu", [&]() { cpu_kernel(iter, [](scalar_t a, scalar_t b) -> scalar_t { return static_cast(bool(a) != bool(b)); @@ -502,24 +502,25 @@ void minimum_kernel(TensorIterator& iter) { } } -void smooth_l1_kernel(TensorIterator& iter) { +void smooth_l1_kernel(TensorIterator& iter, double beta) { AT_DISPATCH_FLOATING_TYPES_AND2( kBFloat16, kHalf, iter.dtype(), "smooth_l1_cpu", [&]() { using Vec = Vec256; - const Vec one_vec(static_cast(1)); + const scalar_t beta_val(beta); + const Vec beta_val_vec(beta_val); const Vec point_five_vec(static_cast(0.5)); cpu_kernel_vec( iter, - [](scalar_t a, scalar_t b) -> scalar_t { + [&beta_val](scalar_t a, scalar_t b) -> scalar_t { auto z = std::abs(a - b); - return z < static_cast(1) - ? static_cast(0.5) * z * z - : z - static_cast(0.5); + return z < beta_val + ? 
static_cast(0.5) * z * z / beta_val + : z - static_cast(0.5) * beta_val; }, - [&one_vec, &point_five_vec](Vec a, Vec b) { + [&beta_val_vec, &point_five_vec](Vec a, Vec b) { auto z = (a - b).abs(); return Vec::blendv( - point_five_vec * z * z, z - point_five_vec, z >= one_vec); + point_five_vec * z * z / beta_val_vec, z - point_five_vec * beta_val_vec, z >= beta_val_vec); }); }); } diff --git a/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp b/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp index 114ca93dae26..34911a2975e4 100644 --- a/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp @@ -104,7 +104,11 @@ struct Dist { // Special general pnorm derivative if p is less than two struct lttdist_calc { - static inline Vec backward(const Vec& diff, const scalar_t grad, const scalar_t dist, const Vec& p) { return dist == 0.0 ? Vec(0) : sign(diff) * diff.abs().pow(p - Vec(1)) * Vec(grad) / Vec(dist).pow(p - Vec(1)); } + static inline Vec backward(const Vec& diff, const scalar_t grad, const scalar_t dist, const Vec& p) { + Vec result = (dist == 0.0) ? Vec(0) : (sign(diff) * diff.abs().pow(p - Vec(1)) * Vec(grad) / Vec(dist).pow(p - Vec(1))); + result = Vec::blendv(result, Vec(0), (diff == Vec(0)) & (p < Vec(1))); + return result; + } }; // Two norm diff --git a/aten/src/ATen/native/cpu/PointwiseOpsKernel.cpp b/aten/src/ATen/native/cpu/PointwiseOpsKernel.cpp index 45c803e0fec2..4a52178972fc 100644 --- a/aten/src/ATen/native/cpu/PointwiseOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/PointwiseOpsKernel.cpp @@ -46,28 +46,39 @@ static void addcdiv_cpu_kernel(TensorIterator& iter, Scalar value) { }); } -static void smooth_l1_backward_cpu_kernel(TensorIterator& iter, Scalar norm) { +static void smooth_l1_backward_cpu_kernel(TensorIterator& iter, Scalar norm, double beta) { ScalarType dtype = iter.dtype(0); AT_DISPATCH_ALL_TYPES(dtype, "smooth_l1_backward_cpu_out", [&] { auto norm_val = norm.to(); + scalar_t beta_val(beta); auto norm_val_vec = Vec256(norm_val); + auto beta_val_vec = Vec256(beta_val); const auto neg_1_vec = Vec256(-1); + const auto zero_vec = Vec256(0); const auto pos_1_vec = Vec256(1); cpu_kernel_vec(iter, [=](scalar_t input, scalar_t target, scalar_t grad_output) -> scalar_t { const auto x = input - target; - if (x < -1.) + if (x <= -beta) return -norm_val * grad_output; - else if (x > 1.) 
+ else if (x >= beta) return norm_val * grad_output; else - return norm_val * x * grad_output; + return norm_val * x * grad_output / beta; }, - [norm_val_vec, neg_1_vec, pos_1_vec]( + [norm_val_vec, beta_val_vec, neg_1_vec, zero_vec, pos_1_vec]( Vec256 input, Vec256 target, Vec256 grad_output) -> Vec256 { - auto x = input - target; - x = clamp(x, neg_1_vec, pos_1_vec); - return norm_val_vec * x * grad_output; + // using two blendv calls to simulate the 3 cases + // 1 if x >= beta + // -1 if x <= -beta + // x / beta if |x| < beta + const auto x = input - target; + const auto pos_or_neg_1_vec = Vec256::blendv( + neg_1_vec, pos_1_vec, x > zero_vec); + const auto x_abs = x.abs(); + const auto output = Vec256::blendv( + x / beta_val_vec, pos_or_neg_1_vec, x_abs >= beta_val_vec); + return norm_val_vec * output * grad_output; } ); }); diff --git a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp index adf300522692..84c3ceed3a23 100644 --- a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp @@ -213,11 +213,14 @@ static void bitwise_not_kernel(TensorIterator& iter) { }); } else { AT_DISPATCH_INTEGRAL_TYPES(iter.dtype(), "bitwise_not_cpu", [&]() { - cpu_kernel( + cpu_kernel_vec( iter, [](scalar_t a) -> scalar_t { return ~a; - }); + }, + [](Vec256 a) -> Vec256 { + return ~a; + }); }); } } @@ -235,9 +238,9 @@ static void logical_not_kernel(TensorIterator& iter) { // NOTE: this implementation differs from the CUDA implementation which only does single dispatch // (to avoid expensive compilation) because CPU kernels don't handle dynamic_casting // (see needs_dynamic_casting). - AT_DISPATCH_ALL_TYPES_AND2(kBool, kHalf, iter.dtype(1), "logical_not_cpu", [&]() { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kHalf, kBFloat16, iter.dtype(1), "logical_not_cpu", [&]() { using self_t = scalar_t; - AT_DISPATCH_ALL_TYPES_AND2(kBool, kHalf, iter.dtype(0), "logical_not_cpu", [&]() { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kHalf, kBFloat16, iter.dtype(0), "logical_not_cpu", [&]() { cpu_kernel(iter, [](self_t a) -> scalar_t { return static_cast(!a); }); }); }); @@ -270,16 +273,16 @@ static void sign_kernel(TensorIterator& iter){ auto one_vec = Vec256(static_cast(1)); cpu_kernel_vec( - iter, - [=](scalar_t a) -> scalar_t { return (0 < a) - (a < 0); }, - [=](Vec256 self_vec){ + iter, + [=](scalar_t a) -> scalar_t { return (0 < a) - (a < 0); }, + [=](Vec256 self_vec){ - // Comparision operators returns bitmask. - auto left = Vec256::blendv(zero_vec, one_vec, zero_vec < self_vec); - auto right = Vec256::blendv(zero_vec, one_vec, self_vec < zero_vec); + // Comparision operators returns bitmask. 
+ auto left = Vec256::blendv(zero_vec, one_vec, zero_vec < self_vec); + auto right = Vec256::blendv(zero_vec, one_vec, self_vec < zero_vec); - return left - right; - }); + return left - right; + }); }); } } @@ -290,6 +293,15 @@ static void signbit_kernel(TensorIterator& iter){ }); } +static void sgn_kernel(TensorIterator& iter){ + AT_DISPATCH_COMPLEX_TYPES(iter.dtype(), 'sgn_cpu', [&]() { + cpu_kernel_vec( + iter, + [=](scalar_t a) -> scalar_t { return sgn_impl(a); }, + [=](Vec256 a) { return a.sgn(); }); + }); +} + static void sinh_kernel(TensorIterator& iter) { AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(iter.dtype(), "sinh_cpu", [&]() { cpu_kernel_vec( @@ -371,6 +383,33 @@ static void polygamma_kernel(TensorIterator& iter, int64_t n) { } } +static void nan_to_num_kernel( + TensorIterator& iter, + c10::optional nan, + c10::optional pos_inf, + c10::optional neg_inf) { + AT_DISPATCH_FLOATING_TYPES_AND(kHalf, iter.dtype(), "nan_to_num", [&]() { + scalar_t nan_replacement = static_cast(nan.value_or(0.)); + scalar_t pos_inf_replacement = pos_inf.has_value() + ? static_cast(pos_inf.value()) + : std::numeric_limits::max(); + scalar_t neg_inf_replacement = neg_inf.has_value() + ? static_cast(neg_inf.value()) + : std::numeric_limits::lowest(); + + cpu_kernel(iter, [=](scalar_t a) -> scalar_t { + return ( + at::_isnan(a) + ? nan_replacement + : (a == std::numeric_limits::infinity() + ? pos_inf_replacement + : (a == -std::numeric_limits::infinity() + ? neg_inf_replacement + : a))); + }); + }); +} + static void clamp_kernel(TensorIterator& iter, Scalar min_scalar, Scalar max_scalar) { AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(kBFloat16, iter.dtype(), "clamp_cpu", [&]() { c10::scalar_value_type::type (*zabs_)(scalar_t) = zabs; @@ -636,9 +675,11 @@ REGISTER_DISPATCH(bitwise_not_stub, &bitwise_not_kernel); REGISTER_DISPATCH(logical_not_stub, &logical_not_kernel); REGISTER_DISPATCH(frac_stub, &frac_kernel); REGISTER_DISPATCH(reciprocal_stub, &reciprocal_kernel); +REGISTER_DISPATCH(nan_to_num_stub, &nan_to_num_kernel); REGISTER_DISPATCH(neg_stub, &neg_kernel); REGISTER_DISPATCH(sign_stub, &sign_kernel); REGISTER_DISPATCH(signbit_stub, &signbit_kernel); +REGISTER_DISPATCH(sgn_stub, &sgn_kernel); REGISTER_DISPATCH(sinh_stub, &sinh_kernel); REGISTER_DISPATCH(cosh_stub, &cosh_kernel); REGISTER_DISPATCH(acosh_stub, &acosh_kernel); @@ -669,7 +710,7 @@ IMPLEMENT_COMPLEX_KERNEL(FLOATING, log10) IMPLEMENT_FLOAT_KERNEL(FLOATING, log1p) IMPLEMENT_COMPLEX_KERNEL(FLOATING, log2) IMPLEMENT_FLOAT_KERNEL(FLOATING, i0) -IMPLEMENT_COMPLEX_KERNEL(FLOATING, round) +IMPLEMENT_FLOAT_KERNEL(FLOATING, round) IMPLEMENT_COMPLEX_KERNEL(FLOATING, sin) IMPLEMENT_COMPLEX_KERNEL(FLOATING, sqrt) IMPLEMENT_COMPLEX_KERNEL(FLOATING, tan) diff --git a/aten/src/ATen/native/cpu/zmath.h b/aten/src/ATen/native/cpu/zmath.h index d6816f4dd182..e0554e0cbc29 100644 --- a/aten/src/ATen/native/cpu/zmath.h +++ b/aten/src/ATen/native/cpu/zmath.h @@ -138,6 +138,15 @@ inline c10::complex ceil_impl (c10::complex z) { return c10::complex(std::ceil(z.real()), std::ceil(z.imag())); } +template +inline c10::complex sgn_impl (c10::complex z) { + if (z == c10::complex(0, 0)) { + return c10::complex(0, 0); + } else { + return z / zabs(z); + } +} + template inline TYPE floor_impl (TYPE z) { return std::floor(z); diff --git a/aten/src/ATen/native/cuda/AbsKernel.cu b/aten/src/ATen/native/cuda/AbsKernel.cu index 4113115d7b12..649b235bf654 100644 --- a/aten/src/ATen/native/cuda/AbsKernel.cu +++ b/aten/src/ATen/native/cuda/AbsKernel.cu @@ -6,11 +6,16 @@ namespace at { 
namespace native { +template +struct AbsFunctor { + __device__ __forceinline__ scalar_t operator() (const scalar_t a) const { + return std::abs(a); + } +}; + void abs_kernel_cuda(TensorIterator& iter) { AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(ScalarType::Half, ScalarType::BFloat16, ScalarType::Bool, iter.dtype(), "abs_cuda", [&]() { - gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { - return std::abs(a); - }); + gpu_kernel(iter, AbsFunctor()); }); } diff --git a/aten/src/ATen/native/cuda/AmpKernels.cu b/aten/src/ATen/native/cuda/AmpKernels.cu index 0d8b87f402de..7f5966739c21 100644 --- a/aten/src/ATen/native/cuda/AmpKernels.cu +++ b/aten/src/ATen/native/cuda/AmpKernels.cu @@ -3,9 +3,13 @@ #include #include +#include #include -#include +#include #include +#include +#include + namespace { // Thin wrapper around https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__SINGLE.html#group__CUDA__MATH__SINGLE_1g57a3c8313f570282a1a7bcc78743b08e, @@ -33,49 +37,136 @@ static __host__ __device__ __forceinline__ int isfinite_ensure_cuda_math(float v namespace at { namespace native { -// Multiplies scaled_grad in-place by inv_scale. If an element of scaled_grad was inf or NaN sets found_inf to 1.0. -// -// Args: -// scaled_grad: A (scaled) gradient tensor. May contain infs or NaNs. -// found_inf: A single-element float tensor to which 1.0 will be written if any gradients contain infs/nans. -// Pre-zeroing found_inf, if appropriate, is the responsibility of the caller. -// inv_scale: The inverse of the scale factor by which scaled_grad is currently multiplied. -// -// Returns: -// A tuple with references to scaled_grad, which is now unscaled in place, and found_inf, -// which is now guaranteed to contain 1.0 if an inf or NaN was found in scaled_grad. +namespace { +// Single-tensor fallback for _amp_foreach_non_finite_check_and_unscale_cuda_. +// Handles individual tensors that are acceptable to unscale but not MTA-safe. void _amp_non_finite_check_and_unscale_cuda_(Tensor& scaled_grad, Tensor& found_inf, const Tensor& inv_scale) { - TORCH_CHECK(scaled_grad.is_cuda(), "scaled_grad must be a CUDA tensor."); + // The only way we reach this function is through _amp_foreach_non_finite_check_and_unscale_cuda_, so no input checks. + + // It's not obvious gpu_kernel always guards onto its argument. Guarding here just in case. + const OptionalDeviceGuard device_guard(device_of(scaled_grad)); + + // Acts on scaled_grad in place. + auto iter = TensorIterator::unary_op(scaled_grad, scaled_grad); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + iter.dtype(), + "_amp_non_finite_check_and_unscale_cuda", + [&iter, &found_inf, &inv_scale] { + auto* found_inf_ptr = found_inf.data_ptr(); + auto* inv_scale_ptr = inv_scale.data_ptr(); + + using opmath_t = get_opmath_t::opmath_t; + + gpu_kernel(iter, + [found_inf_ptr, inv_scale_ptr] GPU_LAMBDA (scalar_t val_in) -> scalar_t { + auto val = static_cast(val_in); + if (!isfinite_ensure_cuda_math(val)) { + *found_inf_ptr = 1.f; + } + // Every thread accesses inv_scale, but it will hit in cache. + const auto inv_scale_val = *inv_scale_ptr; + return static_cast(inv_scale_val == 1.f ? val : val * inv_scale_val); + }); + }); +} +} // anonymous namespace + + +// Multiplies each tensor in scaled_grads by inv_scale in-place. +// If any element of any tensor in scaled_grads is inf or NaN, sets found_inf to 1.0. +// Uses multi tensor apply (MTA) to process all MTA-safe tensors. +// +// Args: +// scaled_grads: A TensorList of scaled gradient tensors. May contain infs or NaNs. 
+// found_inf: A single-element float tensor to which 1.0 will be written if any gradient contain infs/nans. +// Pre-zeroing found_inf, if appropriate, is the responsibility of the caller. +// inv_scale: The inverse of the scale factor by which scaled_grads are currently multiplied. +void _amp_foreach_non_finite_check_and_unscale_cuda_(TensorList scaled_grads, + Tensor& found_inf, + const Tensor& inv_scale) +{ + if (scaled_grads.size() == 0) { + return; + } + TORCH_CHECK(inv_scale.is_cuda(), "inv_scale must be a CUDA tensor."); TORCH_CHECK(found_inf.is_cuda(), "found_inf must be a CUDA tensor."); TORCH_CHECK(inv_scale.numel() == 1, "inv_scale must be a 1-element tensor."); TORCH_CHECK(found_inf.numel() == 1, "found_inf must be a 1-element tensor."); TORCH_CHECK(inv_scale.scalar_type() == at::ScalarType::Float, "inv_scale must be a float tensor."); TORCH_CHECK(found_inf.scalar_type() == at::ScalarType::Float, "found_inf must be a float tensor."); - TORCH_CHECK(scaled_grad.layout() == at::kStrided, "scaled_grad must be a strided (not sparse) Tensor."); - // Act on scaled_grad in place. - auto iter = TensorIterator::unary_op(scaled_grad, scaled_grad); + // Ensures client code (GradScaler) filtered scaled_grads by dtype. + check_foreach_api_restrictions(scaled_grads); + + std::vector> tensor_lists; + + // is_non_overlapping_and_dense() is not available in Python. + // GradScaler can't filter for it. We need to filter here. + if (can_use_fast_route(scaled_grads)) { + // Hopefully common case. + // can_use_fast_route is true, which confirms: + // - all scaled_grads are strided + // - all scaled_grads are non overlapping and dense + // - all scaled_grads are on the same device + TORCH_CHECK(scaled_grads[0].is_cuda(), "scaled_grads must be CUDA tensors."); + // Sets up MTA launch to use scaled_grads as-is. + tensor_lists.emplace_back(scaled_grads.vec()); + } else { + // Hopefully uncommon case. + // can_use_fast_route is an all-or-nothing check. In this path it was false, + // so any of the above confirmations could have gone wrong. + // We filter MTA-safe tensors into an MTA-able list. + // If a tensor is acceptable but not MTA-safe, we fall back to the TensorIterator kernel. + // If a tensor is unacceptable, we throw an error to blame GradScaler. + tensor_lists.resize(1); + tensor_lists[0].reserve(scaled_grads.size()); + auto expected_device = scaled_grads[0].device(); + for (const Tensor& t : scaled_grads) { + // Ensures GradScaler filtered scaled_grads by device. + TORCH_CHECK(t.is_cuda(), "one of scaled_grads was not a CUDA tensor."); + TORCH_CHECK(t.device() == expected_device, "scaled_grads must be on the same device."); + TORCH_CHECK(t.layout() == at::kStrided, "one of scaled_grads was not a strided tensor."); + if (!t.is_non_overlapping_and_dense()) { + // t is acceptable but not MTA-safe. Falls back to single-tensor TensorIterator kernel. 
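// Illustrative aside (not part of this patch): whichever path a tensor takes,
// each element is processed by the same check-and-scale rule. A host-side
// scalar sketch of that rule, with illustrative names only:
//
//   float unscale_one(float val, float inv_scale, float* found_inf_ptr) {
//     if (!std::isfinite(val)) *found_inf_ptr = 1.f;     // flag, never throw
//     return inv_scale == 1.f ? val : val * inv_scale;   // skip the no-op multiply
//   }
//
// The single-tensor fallback invoked just below and the multi_tensor_apply
// dispatch further down both apply this rule per element; on the device the
// finiteness check goes through the isfinite_ensure_cuda_math wrapper defined
// earlier in this file.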
+ _amp_non_finite_check_and_unscale_cuda_(const_cast(t), + found_inf, + inv_scale); + } else { + tensor_lists[0].push_back(t); + } + } + if (tensor_lists[0].size() == 0) { + return; + } + } AT_DISPATCH_FLOATING_TYPES_AND_HALF( - iter.dtype(), - "_amp_non_finite_check_and_unscale_cuda", - [&iter, &found_inf, &inv_scale] { + tensor_lists[0][0].scalar_type(), + "_amp_foreach_non_finite_check_and_unscale_cuda", + [&tensor_lists, &found_inf, &inv_scale] { auto* found_inf_ptr = found_inf.data_ptr(); auto* inv_scale_ptr = inv_scale.data_ptr(); - gpu_kernel(iter, [found_inf_ptr, inv_scale_ptr]GPU_LAMBDA(scalar_t val) -> scalar_t { - float fval = static_cast(val); - // See isfinite_ensure_cuda_math above. - if (!isfinite_ensure_cuda_math(fval)) { - *found_inf_ptr = 1.f; - } - const auto inv_scale_val = *inv_scale_ptr; // Every thread accesses inv_scale, but it will hit in cache. - return static_cast(inv_scale_val == 1.f ? fval : fval*inv_scale_val); - }); + using opmath_t = get_opmath_t::opmath_t; + + // multi_tensor_apply guards onto tensor_lists[0][0], no need to guard explicitly. + multi_tensor_apply<1>(tensor_lists, + UnaryOpFunctor_(), + [found_inf_ptr, inv_scale_ptr] GPU_LAMBDA (opmath_t val) -> opmath_t { + // There is a slight asymmetry here with the TensorIterator kernel above. + // MTA Functors ensure val comes in as opmath_t rather than scalar_t. + if (!isfinite_ensure_cuda_math(val)) { + *found_inf_ptr = 1.f; + } + // Every thread accesses inv_scale, but it will hit in cache. + const auto inv_scale_val = *inv_scale_ptr; + return static_cast(inv_scale_val == 1.f ? val : val * inv_scale_val); + }); }); } diff --git a/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu b/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu index 5394c2a23239..e9dfe2d9285d 100644 --- a/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu +++ b/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu @@ -9,6 +9,7 @@ #include #include #include +#include #include // for USE_MAGMA @@ -116,17 +117,18 @@ void magmaOrgqr( magma_int_t m, magma_int_t n, magma_int_t k, scalar_t* dA, magma_int_t ldda, scalar_t* tau, scalar_t* dT, magma_int_t nb, magma_int_t* info); -template +template void magmaSymeig( magma_vec_t jobz, magma_uplo_t uplo, magma_int_t n, scalar_t* dA, magma_int_t ldda, - scalar_t* w, scalar_t* wA, magma_int_t ldwa, scalar_t* work, magma_int_t lwork, - magma_int_t* iwork, magma_int_t liwork, magma_int_t* info); + value_t* w, scalar_t* wA, magma_int_t ldwa, scalar_t* work, magma_int_t lwork, value_t* rwork, + magma_int_t lrwork, magma_int_t* iwork, magma_int_t liwork, magma_int_t* info); -template +template void magmaSvd( magma_vec_t jobz, magma_int_t m, magma_int_t n, scalar_t* A, - magma_int_t lda, scalar_t* s, scalar_t* U, magma_int_t ldu, + magma_int_t lda, value_t* s, scalar_t* U, magma_int_t ldu, scalar_t* VT, magma_int_t ldvt, scalar_t* work, magma_int_t lwork, + value_t* rwork, magma_int_t* iwork, magma_int_t* info); template @@ -344,6 +346,24 @@ void magmaCholesky( AT_CUDA_CHECK(cudaGetLastError()); } +template<> +void magmaCholesky>( + magma_uplo_t uplo, magma_int_t n, c10::complex* dA, + magma_int_t ldda, magma_int_t* info) { + MagmaStreamSyncGuard guard; + magma_zpotrf_gpu(uplo, n, reinterpret_cast(dA), ldda, info); + AT_CUDA_CHECK(cudaGetLastError()); +} + +template<> +void magmaCholesky>( + magma_uplo_t uplo, magma_int_t n, c10::complex* dA, + magma_int_t ldda, magma_int_t* info) { + MagmaStreamSyncGuard guard; + magma_cpotrf_gpu(uplo, n, reinterpret_cast(dA), ldda, info); + AT_CUDA_CHECK(cudaGetLastError()); +} 
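// Illustrative aside (not part of this patch): these complex specializations
// pass c10::complex buffers straight to MAGMA via reinterpret_cast, which only
// works because a complex value is laid out as two contiguous reals. A
// standalone sketch of that layout guarantee using std::complex, where the
// standard explicitly sanctions the reinterpretation:
#include <cassert>
#include <complex>

void complex_layout_demo() {
  std::complex<double> z{3.0, -4.0};
  // Since C++11, reinterpreting a std::complex<double> as double[2] is defined:
  // element 0 is the real part, element 1 the imaginary part.
  double* parts = reinterpret_cast<double*>(&z);
  assert(parts[0] == 3.0);
  assert(parts[1] == -4.0);
}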
+ template<> void magmaCholeskyBatched( magma_uplo_t uplo, magma_int_t n, double** dA_array, magma_int_t ldda, @@ -360,6 +380,22 @@ void magmaCholeskyBatched( AT_CUDA_CHECK(cudaGetLastError()); } +template<> +void magmaCholeskyBatched>( + magma_uplo_t uplo, magma_int_t n, c10::complex** dA_array, magma_int_t ldda, + magma_int_t* info_array, magma_int_t batchsize, const MAGMAQueue& magma_queue) { + magma_zpotrf_batched(uplo, n, reinterpret_cast(dA_array), ldda, info_array, batchsize, magma_queue.get_queue()); + AT_CUDA_CHECK(cudaGetLastError()); +} + +template<> +void magmaCholeskyBatched>( + magma_uplo_t uplo, magma_int_t n, c10::complex** dA_array, magma_int_t ldda, + magma_int_t* info_array, magma_int_t batchsize, const MAGMAQueue& magma_queue) { + magma_cpotrf_batched(uplo, n, reinterpret_cast(dA_array), ldda, info_array, batchsize, magma_queue.get_queue()); + AT_CUDA_CHECK(cudaGetLastError()); +} + template<> void magmaTriangularSolve( magma_uplo_t uplo, magma_trans_t trans, magma_diag_t diag, magma_int_t m, magma_int_t n, @@ -453,8 +489,10 @@ void magmaOrgqr( template<> void magmaSymeig( magma_vec_t jobz, magma_uplo_t uplo, magma_int_t n, double* dA, magma_int_t ldda, - double* w, double* wA, magma_int_t ldwa, double* work, magma_int_t lwork, - magma_int_t* iwork, magma_int_t liwork, magma_int_t* info) { + double* w, double* wA, magma_int_t ldwa, double* work, magma_int_t lwork, double* rwork, + magma_int_t lrwork, magma_int_t* iwork, magma_int_t liwork, magma_int_t* info) { + (void)rwork; // unused + (void)lrwork; // unused MagmaStreamSyncGuard guard; magma_dsyevd_gpu(jobz, uplo, n, dA, ldda, w, wA, ldwa, work, lwork, iwork, liwork, info); AT_CUDA_CHECK(cudaGetLastError()); @@ -463,19 +501,46 @@ void magmaSymeig( template<> void magmaSymeig( magma_vec_t jobz, magma_uplo_t uplo, magma_int_t n, float* dA, magma_int_t ldda, - float* w, float* wA, magma_int_t ldwa, float* work, magma_int_t lwork, - magma_int_t* iwork, magma_int_t liwork, magma_int_t* info) { + float* w, float* wA, magma_int_t ldwa, float* work, magma_int_t lwork, float* rwork, + magma_int_t lrwork, magma_int_t* iwork, magma_int_t liwork, magma_int_t* info) { + (void)rwork; // unused + (void)lrwork; // unused MagmaStreamSyncGuard guard; magma_ssyevd_gpu(jobz, uplo, n, dA, ldda, w, wA, ldwa, work, lwork, iwork, liwork, info); AT_CUDA_CHECK(cudaGetLastError()); } +template<> +void magmaSymeig, double>( + magma_vec_t jobz, magma_uplo_t uplo, magma_int_t n, c10::complex* dA, magma_int_t ldda, + double* w, c10::complex* wA, magma_int_t ldwa, c10::complex* work, magma_int_t lwork, double* rwork, + magma_int_t lrwork, magma_int_t* iwork, magma_int_t liwork, magma_int_t* info) { + MagmaStreamSyncGuard guard; + magma_zheevd_gpu( + jobz, uplo, n, reinterpret_cast(dA), ldda, w, reinterpret_cast(wA), + ldwa, reinterpret_cast(work), lwork, rwork, lrwork, iwork, liwork, info); + AT_CUDA_CHECK(cudaGetLastError()); +} + +template<> +void magmaSymeig, float>( + magma_vec_t jobz, magma_uplo_t uplo, magma_int_t n, c10::complex* dA, magma_int_t ldda, + float* w, c10::complex* wA, magma_int_t ldwa, c10::complex* work, magma_int_t lwork, float* rwork, + magma_int_t lrwork, magma_int_t* iwork, magma_int_t liwork, magma_int_t* info) { + MagmaStreamSyncGuard guard; + magma_cheevd_gpu( + jobz, uplo, n, reinterpret_cast(dA), ldda, w, reinterpret_cast(wA), + ldwa, reinterpret_cast(work), lwork, rwork, lrwork, iwork, liwork, info); + AT_CUDA_CHECK(cudaGetLastError()); +} + template<> void magmaSvd( magma_vec_t jobz, magma_int_t m, magma_int_t n, 
double* A, magma_int_t lda, double* s, double* U, magma_int_t ldu, double* VT, magma_int_t ldvt, double* work, magma_int_t lwork, - magma_int_t* iwork, magma_int_t* info) { + double *rwork, magma_int_t* iwork, magma_int_t* info) { + (void)rwork; // unused MagmaStreamSyncGuard guard; magma_dgesdd(jobz, m, n, A, lda, s, U, ldu, VT, ldvt, work, lwork, iwork, info); AT_CUDA_CHECK(cudaGetLastError()); @@ -486,12 +551,43 @@ void magmaSvd( magma_vec_t jobz, magma_int_t m, magma_int_t n, float* A, magma_int_t lda, float* s, float* U, magma_int_t ldu, float* VT, magma_int_t ldvt, float* work, magma_int_t lwork, - magma_int_t* iwork, magma_int_t* info) { + float* rwork, magma_int_t* iwork, magma_int_t* info) { + (void)rwork; // unused MagmaStreamSyncGuard guard; magma_sgesdd(jobz, m, n, A, lda, s, U, ldu, VT, ldvt, work, lwork, iwork, info); AT_CUDA_CHECK(cudaGetLastError()); } +template<> +void magmaSvd, float>( + magma_vec_t jobz, magma_int_t m, magma_int_t n, c10::complex* A, + magma_int_t lda, float* s, c10::complex* U, magma_int_t ldu, + c10::complex* VT, magma_int_t ldvt, c10::complex* work, magma_int_t lwork, + float *rwork, magma_int_t* iwork, magma_int_t* info) { + MagmaStreamSyncGuard guard; + magma_cgesdd(jobz, m, n, reinterpret_cast(A), lda, s, + reinterpret_cast(U), ldu, + reinterpret_cast(VT), ldvt, + reinterpret_cast(work), lwork, + rwork, iwork, info); + AT_CUDA_CHECK(cudaGetLastError()); +} + +template<> +void magmaSvd, double>( + magma_vec_t jobz, magma_int_t m, magma_int_t n, c10::complex* A, + magma_int_t lda, double* s, c10::complex* U, magma_int_t ldu, + c10::complex* VT, magma_int_t ldvt, c10::complex* work, magma_int_t lwork, + double *rwork, magma_int_t* iwork, magma_int_t* info) { + MagmaStreamSyncGuard guard; + magma_zgesdd(jobz, m, n, reinterpret_cast(A), lda, s, + reinterpret_cast(U), ldu, + reinterpret_cast(VT), ldvt, + reinterpret_cast(work), lwork, + rwork, iwork, info); + AT_CUDA_CHECK(cudaGetLastError()); +} + template<> void magmaLuSolve( magma_int_t n, magma_int_t nrhs, double* dA, magma_int_t ldda, magma_int_t* ipiv, @@ -904,7 +1000,7 @@ Tensor _cholesky_helper_cuda(const Tensor& self, bool upper) { self_working_copy = cloneBatchedColumnMajor(self); } - AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "cholesky_cuda", [&]{ + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(self.scalar_type(), "cholesky_cuda", [&]{ apply_cholesky(self_working_copy, false, infos); }); if (self.dim() > 2) { @@ -1201,8 +1297,9 @@ static void apply_symeig(Tensor& self, Tensor& eigvals, bool eigenvectors, bool AT_ERROR("symeig: MAGMA library not found in " "compilation. 
Please rebuild with MAGMA."); #else + using value_t = typename c10::scalar_value_type::type; auto self_data = self.data_ptr(); - auto eigvals_data = eigvals.data_ptr(); + auto eigvals_data = eigvals.data_ptr(); auto self_matrix_stride = matrixStride(self); auto eigvals_stride = eigvals.size(-1); int64_t batch_size = batchCount(self); @@ -1223,20 +1320,30 @@ AT_ERROR("symeig: MAGMA library not found in " scalar_t wkopt; magma_int_t liwork = -1; magma_int_t iwkopt; - magmaSymeig(jobz, uplo, n, self_data, n, eigvals_data, wA, n, &wkopt, lwork, &iwkopt, liwork, &info); + magma_int_t lrwork = -1; + value_t rwkopt; + magmaSymeig(jobz, uplo, n, self_data, n, eigvals_data, wA, n, &wkopt, lwork, &rwkopt, lrwork, &iwkopt, liwork, &info); scalar_t* work; magma_int_t* iwork; - lwork = magma_int_cast(wkopt, "work_size"); + lwork = magma_int_cast(real_impl(wkopt), "work_size"); liwork = magma_int_cast(iwkopt, "iwork_size"); ALLOCATE_ARRAY(work, scalar_t, lwork); ALLOCATE_ARRAY(iwork, magma_int_t, liwork); + value_t* rwork = nullptr; + c10::Storage storage_rwork; + if (isComplexType(at::typeMetaToScalarType(self.dtype()))) { + lrwork = magma_int_cast(rwkopt, "rwork_size"); + storage_rwork = pin_memory(lrwork); + rwork = static_cast(storage_rwork.data()); + } + for (int64_t i = 0; i < batch_size; i++) { scalar_t* self_working_ptr = &self_data[i * self_matrix_stride]; - scalar_t* eigvals_working_ptr = &eigvals_data[i * eigvals_stride]; - magmaSymeig(jobz, uplo, n, self_working_ptr, n, eigvals_working_ptr, - wA, n, work, lwork, iwork, liwork, &info); + value_t* eigvals_working_ptr = &eigvals_data[i * eigvals_stride]; + magmaSymeig(jobz, uplo, n, self_working_ptr, n, eigvals_working_ptr, + wA, n, work, lwork, rwork, lrwork, iwork, liwork, &info); infos[i] = info; if (info != 0) { return; @@ -1250,6 +1357,7 @@ std::tuple _symeig_helper_cuda(const Tensor& self, bool eigenvec auto self_sizes = self.sizes().vec(); self_sizes.pop_back(); + ScalarType dtype = toValueType(typeMetaToScalarType(self.dtype())); // magmaSymeig uses a hybrid CPU-GPU algorithm to compute the eigenvalues and eigenvectors. // The driver routine magma_(d/s)syev_gpu accepts a tensor on the CPU for eigvalenvalues. @@ -1257,15 +1365,15 @@ std::tuple _symeig_helper_cuda(const Tensor& self, bool eigenvec // In the case where self.numel() == 0, we just return an empty tensor of // dimensions on the CUDA (to avoid the unnecessary "to(at::kCUDA)") auto eigvals_working_copy = self.numel() == 0 - ? at::empty(self_sizes, self.options()) - : at::empty(self_sizes, self.options().device(at::kCPU)); + ? at::empty(self_sizes, self.options().dtype(dtype)) + : at::empty(self_sizes, self.options().dtype(dtype).device(at::kCPU)); if (self.numel() == 0) { return std::tuple(eigvals_working_copy, at::empty_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT)); } auto self_working_copy = cloneBatchedColumnMajor(self); - AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "symeig_cuda", [&]{ + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(self.scalar_type(), "symeig_cuda", [&]{ apply_symeig(self_working_copy, eigvals_working_copy, eigenvectors, upper, infos); }); @@ -1290,9 +1398,10 @@ static void apply_svd(Tensor& self, Tensor& U, Tensor& S, Tensor& VT, AT_ERROR("svd: MAGMA library not found in " "compilation. 
Please rebuild with MAGMA."); #else + using value_t = typename c10::scalar_value_type::type; auto self_data = self.data_ptr(); auto U_data = U.data_ptr(); - auto S_data = S.data_ptr(); + auto S_data = S.data_ptr(); auto VT_data = VT.data_ptr(); auto self_stride = matrixStride(self); auto U_stride = matrixStride(U); @@ -1304,7 +1413,18 @@ AT_ERROR("svd: MAGMA library not found in " magma_int_t m = magma_int_cast(self.size(-2), "m"); magma_int_t n = magma_int_cast(self.size(-1), "n"); - auto k = std::min(m, n); + auto mn = std::min(m, n); + + c10::Storage storage_rwork; + value_t* rwork = nullptr; + + magma_int_t* iwork; + ALLOCATE_ARRAY(iwork, magma_int_t, 8 * mn); + if (isComplexType(at::typeMetaToScalarType(self.dtype()))) { + auto lrwork = computeLRWorkDim(jobchar, m, n); + storage_rwork = pin_memory(lrwork); + rwork = static_cast(storage_rwork.data()); + } magma_int_t info = 0; // Run once, first to get the optimum work size. @@ -1313,22 +1433,20 @@ AT_ERROR("svd: MAGMA library not found in " // and (batch_size - 1) calls to allocate and deallocate workspace using at::empty() magma_int_t lwork = -1; scalar_t wkopt; - magma_int_t* iwork; - ALLOCATE_ARRAY(iwork, magma_int_t, 8 * k); - magmaSvd(jobz, m, n, self_data, m, S_data, U_data, m, VT_data, n, &wkopt, lwork, iwork, &info); - lwork = magma_int_cast(wkopt, "work_size"); + magmaSvd(jobz, m, n, self_data, m, S_data, U_data, m, VT_data, n, &wkopt, lwork, rwork, iwork, &info); + lwork = magma_int_cast(real_impl(wkopt), "work_size"); scalar_t* work; ALLOCATE_ARRAY(work, scalar_t, lwork); for (int64_t i = 0; i < batchsize; i++) { scalar_t* self_working_ptr = &self_data[i * self_stride]; - scalar_t* S_working_ptr = &S_data[i * S_stride]; + value_t* S_working_ptr = &S_data[i * S_stride]; scalar_t* U_working_ptr = &U_data[i * U_stride]; scalar_t* VT_working_ptr = &VT_data[i * VT_stride]; // Compute S, U (optionally), VT (optionally) - magmaSvd(jobz, m, n, self_working_ptr, m, - S_working_ptr, U_working_ptr, m, VT_working_ptr, n, work, lwork, iwork, &info); + magmaSvd(jobz, m, n, self_working_ptr, m, + S_working_ptr, U_working_ptr, m, VT_working_ptr, n, work, lwork, rwork, iwork, &info); infos[i] = info; if (info != 0) { return; @@ -1361,7 +1479,7 @@ std::tuple _svd_helper_cuda(const Tensor& self, bool som at::TensorOptions(at::kCPU).dtype(self.dtype()).pinned_memory(true)); self_working_copy.copy_(self); - AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "svd_cuda", [&]{ + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(self.scalar_type(), "svd_cuda", [&] { apply_svd(self_working_copy, U_working_copy, S_working_copy, VT_working_copy, jobchar, infos); }); diff --git a/aten/src/ATen/native/cuda/BatchLinearAlgebraLib.h b/aten/src/ATen/native/cuda/BatchLinearAlgebraLib.h index 85014c5773ee..dc6dc2f9daca 100644 --- a/aten/src/ATen/native/cuda/BatchLinearAlgebraLib.h +++ b/aten/src/ATen/native/cuda/BatchLinearAlgebraLib.h @@ -7,8 +7,8 @@ #include #include -#if defined(CUDART_VERSION) && CUDART_VERSION >= 10000 -// some cusolver functions doesn't work well on cuda 9.2, cusolver is used on cuda >= 10.0 +#if defined(CUDART_VERSION) && defined(CUSOLVER_VERSION) && CUSOLVER_VERSION >= 10200 +// some cusolver functions don't work well on cuda 9.2 or cuda 10.1.105, cusolver is used on cuda >= 10.1.243 #define USE_CUSOLVER #endif diff --git a/aten/src/ATen/native/cuda/BinaryAddSubKernel.cu b/aten/src/ATen/native/cuda/BinaryAddSubKernel.cu index f05d73453dcf..864fb0a848df 100644 --- a/aten/src/ATen/native/cuda/BinaryAddSubKernel.cu +++ 
b/aten/src/ATen/native/cuda/BinaryAddSubKernel.cu @@ -8,12 +8,20 @@ namespace at { namespace native { +template +struct AddFunctor { + AddFunctor(scalar_t a): alpha(a) {} + __device__ __forceinline__ scalar_t operator() (const scalar_t a, const scalar_t b) const { + return a + alpha * b; + } + private: + scalar_t alpha; +}; + void add_kernel_cuda(TensorIterator& iter, Scalar alpha_scalar) { AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kHalf, kBool, kBFloat16, iter.common_dtype(), "add_cuda/sub_cuda", [&]() { - auto alpha = alpha_scalar.to(); - gpu_kernel_with_scalars(iter, [alpha]GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { - return a + alpha * b; - }); + AddFunctor f(alpha_scalar.to()); + gpu_kernel_with_scalars(iter, f); }); } diff --git a/aten/src/ATen/native/cuda/BinaryBitwiseOpsKernels.cu b/aten/src/ATen/native/cuda/BinaryBitwiseOpsKernels.cu index 128c05bed3cb..30894b568762 100644 --- a/aten/src/ATen/native/cuda/BinaryBitwiseOpsKernels.cu +++ b/aten/src/ATen/native/cuda/BinaryBitwiseOpsKernels.cu @@ -9,60 +9,67 @@ namespace at { namespace native { -void bitwise_and_kernel_cuda(TensorIterator& iter) { - if (iter.dtype() == ScalarType::Bool) { - gpu_kernel_with_scalars( - iter, - []GPU_LAMBDA(bool a, bool b) { - return a && b; - }); - } else { - AT_DISPATCH_INTEGRAL_TYPES(iter.dtype(), "bitwise_and_cuda", [&]() { - gpu_kernel_with_scalars( - iter, - []GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { - return a & b; - }); - }); +template +struct BitwiseAndFunctor { + __device__ __forceinline__ scalar_t operator()(scalar_t a, scalar_t b) const { + return a & b; + } +}; + +template<> +struct BitwiseAndFunctor { + __device__ __forceinline__ bool operator()(bool a, bool b) const { + return a && b; } +}; + +void bitwise_and_kernel_cuda(TensorIterator& iter) { + AT_DISPATCH_INTEGRAL_TYPES_AND(kBool, iter.dtype(), "bitwise_and_cuda", [&]() { + BitwiseAndFunctor f; + gpu_kernel_with_scalars(iter, f); + }); } -void bitwise_or_kernel_cuda(TensorIterator& iter) { - if (iter.dtype() == ScalarType::Bool) { - gpu_kernel_with_scalars( - iter, - []GPU_LAMBDA(bool a, bool b) { - return a || b; - }); - } else { - AT_DISPATCH_INTEGRAL_TYPES(iter.dtype(), "bitwise_or_cuda", [&]() { - gpu_kernel_with_scalars( - iter, - []GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { - return a | b; - }); - }); +template +struct BitwiseOrFunctor { + __device__ __forceinline__ scalar_t operator()(scalar_t a, scalar_t b) const { + return a | b; + } +}; + +template<> +struct BitwiseOrFunctor { + __device__ __forceinline__ bool operator()(bool a, bool b) const { + return a || b; } +}; + +void bitwise_or_kernel_cuda(TensorIterator& iter) { + AT_DISPATCH_INTEGRAL_TYPES_AND(kBool, iter.dtype(), "bitwise_or_cuda", [&]() { + BitwiseOrFunctor f; + gpu_kernel_with_scalars(iter, f); + }); } -void bitwise_xor_kernel_cuda(TensorIterator& iter) { - if (iter.dtype() == ScalarType::Bool) { - // Boolean type does not work with ^ (bitwise XOR) in C++. bitwise_xor wraps this operation for both Boolean and - // integral types. 
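// Illustrative aside (not part of this patch): throughout this file the
// per-dtype lambdas are being replaced by small named functor templates, each
// with an explicit bool specialization (the removed comment above explains why
// bool needs logical rather than bitwise operators). A standalone host-side
// sketch of the same specialization pattern, applied with std::transform:
#include <algorithm>
#include <vector>

template <typename T>
struct XorFunctorSketch {
  T operator()(T a, T b) const { return a ^ b; }            // bitwise XOR for integers
};

template <>
struct XorFunctorSketch<bool> {
  bool operator()(bool a, bool b) const { return a != b; }  // logical XOR for bool
};

template <typename T>
std::vector<T> elementwise_xor(const std::vector<T>& a, const std::vector<T>& b) {
  std::vector<T> out(a.size());
  std::transform(a.begin(), a.end(), b.begin(), out.begin(), XorFunctorSketch<T>{});
  return out;
}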
- gpu_kernel_with_scalars( - iter, - []GPU_LAMBDA(bool a, bool b) { - return a != b; - }); - } else { - AT_DISPATCH_INTEGRAL_TYPES(iter.dtype(), "bitwise_xor_cuda", [&]() { - gpu_kernel_with_scalars( - iter, - []GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { - return a ^ b; - }); - }); +template +struct BitwiseXorFunctor { + __device__ __forceinline__ scalar_t operator()(scalar_t a, scalar_t b) const { + return a ^ b; } +}; + +template<> +struct BitwiseXorFunctor { + __device__ __forceinline__ bool operator()(bool a, bool b) const { + return a != b; + } +}; + +void bitwise_xor_kernel_cuda(TensorIterator& iter) { + AT_DISPATCH_INTEGRAL_TYPES_AND(kBool, iter.dtype(), "bitwise_xor_cuda", [&]() { + BitwiseXorFunctor f; + gpu_kernel_with_scalars(iter, f); + }); } REGISTER_DISPATCH(bitwise_and_stub, &bitwise_and_kernel_cuda); diff --git a/aten/src/ATen/native/cuda/BinaryLogicalOpsKernels.cu b/aten/src/ATen/native/cuda/BinaryLogicalOpsKernels.cu index 2a9b188520f5..de11baa28210 100644 --- a/aten/src/ATen/native/cuda/BinaryLogicalOpsKernels.cu +++ b/aten/src/ATen/native/cuda/BinaryLogicalOpsKernels.cu @@ -10,7 +10,8 @@ namespace at { namespace native { void logical_and_kernel_cuda(TensorIterator& iter) { - AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBool, iter.common_dtype(), "logical_and_cuda", [&]() { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kHalf, kBool, ScalarType::BFloat16, + iter.common_dtype(), "logical_and_cuda", [&]() { gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool { return a && b; }); @@ -18,7 +19,8 @@ void logical_and_kernel_cuda(TensorIterator& iter) { } void logical_or_kernel_cuda(TensorIterator& iter) { - AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBool, iter.common_dtype(), "logical_or_cuda", [&]() { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kHalf, kBool, ScalarType::BFloat16, + iter.common_dtype(), "logical_or_cuda", [&]() { gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool { return a || b; }); @@ -26,7 +28,8 @@ void logical_or_kernel_cuda(TensorIterator& iter) { } void logical_xor_kernel_cuda(TensorIterator& iter) { - AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBool, iter.common_dtype(), "logical_xor_cuda", [&]() { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kHalf, kBool, ScalarType::BFloat16, + iter.common_dtype(), "logical_xor_cuda", [&]() { gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool { return bool(a) != bool(b); }); diff --git a/aten/src/ATen/native/cuda/BinaryMiscOpsKernels.cu b/aten/src/ATen/native/cuda/BinaryMiscOpsKernels.cu index a2ffdb75c84b..fc9aa74f91f4 100644 --- a/aten/src/ATen/native/cuda/BinaryMiscOpsKernels.cu +++ b/aten/src/ATen/native/cuda/BinaryMiscOpsKernels.cu @@ -19,11 +19,12 @@ void atan2_kernel_cuda(TensorIterator& iter) { }); } -void smooth_l1_kernel_cuda(TensorIterator& iter) { - AT_DISPATCH_FLOATING_TYPES_AND_HALF(iter.dtype(), "smooth_l1_cuda", [&]() { - gpu_kernel(iter, [] GPU_LAMBDA (scalar_t a, scalar_t b) -> scalar_t { +void smooth_l1_kernel_cuda(TensorIterator& iter, double beta) { + AT_DISPATCH_FLOATING_TYPES_AND_HALF(iter.dtype(), "smooth_l1_cuda", [&iter, beta]() { + scalar_t beta_val(beta); + gpu_kernel(iter, [beta_val] GPU_LAMBDA (scalar_t a, scalar_t b) -> scalar_t { auto z = ::abs(a - b); - return z < scalar_t(1.) ? scalar_t(0.5) * z * z : z - scalar_t(0.5); + return z < beta_val ? 
scalar_t(0.5) * z * z / beta_val : z - scalar_t(0.5) * beta_val; }); }); } diff --git a/aten/src/ATen/native/cuda/BinaryMulDivKernel.cu b/aten/src/ATen/native/cuda/BinaryMulDivKernel.cu index 044fc955b954..be3f4f0bb01e 100644 --- a/aten/src/ATen/native/cuda/BinaryMulDivKernel.cu +++ b/aten/src/ATen/native/cuda/BinaryMulDivKernel.cu @@ -11,6 +11,39 @@ namespace at { namespace native { +template +struct MulScalarFunctor { + MulScalarFunctor(accscalar_t b_): b(b_) {} + __device__ scalar_t operator() (scalar_t a) const { + return a * b; + } + private: + accscalar_t b; +}; + +template +struct DivFunctor { + __device__ scalar_t operator() (scalar_t a, scalar_t b) const { + return a / b; + } +}; + +template +struct MulFunctor { + __device__ scalar_t operator() (scalar_t a, scalar_t b) const { + return a * b; + } +}; + +// Workaround for the error: '*' in boolean context, suggest '&&' instead [-Werror=int-in-bool-context] +template<> +struct MulFunctor { + __device__ bool operator() (bool a, bool b) const { + return a && b; + } +}; + + void div_kernel_cuda(TensorIterator& iter) { if (!isIntegralType(iter.common_dtype(), /*includeBool*/ false) && iter.is_cpu_scalar(2)) { // optimization for floating-point types: if the second operand is a CPU @@ -20,44 +53,35 @@ void div_kernel_cuda(TensorIterator& iter) { using accscalar_t = at::acc_type; auto inv_b = accscalar_t(1.0) / iter.scalar_value(2); iter.remove_operand(2); - gpu_kernel(iter, [inv_b]GPU_LAMBDA(scalar_t a) -> scalar_t { - return a * inv_b; - }); + MulScalarFunctor f(inv_b); + gpu_kernel(iter, f); }); } else { AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kHalf, kBFloat16, iter.common_dtype(), "div_cuda", [&]() { - gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { - return a / b; - }); + DivFunctor f; + gpu_kernel_with_scalars(iter, f); }); } } void mul_kernel_cuda(TensorIterator& iter) { - if (iter.common_dtype() == ScalarType::Bool) { - // Workaround for the error: '*' in boolean context, suggest '&&' instead [-Werror=int-in-bool-context] - gpu_kernel_with_scalars(iter, []GPU_LAMBDA(bool a, bool b) -> bool { - return a && b; - }); - } else if (!isIntegralType(iter.common_dtype(), /*includeBool*/ false) && + if (!isIntegralType(iter.common_dtype(), /*includeBool*/ false) && (iter.is_cpu_scalar(1) || iter.is_cpu_scalar(2))) { - //if common dtype is half the scalar constant can overflow in half precision, and yet the result can - //still be representable in the half dtype. Cast scalar to acc_type to have better accuracy - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, iter.common_dtype(), "mul_cuda", [&]() { - using accscalar_t = at::acc_type; - int scalar_arg = iter.is_cpu_scalar(1) ? 1 : 2; - auto b = iter.scalar_value(scalar_arg); - iter.remove_operand(scalar_arg); - const cuda::OptionalCUDAGuard device_guard(device_of(iter.tensor(1))); - gpu_kernel(iter, [b]GPU_LAMBDA(scalar_t a) -> scalar_t { - return a * b; - }); - }); + //if common dtype is half the scalar constant can overflow in half precision, and yet the result can + //still be representable in the half dtype. Cast scalar to acc_type to have better accuracy + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, iter.common_dtype(), "mul_cuda", [&]() { + using accscalar_t = at::acc_type; + int scalar_arg = iter.is_cpu_scalar(1) ? 
1 : 2; + auto b = iter.scalar_value(scalar_arg); + iter.remove_operand(scalar_arg); + const cuda::OptionalCUDAGuard device_guard(device_of(iter.tensor(1))); + MulScalarFunctor f(b); + gpu_kernel(iter, f); + }); } else { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kHalf, kBFloat16, iter.common_dtype(), "mul_cuda", [&]() { - gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { - return a * b; - }); + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kHalf, kBFloat16, kBool, iter.common_dtype(), "mul_cuda", [&]() { + MulFunctor f; + gpu_kernel_with_scalars(iter, f); }); } } diff --git a/aten/src/ATen/native/cuda/CompareEQKernel.cu b/aten/src/ATen/native/cuda/CompareEQKernel.cu index 947b53bce8fd..20f76ce0d8e1 100644 --- a/aten/src/ATen/native/cuda/CompareEQKernel.cu +++ b/aten/src/ATen/native/cuda/CompareEQKernel.cu @@ -10,11 +10,16 @@ namespace at { namespace native { +template +struct CompareEqFunctor { + __device__ __forceinline__ bool operator() (scalar_t a, scalar_t b) const { + return a == b; + } +}; + void eq_kernel_cuda(TensorIterator& iter) { AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kHalf, kBFloat16, kBool, iter.common_dtype(), "eq_cuda", [&]() { - gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool { - return a == b; - }); + gpu_kernel_with_scalars(iter, CompareEqFunctor()); }); } diff --git a/aten/src/ATen/native/cuda/CompareGEKernel.cu b/aten/src/ATen/native/cuda/CompareGEKernel.cu index e276237ea8e6..c96b7f3929bc 100644 --- a/aten/src/ATen/native/cuda/CompareGEKernel.cu +++ b/aten/src/ATen/native/cuda/CompareGEKernel.cu @@ -10,11 +10,16 @@ namespace at { namespace native { +template +struct CompareGEFunctor { + __device__ __forceinline__ bool operator() (scalar_t a, scalar_t b) const { + return a >= b; + } +}; + void ge_kernel_cuda(TensorIterator& iter) { AT_DISPATCH_ALL_TYPES_AND3(kHalf, kBFloat16, kBool, iter.common_dtype(), "ge_cuda", [&]() { - gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool { - return a >= b; - }); + gpu_kernel_with_scalars(iter, CompareGEFunctor()); }); } diff --git a/aten/src/ATen/native/cuda/CompareGTKernel.cu b/aten/src/ATen/native/cuda/CompareGTKernel.cu index c17b14855dd6..cbd189ed1b6d 100644 --- a/aten/src/ATen/native/cuda/CompareGTKernel.cu +++ b/aten/src/ATen/native/cuda/CompareGTKernel.cu @@ -10,11 +10,16 @@ namespace at { namespace native { +template +struct CompareGTFunctor { + __device__ __forceinline__ bool operator() (scalar_t a, scalar_t b) const { + return a > b; + } +}; + void gt_kernel_cuda(TensorIterator& iter) { AT_DISPATCH_ALL_TYPES_AND3(kHalf, kBFloat16, kBool, iter.common_dtype(), "gt_cuda", [&]() { - gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool { - return a > b; - }); + gpu_kernel_with_scalars(iter, CompareGTFunctor()); }); } diff --git a/aten/src/ATen/native/cuda/CompareLEKernel.cu b/aten/src/ATen/native/cuda/CompareLEKernel.cu index 3987b87e918c..13e60a78ffb2 100644 --- a/aten/src/ATen/native/cuda/CompareLEKernel.cu +++ b/aten/src/ATen/native/cuda/CompareLEKernel.cu @@ -10,11 +10,16 @@ namespace at { namespace native { +template +struct CompareLEFunctor { + __device__ __forceinline__ bool operator() (scalar_t a, scalar_t b) const { + return a <= b; + } +}; + void le_kernel_cuda(TensorIterator& iter) { AT_DISPATCH_ALL_TYPES_AND3(kHalf, kBFloat16, kBool, iter.common_dtype(), "le_cuda", [&]() { - gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool { - return a <= b; - }); + gpu_kernel_with_scalars(iter, 
CompareLEFunctor()); }); } diff --git a/aten/src/ATen/native/cuda/CompareLTKernel.cu b/aten/src/ATen/native/cuda/CompareLTKernel.cu index 3684d65f6631..e301284c83e7 100644 --- a/aten/src/ATen/native/cuda/CompareLTKernel.cu +++ b/aten/src/ATen/native/cuda/CompareLTKernel.cu @@ -10,11 +10,16 @@ namespace at { namespace native { +template +struct CompareLTFunctor { + __device__ __forceinline__ bool operator() (scalar_t a, scalar_t b) const { + return a < b; + } +}; + void lt_kernel_cuda(TensorIterator& iter) { AT_DISPATCH_ALL_TYPES_AND3(kHalf, kBFloat16, kBool, iter.common_dtype(), "lt_cuda", [&]() { - gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool { - return a < b; - }); + gpu_kernel_with_scalars(iter, CompareLTFunctor()); }); } diff --git a/aten/src/ATen/native/cuda/CompareNEKernel.cu b/aten/src/ATen/native/cuda/CompareNEKernel.cu index 0834a0d2b3bb..3ef397ec5200 100644 --- a/aten/src/ATen/native/cuda/CompareNEKernel.cu +++ b/aten/src/ATen/native/cuda/CompareNEKernel.cu @@ -10,11 +10,16 @@ namespace at { namespace native { +template +struct CompareNEFunctor { + __device__ __forceinline__ bool operator() (scalar_t a, scalar_t b) const { + return a != b; + } +}; + void ne_kernel_cuda(TensorIterator& iter) { AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kHalf, kBFloat16, kBool, iter.common_dtype(), "ne_cuda", [&]() { - gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> bool { - return a != b; - }); + gpu_kernel_with_scalars(iter, CompareNEFunctor()); }); } diff --git a/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu b/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu index c629dfc4030c..3e0e70c01952 100644 --- a/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu +++ b/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu @@ -366,70 +366,68 @@ void max_pool2d_with_indices_out_cuda_template( AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), "max_pool2d_with_indices_out_cuda_frame", [&] { - AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "max_pool2d_with_indices_out_cuda_frame", [&] { - using accscalar_t = acc_type; - - scalar_t *output_data = output.data_ptr(); - scalar_t *input_data = input.data_ptr(); - int64_t *indices_data = indices.data_ptr(); - - switch (memory_format) { - case MemoryFormat::ChannelsLast: { - const int max_threads = std::min( - at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, CUDA_MAX_THREADS); - int* maxThreadsDim = at::cuda::getCurrentDeviceProperties()->maxThreadsDim; - int block_x = std::min( - maxThreadsDim[0], std::min(lastPow2(nInputPlane), at::cuda::warp_size())); - int block_y = std::min( - maxThreadsDim[1], std::min(lastPow2(outputWidth), max_threads / block_x)); - int block_z = std::min( - maxThreadsDim[2], std::min(lastPow2(outputHeight), max_threads / block_x / block_y)); - block_x = std::min( - maxThreadsDim[0], std::min(lastPow2(nInputPlane), max_threads / block_y / block_z)); - const dim3 block(block_x, block_y, block_z); - - int kernel_stride_C = cuda::ATenCeilDiv( - safe_downcast(nInputPlane), block_x * 4); - int kernel_size_C = cuda::ATenCeilDiv( - safe_downcast(nInputPlane), block_x * kernel_stride_C); - - int grid_x = nbatch*kernel_stride_C; - int grid_y = std::min( - at::cuda::getCurrentDeviceProperties()->maxGridSize[1], - cuda::ATenCeilDiv(safe_downcast(outputWidth), block_y*BLOCK_STRIDE)); - int grid_z = std::min( - at::cuda::getCurrentDeviceProperties()->maxGridSize[2], - cuda::ATenCeilDiv(safe_downcast(outputHeight), block_z*BLOCK_STRIDE)); - const dim3 grid(grid_x, grid_y, grid_z); - - size_t 
shmem_size = (kernel_size_C * block_x*block_y*block_z) * (sizeof(int) + sizeof(scalar_t)); - AT_ASSERT(shmem_size <= at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock); - - max_pool_forward_nhwc - <<>>( - input_data, nbatch, - nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, - kH, kW, dH, dW, padH, padW, dilationH, dilationW, - in_stride_n, in_stride_c, - in_stride_h, in_stride_w, - kernel_stride_C, kernel_size_C, - output_data, indices_data); - break; - } - case MemoryFormat::Contiguous: { - const int num_threads = std::min(at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, - BLOCK_THREADS); - max_pool_forward_nchw - <<>>( - count, input_data, - nbatch, nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, - kH, kW, dH, dW, padH, padW, dilationH, dilationW, - output_data, indices_data); - break; - } - default: TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous"); + using accscalar_t = acc_type; + + scalar_t *output_data = output.data_ptr(); + scalar_t *input_data = input.data_ptr(); + int64_t *indices_data = indices.data_ptr(); + + switch (memory_format) { + case MemoryFormat::ChannelsLast: { + const int max_threads = std::min( + at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, CUDA_MAX_THREADS); + int* maxThreadsDim = at::cuda::getCurrentDeviceProperties()->maxThreadsDim; + int block_x = std::min( + maxThreadsDim[0], std::min(lastPow2(nInputPlane), at::cuda::warp_size())); + int block_y = std::min( + maxThreadsDim[1], std::min(lastPow2(outputWidth), max_threads / block_x)); + int block_z = std::min( + maxThreadsDim[2], std::min(lastPow2(outputHeight), max_threads / block_x / block_y)); + block_x = std::min( + maxThreadsDim[0], std::min(lastPow2(nInputPlane), max_threads / block_y / block_z)); + const dim3 block(block_x, block_y, block_z); + + int kernel_stride_C = cuda::ATenCeilDiv( + safe_downcast(nInputPlane), block_x * 4); + int kernel_size_C = cuda::ATenCeilDiv( + safe_downcast(nInputPlane), block_x * kernel_stride_C); + + int grid_x = nbatch*kernel_stride_C; + int grid_y = std::min( + at::cuda::getCurrentDeviceProperties()->maxGridSize[1], + cuda::ATenCeilDiv(safe_downcast(outputWidth), block_y*BLOCK_STRIDE)); + int grid_z = std::min( + at::cuda::getCurrentDeviceProperties()->maxGridSize[2], + cuda::ATenCeilDiv(safe_downcast(outputHeight), block_z*BLOCK_STRIDE)); + const dim3 grid(grid_x, grid_y, grid_z); + + size_t shmem_size = (kernel_size_C * block_x*block_y*block_z) * (sizeof(int) + sizeof(scalar_t)); + AT_ASSERT(shmem_size <= at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock); + + max_pool_forward_nhwc + <<>>( + input_data, nbatch, + nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, + kH, kW, dH, dW, padH, padW, dilationH, dilationW, + in_stride_n, in_stride_c, + in_stride_h, in_stride_w, + kernel_stride_C, kernel_size_C, + output_data, indices_data); + break; } - }); + case MemoryFormat::Contiguous: { + const int num_threads = std::min(at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, + BLOCK_THREADS); + max_pool_forward_nchw + <<>>( + count, input_data, + nbatch, nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, + kH, kW, dH, dW, padH, padW, dilationH, dilationW, + output_data, indices_data); + break; + } + default: TORCH_CHECK(false, "Unsupported memory format. 
Supports only ChannelsLast, Contiguous"); + } } ); @@ -532,88 +530,86 @@ void max_pool2d_with_indices_backward_out_cuda_template( AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), "max_pool2d_with_indices_out_cuda_frame", [&] { - AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "max_pool2d_with_indices_out_cuda_frame", [&] { - using accscalar_t = acc_type; - - scalar_t *gradOutput_data = gradOutput.data_ptr(); - scalar_t *gradInput_data = gradInput.data_ptr(); - int64_t *indices_data = indices.data_ptr(); - - switch (memory_format) { - case MemoryFormat::ChannelsLast: { - const int max_threads = std::min(at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, CUDA_MAX_THREADS); - int* maxThreadsDim = at::cuda::getCurrentDeviceProperties()->maxThreadsDim; - int block_x = std::min( - maxThreadsDim[0], std::min(lastPow2(nInputPlane), at::cuda::warp_size())); - int block_y = std::min( - maxThreadsDim[1], std::min(lastPow2(inputWidth), max_threads / block_x)); - int block_z = std::min( - maxThreadsDim[2], std::min(lastPow2(inputHeight), max_threads / block_x / block_y)); - block_x = std::min( - maxThreadsDim[0], std::min(lastPow2(nInputPlane), max_threads / block_y / block_z)); - const dim3 block(block_x, block_y, block_z); - - int kernel_stride_C = cuda::ATenCeilDiv( - safe_downcast(nInputPlane), block_x * 4); - int kernel_size_C = cuda::ATenCeilDiv( - safe_downcast(nInputPlane), block_x * kernel_stride_C); - - int grid_x = nbatch*kernel_stride_C; - int grid_y = std::min( - at::cuda::getCurrentDeviceProperties()->maxGridSize[1], - cuda::ATenCeilDiv(safe_downcast(inputWidth), block_y*BLOCK_STRIDE)); - int grid_z = std::min( - at::cuda::getCurrentDeviceProperties()->maxGridSize[2], - cuda::ATenCeilDiv(safe_downcast(inputHeight), block_z*BLOCK_STRIDE)); - const dim3 grid(grid_x, grid_y, grid_z); - - size_t shmem_size = (kernel_size_C * block_x*block_y*block_z) * sizeof(accscalar_t); - AT_ASSERT(shmem_size <= at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock); - - // The backward kernel is launched on input instead output. - // If it is launched on output layer, atomic_add would not provide much benefit on FP16. - // Please check comments at https://github.com/pytorch/pytorch/pull/34519. - max_pool_backward_nhwc - <<>>( - count, - gradOutput_data, - indices_data, - nbatch, - nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, - kH, kW, dH, dW, padH, padW, dilationH, dilationW, - out_stride_c, out_stride_h, out_stride_w, - in_stride_n, in_stride_c, - in_stride_h, in_stride_w, - kernel_stride_C, kernel_size_C, - gradInput_data); - break; - } - case MemoryFormat::Contiguous: { - int imgcount = inputWidth * inputHeight; - dim3 grid; - const int blocks = (imgcount + BLOCK_THREADS - 1) / BLOCK_THREADS; - grid.x = blocks; - grid.y = nbatch; - uint64_t maxGridY = at::cuda::getCurrentDeviceProperties()->maxGridSize[1]; - if (maxGridY < grid.y) grid.y = maxGridY; - grid.z = nInputPlane; - uint64_t maxGridZ = at::cuda::getCurrentDeviceProperties()->maxGridSize[2]; - if (maxGridZ < grid.z) grid.z = maxGridZ; - - max_pool_backward_nchw - <<>>( - count, - gradOutput_data, - indices_data, - nbatch, - nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, - kH, kW, dH, dW, padH, padW, dilationH, dilationW, - gradInput_data); - break; - } - default: TORCH_CHECK(false, "Unsupported memory format. 
Supports only ChannelsLast, Contiguous"); + using accscalar_t = acc_type; + + scalar_t *gradOutput_data = gradOutput.data_ptr(); + scalar_t *gradInput_data = gradInput.data_ptr(); + int64_t *indices_data = indices.data_ptr(); + + switch (memory_format) { + case MemoryFormat::ChannelsLast: { + const int max_threads = std::min(at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, CUDA_MAX_THREADS); + int* maxThreadsDim = at::cuda::getCurrentDeviceProperties()->maxThreadsDim; + int block_x = std::min( + maxThreadsDim[0], std::min(lastPow2(nInputPlane), at::cuda::warp_size())); + int block_y = std::min( + maxThreadsDim[1], std::min(lastPow2(inputWidth), max_threads / block_x)); + int block_z = std::min( + maxThreadsDim[2], std::min(lastPow2(inputHeight), max_threads / block_x / block_y)); + block_x = std::min( + maxThreadsDim[0], std::min(lastPow2(nInputPlane), max_threads / block_y / block_z)); + const dim3 block(block_x, block_y, block_z); + + int kernel_stride_C = cuda::ATenCeilDiv( + safe_downcast(nInputPlane), block_x * 4); + int kernel_size_C = cuda::ATenCeilDiv( + safe_downcast(nInputPlane), block_x * kernel_stride_C); + + int grid_x = nbatch*kernel_stride_C; + int grid_y = std::min( + at::cuda::getCurrentDeviceProperties()->maxGridSize[1], + cuda::ATenCeilDiv(safe_downcast(inputWidth), block_y*BLOCK_STRIDE)); + int grid_z = std::min( + at::cuda::getCurrentDeviceProperties()->maxGridSize[2], + cuda::ATenCeilDiv(safe_downcast(inputHeight), block_z*BLOCK_STRIDE)); + const dim3 grid(grid_x, grid_y, grid_z); + + size_t shmem_size = (kernel_size_C * block_x*block_y*block_z) * sizeof(accscalar_t); + AT_ASSERT(shmem_size <= at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock); + + // The backward kernel is launched on input instead output. + // If it is launched on output layer, atomic_add would not provide much benefit on FP16. + // Please check comments at https://github.com/pytorch/pytorch/pull/34519. + max_pool_backward_nhwc + <<>>( + count, + gradOutput_data, + indices_data, + nbatch, + nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, + kH, kW, dH, dW, padH, padW, dilationH, dilationW, + out_stride_c, out_stride_h, out_stride_w, + in_stride_n, in_stride_c, + in_stride_h, in_stride_w, + kernel_stride_C, kernel_size_C, + gradInput_data); + break; } - }); + case MemoryFormat::Contiguous: { + int imgcount = inputWidth * inputHeight; + dim3 grid; + const int blocks = (imgcount + BLOCK_THREADS - 1) / BLOCK_THREADS; + grid.x = blocks; + grid.y = nbatch; + uint64_t maxGridY = at::cuda::getCurrentDeviceProperties()->maxGridSize[1]; + if (maxGridY < grid.y) grid.y = maxGridY; + grid.z = nInputPlane; + uint64_t maxGridZ = at::cuda::getCurrentDeviceProperties()->maxGridSize[2]; + if (maxGridZ < grid.z) grid.z = maxGridZ; + + max_pool_backward_nchw + <<>>( + count, + gradOutput_data, + indices_data, + nbatch, + nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, + kH, kW, dH, dW, padH, padW, dilationH, dilationW, + gradInput_data); + break; + } + default: TORCH_CHECK(false, "Unsupported memory format. 
Supports only ChannelsLast, Contiguous"); + } } ); diff --git a/aten/src/ATen/native/cuda/DilatedMaxPool3d.cu b/aten/src/ATen/native/cuda/DilatedMaxPool3d.cu index 2b0ba37c8880..9d72e0027007 100644 --- a/aten/src/ATen/native/cuda/DilatedMaxPool3d.cu +++ b/aten/src/ATen/native/cuda/DilatedMaxPool3d.cu @@ -276,20 +276,18 @@ void max_pool3d_with_indices_out_cuda_template( input.scalar_type(), "max_pool3d_with_indices_out_frame", [&]{ - AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "max_pool3d_with_indices_out_frame", [&] { - scalar_t *input_data = work_input.data_ptr(); - int64_t totalZ = otime * nslices * nbatch; - - max_pool3d_with_indices_out_frame( - input_data, work_output, work_indices, - totalZ, - itime, iheight, iwidth, - otime, oheight, owidth, - kT, kH, kW, - dT, dH, dW, - pT, pH, pW, - dilationT, dilationH, dilationW); - }); + scalar_t *input_data = work_input.data_ptr(); + int64_t totalZ = otime * nslices * nbatch; + + max_pool3d_with_indices_out_frame( + input_data, work_output, work_indices, + totalZ, + itime, iheight, iwidth, + otime, oheight, owidth, + kT, kH, kW, + dT, dH, dW, + pT, pH, pW, + dilationT, dilationH, dilationW); } ); } @@ -387,19 +385,17 @@ void max_pool3d_with_indices_backward_out_cuda_template( AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), "max_pool3d_with_indices_backward_out_frame", [&] { - AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "max_pool3d_with_indices_backward_out_frame", [&] { - const int64_t totalZ = otime * nslices * nbatch; - scalar_t *grad_input_data = work_grad_input.data_ptr(); - - max_pool3d_with_indices_backward_out_frame( - grad_input_data, work_grad_output, work_indices, - totalZ, - itime, iheight, iwidth, - oheight, owidth, - dT, dH, dW, - pT, pH, pW, - dilationT, dilationH, dilationW); - }); + const int64_t totalZ = otime * nslices * nbatch; + scalar_t *grad_input_data = work_grad_input.data_ptr(); + + max_pool3d_with_indices_backward_out_frame( + grad_input_data, work_grad_output, work_indices, + totalZ, + itime, iheight, iwidth, + oheight, owidth, + dT, dH, dW, + pT, pH, pW, + dilationT, dilationH, dilationW); } ); } diff --git a/aten/src/ATen/native/cuda/DistanceKernel.cu b/aten/src/ATen/native/cuda/DistanceKernel.cu index 385cac5c79e8..c43a2ae9877e 100644 --- a/aten/src/ATen/native/cuda/DistanceKernel.cu +++ b/aten/src/ATen/native/cuda/DistanceKernel.cu @@ -50,7 +50,9 @@ struct dists { // Special case backward when p is less than two struct lt_two { - static __forceinline__ __device__ scalar_t backward(const scalar_t diff, const scalar_t grad, const scalar_t dist, const scalar_t p) { return dist == 0.0 ? 0 : sign(diff) * std::pow(std::abs(diff), p - 1) * grad / std::pow(dist, p - 1); } + static __forceinline__ __device__ scalar_t backward(const scalar_t diff, const scalar_t grad, const scalar_t dist, const scalar_t p) { + return (dist == 0.0 || (diff == 0.0 && p < 1)) ? 
0 : (sign(diff) * std::pow(std::abs(diff), p - 1) * grad / std::pow(dist, p - 1)); + } }; // Two norm diff --git a/aten/src/ATen/native/cuda/FillKernel.cu b/aten/src/ATen/native/cuda/FillKernel.cu index 7376ecfa6394..e4fe4b68f2eb 100644 --- a/aten/src/ATen/native/cuda/FillKernel.cu +++ b/aten/src/ATen/native/cuda/FillKernel.cu @@ -6,12 +6,19 @@ namespace at { namespace native { +template +struct FillFunctor { + FillFunctor(scalar_t v): value(v) {} + __device__ __forceinline__ scalar_t operator() () const { + return value; + } + private: + scalar_t value; +}; + void fill_kernel_cuda(TensorIterator& iter, Scalar value) { AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(at::ScalarType::Bool, at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "fill_cuda", [&]() { - auto value_converted = value.to(); - gpu_kernel(iter, [value_converted]GPU_LAMBDA() -> scalar_t { - return value_converted; - }); + gpu_kernel(iter, FillFunctor(value.to())); }); } diff --git a/aten/src/ATen/native/cuda/ForeachBinaryOpList.cu b/aten/src/ATen/native/cuda/ForeachBinaryOpList.cu index 239859b9138c..cc01bb030cf4 100644 --- a/aten/src/ATen/native/cuda/ForeachBinaryOpList.cu +++ b/aten/src/ATen/native/cuda/ForeachBinaryOpList.cu @@ -6,8 +6,9 @@ namespace at { namespace native { template class Op> std::vector foreach_tensor_list_op(TensorList tensors1, TensorList tensors2, Scalar alpha = 1) { - std::vector> tensor_lists; + std::vector> tensor_lists; std::vector vec_res; + vec_res.reserve(tensors1.size()); for (const auto& t: tensors1) { vec_res.emplace_back(at::native::empty_like(t)); } @@ -17,7 +18,11 @@ std::vector foreach_tensor_list_op(TensorList tensors1, TensorList tenso tensor_lists.emplace_back(std::move(vec_res)); AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kBFloat16, kHalf, tensors1[0].scalar_type(), "foreach_binary_op_list_cuda", [&]() { - multi_tensor_apply<3>(tensor_lists, BinaryOpListAlphaFunctor(), alpha.to()); + using opmath_t = get_opmath_t::opmath_t; + multi_tensor_apply<3>(tensor_lists, + BinaryOpListAlphaFunctor(), + Op(), + alpha.to()); }); return tensor_lists[2]; @@ -25,12 +30,16 @@ std::vector foreach_tensor_list_op(TensorList tensors1, TensorList tenso template class Op> void foreach_tensor_list_op_(TensorList tensors1, TensorList tensors2, Scalar alpha = 1) { - std::vector> tensor_lists; + std::vector> tensor_lists; tensor_lists.emplace_back(tensors1.vec()); tensor_lists.emplace_back(tensors2.vec()); AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kBFloat16, kHalf, tensors1[0].scalar_type(), "foreach_binary_op_list_cuda_", [&]() { - multi_tensor_apply<2>(tensor_lists, BinaryOpListAlphaFunctor_(), alpha.to()); + using opmath_t = get_opmath_t::opmath_t; + multi_tensor_apply<2>(tensor_lists, + BinaryOpListAlphaFunctor_(), + Op(), + alpha.to()); }); } diff --git a/aten/src/ATen/native/cuda/ForeachBinaryOpScalar.cu b/aten/src/ATen/native/cuda/ForeachBinaryOpScalar.cu index 215410bbc2a5..71180785eb48 100644 --- a/aten/src/ATen/native/cuda/ForeachBinaryOpScalar.cu +++ b/aten/src/ATen/native/cuda/ForeachBinaryOpScalar.cu @@ -8,8 +8,9 @@ template class Op> std::vector foreach_binary_op(TensorList tensors, Scalar scalar) { check_foreach_api_restrictions(tensors); - std::vector> tensor_lists; + std::vector> tensor_lists; std::vector vec_res; + vec_res.reserve(tensors.size()); for (const auto& t: tensors) { vec_res.emplace_back(at::native::empty_like(t)); } @@ -18,7 +19,11 @@ std::vector foreach_binary_op(TensorList tensors, Scalar scalar) { tensor_lists.emplace_back(std::move(vec_res)); 
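// Illustrative aside (not part of this patch): the dispatch below converts the
// scalar to an "op math" type before handing it to multi_tensor_apply, so that
// Half/BFloat16 inputs are combined in float and only the final result is cast
// back to the storage type (the trait is get_opmath_t in ForeachFunctors.cuh).
// A standalone sketch of that promotion pattern; HalfLike is a placeholder type
// invented here purely for illustration:
struct HalfLike {
  float value;  // stored as float only to keep the sketch short
  explicit HalfLike(float v) : value(v) {}
  explicit operator float() const { return value; }
};

template <typename T> struct opmath_of { using type = T; };
template <> struct opmath_of<HalfLike> { using type = float; };  // do the math in float

template <typename T, typename Op>
T binary_op_promoted(T a, T b, Op op) {
  using opmath_t = typename opmath_of<T>::type;
  return static_cast<T>(op(static_cast<opmath_t>(a), static_cast<opmath_t>(b)));
}

// e.g. binary_op_promoted(HalfLike{1.5f}, HalfLike{2.25f},
//                         [](float x, float y) { return x + y; });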
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kBFloat16, kHalf, tensors[0].scalar_type(), "foreach_binary_op_scalar_cuda", [&]() { - multi_tensor_apply<2>(tensor_lists, BinaryOpScalarFunctor(), scalar.to()); + using opmath_t = get_opmath_t::opmath_t; + multi_tensor_apply<2>(tensor_lists, + BinaryOpScalarFunctor(), + Op(), + scalar.to()); }); return tensor_lists[1]; } @@ -27,11 +32,15 @@ template class Op> void foreach_binary_op_(TensorList tensors, Scalar scalar) { check_foreach_api_restrictions(tensors); - std::vector> tensor_lists; + std::vector> tensor_lists; tensor_lists.emplace_back(tensors.vec()); AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kBFloat16, kHalf, tensors[0].scalar_type(), "foreach_binary_op_scalar_cuda_", [&]() { - multi_tensor_apply<1>(tensor_lists, BinaryOpScalarFunctor_(), scalar.to()); + using opmath_t = get_opmath_t::opmath_t; + multi_tensor_apply<1>(tensor_lists, + BinaryOpScalarFunctor_(), + Op(), + scalar.to()); }); } diff --git a/aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu b/aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu new file mode 100644 index 000000000000..60f2bb737bf7 --- /dev/null +++ b/aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu @@ -0,0 +1,69 @@ +#include +#include +#include + +namespace at { namespace native { + +template class Op> +std::vector foreach_binary_op(TensorList tensors, at::ArrayRef scalars) { + std::vector> tensor_lists; + std::vector vec_res; + vec_res.reserve(tensors.size()); + for (const auto& t: tensors) { + vec_res.emplace_back(at::native::empty_like(t)); + } + + tensor_lists.emplace_back(tensors.vec()); + tensor_lists.emplace_back(vec_res); + + AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, kHalf, tensors[0].scalar_type(), "foreach_binary_op_scalarlist_cuda", [&]() { + using opmath_t = get_opmath_t::opmath_t; + multi_tensor_apply<2>(tensor_lists, + scalars, + BinaryOpScalarListFunctor(), + Op()); + }); + return tensor_lists[1]; +} + +template class Op> +void foreach_binary_op_(TensorList tensors, at::ArrayRef scalars) { + std::vector> tensor_lists; + tensor_lists.emplace_back(tensors.vec()); + + AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, kHalf, tensors[0].scalar_type(), "foreach_binary_op_scalarlist_cuda_", [&]() { + using opmath_t = get_opmath_t::opmath_t; + multi_tensor_apply<1>(tensor_lists, + scalars, + BinaryOpScalarListFunctor_(), + Op()); + }); +} + +#define FOREACH_BINARY_OP_SCALARLIST(NAME, OP) \ +void foreach_tensor_##NAME##_scalarlist_kernel_cuda_(TensorList tensors, at::ArrayRef scalars) { \ + check_foreach_api_restrictions(tensors); \ + \ + if (!can_use_fast_route(tensors, scalars)) { \ + return at::native::foreach_tensor_##NAME##_scalarlist_kernel_slow_(tensors, scalars); \ + } \ + \ + foreach_binary_op_(tensors, scalars); \ +} \ + \ +std::vector foreach_tensor_##NAME##_scalarlist_kernel_cuda(TensorList tensors, at::ArrayRef scalars) { \ + check_foreach_api_restrictions(tensors); \ + \ + if (!can_use_fast_route(tensors, scalars)) { \ + return at::native::foreach_tensor_##NAME##_scalarlist_kernel_slow(tensors, scalars); \ + } \ + \ + return foreach_binary_op(tensors, scalars); \ +} + +FOREACH_BINARY_OP_SCALARLIST(add, std::plus); +FOREACH_BINARY_OP_SCALARLIST(sub, std::minus); +FOREACH_BINARY_OP_SCALARLIST(mul, std::multiplies); +FOREACH_BINARY_OP_SCALARLIST(div, std::divides); + +}} // namespace at::native diff --git a/aten/src/ATen/native/cuda/ForeachFunctors.cuh b/aten/src/ATen/native/cuda/ForeachFunctors.cuh index a04d27110c9a..dd01d584f045 100644 --- a/aten/src/ATen/native/cuda/ForeachFunctors.cuh 
+++ b/aten/src/ATen/native/cuda/ForeachFunctors.cuh @@ -5,12 +5,19 @@ namespace at { namespace native { namespace { -template class Op> +// For FP16 or BFloat16 inputs, ops should perform internal math in FP32. +template struct get_opmath_t { using opmath_t = scalar_t; }; +template<> struct get_opmath_t { using opmath_t = float; }; +template<> struct get_opmath_t { using opmath_t = float; }; + +template struct BinaryOpScalarFunctor_ { - __device__ void operator() ( + using opmath_t = typename get_opmath_t::opmath_t; + template __device__ __forceinline__ void operator() ( int chunk_size, TensorListMetadata<1>& tl, - T scalar) { + Op op, + opmath_t scalar) { int tensor_loc = tl.block_to_tensor[blockIdx.x]; int chunk_idx = tl.block_to_chunk[blockIdx.x]; int n = tl.sizes[tensor_loc]; @@ -29,7 +36,8 @@ struct BinaryOpScalarFunctor_ { load_store(r_x, x, 0 , i_start); #pragma unroll for(int ii = 0; ii < kILP; ii++) { - r_x[ii] = Op()(static_cast(r_x[ii]), scalar); + r_x[ii] = static_cast(op(static_cast(r_x[ii]), + static_cast(scalar))); } // store load_store(x, r_x, i_start, 0); @@ -47,7 +55,8 @@ struct BinaryOpScalarFunctor_ { } #pragma unroll for(int ii = 0; ii < kILP; ii++) { - r_x[ii] = Op()(static_cast(r_x[ii]), scalar); + r_x[ii] = static_cast(op(static_cast(r_x[ii]), + static_cast(scalar))); } #pragma unroll for(int ii = 0; ii < kILP; ii++) { @@ -60,12 +69,14 @@ struct BinaryOpScalarFunctor_ { } }; -template class Op> +template struct BinaryOpScalarFunctor { - __device__ void operator() ( + using opmath_t = typename get_opmath_t::opmath_t; + template __device__ __forceinline__ void operator() ( int chunk_size, TensorListMetadata<2>& tl, - T scalar) { + Op op, + opmath_t scalar) { int tensor_loc = tl.block_to_tensor[blockIdx.x]; int chunk_idx = tl.block_to_chunk[blockIdx.x]; int n = tl.sizes[tensor_loc]; @@ -87,7 +98,8 @@ struct BinaryOpScalarFunctor { load_store(r_x, x, 0 , i_start); #pragma unroll for(int ii = 0; ii < kILP; ii++) { - r_x[ii] = Op()(static_cast(r_x[ii]), scalar); + r_x[ii] = static_cast(op(static_cast(r_x[ii]), + static_cast(scalar))); } // store load_store(out, r_x, i_start, 0); @@ -105,7 +117,8 @@ struct BinaryOpScalarFunctor { } #pragma unroll for(int ii = 0; ii < kILP; ii++) { - r_x[ii] = Op()(static_cast(r_x[ii]), scalar); + r_x[ii] = static_cast(op(static_cast(r_x[ii]), + static_cast(scalar))); } #pragma unroll for(int ii = 0; ii < kILP; ii++) { @@ -118,12 +131,135 @@ struct BinaryOpScalarFunctor { } }; -template class Op> +template +struct BinaryOpScalarListFunctor_ { + using io_t = T; + using opmath_t = typename get_opmath_t::opmath_t; + template __device__ __forceinline__ void operator() ( + int chunk_size, + TensorListScalarListMetadata& tl, + Op op) { + int tensor_loc = tl.block_to_tensor[blockIdx.x]; + int chunk_idx = tl.block_to_chunk[blockIdx.x]; + int n = tl.sizes[tensor_loc]; + + T* x = (T*)tl.addresses[0][tensor_loc]; + x += chunk_idx * chunk_size; + + opmath_t y = tl.scalar_vals[tensor_loc]; + + n -= chunk_idx * chunk_size; + + T r_x[kILP]; + + // to make things simple, we put aligned case in a different code path + if(n % kILP == 0 && chunk_size % kILP == 0 && is_aligned(x)) { + for(int i_start = threadIdx.x; i_start * kILP < n && i_start * kILP < chunk_size; i_start += blockDim.x) { + // load + load_store(r_x, x, 0 , i_start); +#pragma unroll + for(int ii = 0; ii < kILP; ii++) { + r_x[ii] = static_cast(op(static_cast(r_x[ii]), y)); + } + // store + load_store(x, r_x, i_start, 0); + } + } + else { + for(int i_start = 0; i_start < n && i_start < 
chunk_size; i_start += blockDim.x * kILP) { +#pragma unroll + for(int ii = 0; ii < kILP; ii++) { + r_x[ii] = 0; + int i = i_start + threadIdx.x + ii * blockDim.x; + if(i < n && i < chunk_size) { + r_x[ii] = x[i]; + } + } +#pragma unroll + for(int ii = 0; ii < kILP; ii++) { + r_x[ii] = static_cast(op(static_cast(r_x[ii]), y)); + } +#pragma unroll + for(int ii = 0; ii < kILP; ii++) { + int i = i_start + threadIdx.x + ii * blockDim.x; + if(i < n && i < chunk_size) + x[i] = r_x[ii]; + } + } + } + } +}; + +template +struct BinaryOpScalarListFunctor { + using io_t = T; + using opmath_t = typename get_opmath_t::opmath_t; + template __device__ __forceinline__ void operator() ( + int chunk_size, + TensorListScalarListMetadata& tl, + Op op) { + int tensor_loc = tl.block_to_tensor[blockIdx.x]; + int chunk_idx = tl.block_to_chunk[blockIdx.x]; + int n = tl.sizes[tensor_loc]; + + T* x = (T*)tl.addresses[0][tensor_loc]; + x += chunk_idx * chunk_size; + + T* out = (T*)tl.addresses[1][tensor_loc]; + out += chunk_idx * chunk_size; + + opmath_t y = tl.scalar_vals[tensor_loc]; + + n -= chunk_idx * chunk_size; + + T r_x[kILP]; + + // to make things simple, we put aligned case in a different code path + if(n % kILP == 0 && chunk_size % kILP == 0 && is_aligned(x) && is_aligned(out)) { + for(int i_start = threadIdx.x; i_start * kILP < n && i_start * kILP < chunk_size; i_start += blockDim.x) { + // load + load_store(r_x, x, 0 , i_start); +#pragma unroll + for(int ii = 0; ii < kILP; ii++) { + r_x[ii] = static_cast(op(static_cast(r_x[ii]), y)); + } + // store + load_store(out, r_x, i_start, 0); + } + } + else { + for(int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x * kILP) { +#pragma unroll + for(int ii = 0; ii < kILP; ii++) { + r_x[ii] = 0; + int i = i_start + threadIdx.x + ii * blockDim.x; + if(i < n && i < chunk_size) { + r_x[ii] = x[i]; + } + } +#pragma unroll + for(int ii = 0; ii < kILP; ii++) { + r_x[ii] = static_cast(op(static_cast(r_x[ii]), y)); + } +#pragma unroll + for(int ii = 0; ii < kILP; ii++) { + int i = i_start + threadIdx.x + ii * blockDim.x; + if(i < n && i < chunk_size) + out[i] = r_x[ii]; + } + } + } + } +}; + +template struct BinaryOpListAlphaFunctor_ { - __device__ void operator() ( + using opmath_t = typename get_opmath_t::opmath_t; + template __device__ __forceinline__ void operator() ( int chunk_size, - TensorListMetadata<2>& tl, - T alpha) { + TensorListMetadata<2>& tl, + Op op, + opmath_t alpha) { int tensor_loc = tl.block_to_tensor[blockIdx.x]; int chunk_idx = tl.block_to_chunk[blockIdx.x]; int n = tl.sizes[tensor_loc]; @@ -147,7 +283,8 @@ struct BinaryOpListAlphaFunctor_ { load_store(r_y, y, 0 , i_start); #pragma unroll for(int ii = 0; ii < kILP; ii++) { - r_x[ii] = Op()(static_cast(r_x[ii]), alpha * static_cast(r_y[ii])); + r_x[ii] = static_cast(op(static_cast(r_x[ii]), + alpha * static_cast(r_y[ii]))); } // store load_store(x, r_x, i_start , 0); @@ -167,7 +304,8 @@ struct BinaryOpListAlphaFunctor_ { } #pragma unroll for(int ii = 0; ii < kILP; ii++) { - r_x[ii] = Op()(static_cast(r_x[ii]), alpha * static_cast(r_y[ii])); + r_x[ii] = static_cast(op(static_cast(r_x[ii]), + alpha * static_cast(r_y[ii]))); } #pragma unroll for(int ii = 0; ii < kILP; ii++) { @@ -180,12 +318,14 @@ struct BinaryOpListAlphaFunctor_ { } }; -template class Op> +template struct BinaryOpListAlphaFunctor { - __device__ void operator() ( + using opmath_t = typename get_opmath_t::opmath_t; + template __device__ __forceinline__ void operator() ( int chunk_size, TensorListMetadata<3>& tl, - T 
alpha) { + Op op, + opmath_t alpha) { int tensor_loc = tl.block_to_tensor[blockIdx.x]; int chunk_idx = tl.block_to_chunk[blockIdx.x]; int n = tl.sizes[tensor_loc]; @@ -212,7 +352,8 @@ struct BinaryOpListAlphaFunctor { load_store(r_y, y, 0 , i_start); #pragma unroll for(int ii = 0; ii < kILP; ii++) { - r_x[ii] = Op()(static_cast(r_x[ii]), alpha * static_cast(r_y[ii])); + r_x[ii] = static_cast(op(static_cast(r_x[ii]), + alpha * static_cast(r_y[ii]))); } // store load_store(out, r_x, i_start , 0); @@ -232,7 +373,8 @@ struct BinaryOpListAlphaFunctor { } #pragma unroll for(int ii = 0; ii < kILP; ii++) { - r_x[ii] = Op()(static_cast(r_x[ii]), alpha * static_cast(r_y[ii])); + r_x[ii] = static_cast(op(static_cast(r_x[ii]), + alpha * static_cast(r_y[ii]))); } #pragma unroll for(int ii = 0; ii < kILP; ii++) { @@ -245,11 +387,13 @@ struct BinaryOpListAlphaFunctor { } }; -template class Op> +template struct UnaryOpFunctor_ { - __device__ void operator() ( + using opmath_t = typename get_opmath_t::opmath_t; + template __device__ __forceinline__ void operator() ( int chunk_size, - TensorListMetadata<1>& tl) { + TensorListMetadata<1>& tl, + Op op) { int tensor_loc = tl.block_to_tensor[blockIdx.x]; int chunk_idx = tl.block_to_chunk[blockIdx.x]; int n = tl.sizes[tensor_loc]; @@ -268,7 +412,7 @@ struct UnaryOpFunctor_ { load_store(r_x, x, 0 , i_start); #pragma unroll for(int ii = 0; ii < kILP; ii++) { - r_x[ii] = Op()(static_cast(r_x[ii])); + r_x[ii] = static_cast(op(static_cast(r_x[ii]))); } // store load_store(x, r_x, i_start, 0); @@ -286,7 +430,7 @@ struct UnaryOpFunctor_ { } #pragma unroll for(int ii = 0; ii < kILP; ii++) { - r_x[ii] = Op()(static_cast(r_x[ii])); + r_x[ii] = static_cast(op(static_cast(r_x[ii]))); } #pragma unroll for(int ii = 0; ii < kILP; ii++) { @@ -299,11 +443,13 @@ struct UnaryOpFunctor_ { } }; -template class Op> +template struct UnaryOpFunctor { - __device__ void operator() ( + using opmath_t = typename get_opmath_t::opmath_t; + template __device__ __forceinline__ void operator() ( int chunk_size, - TensorListMetadata<2>& tl) { + TensorListMetadata<2>& tl, + Op op) { int tensor_loc = tl.block_to_tensor[blockIdx.x]; int chunk_idx = tl.block_to_chunk[blockIdx.x]; int n = tl.sizes[tensor_loc]; @@ -325,7 +471,7 @@ struct UnaryOpFunctor { load_store(r_x, x, 0 , i_start); #pragma unroll for(int ii = 0; ii < kILP; ii++) { - r_x[ii] = Op()(static_cast(r_x[ii])); + r_x[ii] = static_cast(op(static_cast(r_x[ii]))); } // store load_store(out, r_x, i_start, 0); @@ -343,7 +489,7 @@ struct UnaryOpFunctor { } #pragma unroll for(int ii = 0; ii < kILP; ii++) { - r_x[ii] = Op()(static_cast(r_x[ii])); + r_x[ii] = static_cast(op(static_cast(r_x[ii]))); } #pragma unroll for(int ii = 0; ii < kILP; ii++) { @@ -356,12 +502,14 @@ struct UnaryOpFunctor { } }; -template class Op> +template struct PointwiseOpFunctor_ { - __device__ void operator() ( + using opmath_t = typename get_opmath_t::opmath_t; + template __device__ __forceinline__ void operator() ( int chunk_size, TensorListMetadata<3>& tl, - T scalar) { + Op op, + opmath_t scalar) { int tensor_loc = tl.block_to_tensor[blockIdx.x]; int chunk_idx = tl.block_to_chunk[blockIdx.x]; int n = tl.sizes[tensor_loc]; @@ -390,7 +538,9 @@ struct PointwiseOpFunctor_ { load_store(r_z, z, 0 , i_start); #pragma unroll for(int ii = 0; ii < kILP; ii++) { - r_x[ii] = static_cast(r_x[ii]) + scalar * Op()(static_cast(r_y[ii]), static_cast(r_z[ii])); + r_x[ii] = static_cast(static_cast(r_x[ii]) + + scalar * op(static_cast(r_y[ii]), + static_cast(r_z[ii]))); } // store 
load_store(x, r_x, i_start, 0); @@ -412,7 +562,9 @@ struct PointwiseOpFunctor_ { } #pragma unroll for(int ii = 0; ii < kILP; ii++) { - r_x[ii] = static_cast(r_x[ii]) + scalar * Op()(static_cast(r_y[ii]), static_cast(r_z[ii])); + r_x[ii] = static_cast(static_cast(r_x[ii]) + + scalar * op(static_cast(r_y[ii]), + static_cast(r_z[ii]))); } #pragma unroll for(int ii = 0; ii < kILP; ii++) { @@ -425,12 +577,14 @@ struct PointwiseOpFunctor_ { } }; -template class Op> +template struct PointwiseOpFunctor { - __device__ void operator() ( + using opmath_t = typename get_opmath_t::opmath_t; + template __device__ __forceinline__ void operator() ( int chunk_size, TensorListMetadata<4>& tl, - T scalar) { + Op op, + opmath_t scalar) { int tensor_loc = tl.block_to_tensor[blockIdx.x]; int chunk_idx = tl.block_to_chunk[blockIdx.x]; int n = tl.sizes[tensor_loc]; @@ -462,7 +616,9 @@ struct PointwiseOpFunctor { load_store(r_z, z, 0 , i_start); #pragma unroll for(int ii = 0; ii < kILP; ii++) { - r_x[ii] = static_cast(r_x[ii]) + scalar * Op()(static_cast(r_y[ii]), static_cast(r_z[ii])); + r_x[ii] = static_cast(static_cast(r_x[ii]) + + scalar * op(static_cast(r_y[ii]), + static_cast(r_z[ii]))); } // store load_store(out, r_x, i_start, 0); @@ -485,7 +641,9 @@ struct PointwiseOpFunctor { } #pragma unroll for(int ii = 0; ii < kILP; ii++) { - r_x[ii] = static_cast(r_x[ii]) + scalar * Op()(static_cast(r_y[ii]), static_cast(r_z[ii])); + r_x[ii] = static_cast(static_cast(r_x[ii]) + + scalar * op(static_cast(r_y[ii]), + static_cast(r_z[ii]))); } #pragma unroll for(int ii = 0; ii < kILP; ii++) { diff --git a/aten/src/ATen/native/cuda/ForeachPointwiseOp.cu b/aten/src/ATen/native/cuda/ForeachPointwiseOp.cu index b514f3294c52..7ce2fc566110 100644 --- a/aten/src/ATen/native/cuda/ForeachPointwiseOp.cu +++ b/aten/src/ATen/native/cuda/ForeachPointwiseOp.cu @@ -6,8 +6,9 @@ namespace at { namespace native { template class Op> std::vector foreach_pointwise_op(TensorList input, TensorList tensors1, TensorList tensors2, Scalar scalar) { - std::vector> tensor_lists; + std::vector> tensor_lists; std::vector vec_res; + vec_res.reserve(input.size()); for (const auto& t: input) { vec_res.emplace_back(at::native::empty_like(t)); } @@ -18,7 +19,11 @@ std::vector foreach_pointwise_op(TensorList input, TensorList tensors1, tensor_lists.emplace_back(std::move(vec_res)); AT_DISPATCH_ALL_TYPES_AND(kHalf, input[0].scalar_type(), "foreach_pointwise_op_cuda", [&]() { - multi_tensor_apply<4>(tensor_lists, PointwiseOpFunctor(), scalar.to()); + using opmath_t = get_opmath_t::opmath_t; + multi_tensor_apply<4>(tensor_lists, + PointwiseOpFunctor(), + Op(), + scalar.to()); }); return tensor_lists[3]; @@ -26,13 +31,17 @@ std::vector foreach_pointwise_op(TensorList input, TensorList tensors1, template class Op> void foreach_pointwise_op_(TensorList input, TensorList tensors1, TensorList tensors2, Scalar scalar) { - std::vector> tensor_lists; + std::vector> tensor_lists; tensor_lists.emplace_back(input.vec()); tensor_lists.emplace_back(tensors1.vec()); tensor_lists.emplace_back(tensors2.vec()); AT_DISPATCH_ALL_TYPES_AND(kHalf, input[0].scalar_type(), "foreach_pointwise_op__cuda", [&]() { - multi_tensor_apply<3>(tensor_lists, PointwiseOpFunctor_(), scalar.to()); + using opmath_t = get_opmath_t::opmath_t; + multi_tensor_apply<3>(tensor_lists, + PointwiseOpFunctor_(), + Op(), + scalar.to()); }); } @@ -59,7 +68,7 @@ void foreach_tensor_##NAME##_cuda_(TensorList input, TensorList tensors1, Tensor if (!can_use_fast_route(input, scalar) || \ 
!can_use_fast_route(tensors1, tensors2) || \ !can_use_fast_route(input, tensors1)) { \ - at::native::foreach_tensor_##NAME##_slow_(input, tensors1, tensors2, scalar); \ + return at::native::foreach_tensor_##NAME##_slow_(input, tensors1, tensors2, scalar); \ } \ \ foreach_pointwise_op_(input, tensors1, tensors2, scalar); \ diff --git a/aten/src/ATen/native/cuda/ForeachUnaryOp.cu b/aten/src/ATen/native/cuda/ForeachUnaryOp.cu index 32bb6ab6b509..1160d64bba6d 100644 --- a/aten/src/ATen/native/cuda/ForeachUnaryOp.cu +++ b/aten/src/ATen/native/cuda/ForeachUnaryOp.cu @@ -6,8 +6,9 @@ namespace at { namespace native { template class Op> std::vector foreach_unary_op(TensorList tensors) { - std::vector> tensor_lists; + std::vector> tensor_lists; std::vector vec_res; + vec_res.reserve(tensors.size()); for (const auto& t: tensors) { vec_res.emplace_back(at::native::empty_like(t)); } @@ -16,18 +17,24 @@ std::vector foreach_unary_op(TensorList tensors) { tensor_lists.emplace_back(std::move(vec_res)); AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(ScalarType::Half, tensors[0].scalar_type(), "foreach_unary_op_cuda", [&]() { - multi_tensor_apply<2>(tensor_lists, UnaryOpFunctor()); + using opmath_t = get_opmath_t::opmath_t; + multi_tensor_apply<2>(tensor_lists, + UnaryOpFunctor(), + Op()); }); return tensor_lists[1]; } template class Op> void foreach_unary_op_(TensorList tensors) { - std::vector> tensor_lists; + std::vector> tensor_lists; tensor_lists.emplace_back(tensors.vec()); AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(ScalarType::Half, tensors[0].scalar_type(), "foreach_unary_op_cuda_", [&]() { - multi_tensor_apply<1>(tensor_lists, UnaryOpFunctor_()); + using opmath_t = get_opmath_t::opmath_t; + multi_tensor_apply<1>(tensor_lists, + UnaryOpFunctor_(), + Op()); }); } diff --git a/aten/src/ATen/native/cuda/IndexKernel.cu b/aten/src/ATen/native/cuda/IndexKernel.cu index b69267e90437..9c3eab4497aa 100644 --- a/aten/src/ATen/native/cuda/IndexKernel.cu +++ b/aten/src/ATen/native/cuda/IndexKernel.cu @@ -4,9 +4,12 @@ #include #include #include -#include +#include #include +#include +#include #include +#include namespace at { namespace native { @@ -14,6 +17,54 @@ static constexpr int launch_bound2 = 4; static constexpr int launch_size_nd = 128; +template +__device__ __forceinline__ IndexType indexToOffset( + const cuda::detail::TensorInfo& info, + int64_t index, + IndexType size) { + IndexType linearIndex = static_cast(index); + CUDA_KERNEL_ASSERT(linearIndex < size && linearIndex >= -size); + if (linearIndex < 0) { + linearIndex += size; + } + return cuda::detail::IndexToOffset::get(linearIndex, info); +} + +template +void dispatchTakePutImpl(const Tensor& input, Tensor& output, const Tensor& index) { + auto inputInfo = cuda::detail::getTensorInfo(input); + inputInfo.collapseDims(); + auto numel = input.numel(); + if (inputInfo.isContiguous()) { + cuda::CUDA_tensor_apply2( + output, + index, + [inputInfo, numel] __device__ ( + T & out, const int64_t& idx) { + auto offset = indexToOffset<-2, T, IndexType>(inputInfo, idx, numel); + out = inputInfo.data[offset]; + }); + } else { + cuda::CUDA_tensor_apply2( + output, + index, + [inputInfo, numel] __device__ ( + T & out, const int64_t& idx) { + auto offset = indexToOffset<-1, T, IndexType>(inputInfo, idx, numel); + out = inputInfo.data[offset]; + }); + } +} + +template +void dispatchTakePut(const Tensor& input, Tensor& output, const Tensor& index) { + if (cuda::detail::canUse32BitIndexMath(input)) { + dispatchTakePutImpl(input, output, index); + } else { + 
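// take() reads the input as if it were flattened and gathers input.flatten()[index], with
// Python-style negative indexing. The wrapping performed by indexToOffset above reduces to
// this host-side rule (an illustration, not ATen code):
#include <cassert>
#include <cstdint>

int64_t wrap_take_index(int64_t idx, int64_t numel) {
  assert(idx < numel && idx >= -numel);  // same bound the kernel asserts via CUDA_KERNEL_ASSERT
  return idx < 0 ? idx + numel : idx;    // e.g. wrap_take_index(-1, 5) == 4
}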
dispatchTakePutImpl(input, output, index); + } +} + template C10_LAUNCH_BOUNDS_2(nt, launch_bound2) __global__ void index_elementwise_kernel(int N, func_t f) { @@ -154,6 +205,48 @@ Tensor & masked_select_out_cuda(Tensor & result, const Tensor & self, const Tens return masked_select_out_cuda_impl(result, self, mask); } +void take_out_cuda_template(Tensor& output, const Tensor& input, const Tensor& index) { + TORCH_CHECK(output.device().type() == at::kCUDA, "device type of output (", output.device().type(), ") is not GPU"); + TORCH_CHECK(input.device().type() == at::kCUDA, "device type of input (", input.device().type(), ") is not GPU"); + TORCH_CHECK(index.device().type() == at::kCUDA, "device type of index (", index.device().type(), ") is not GPU"); + + TORCH_CHECK(output.layout() == Layout::Strided, "take() only supports strided layout, got layout: ", output.layout(), " on output tensor"); + TORCH_CHECK(input.layout() == Layout::Strided, "take() only supports strided layout, got layout: ", input.layout(), " on input tensor"); + TORCH_CHECK(index.layout() == Layout::Strided, "take() only supports strided layout, got layout: ", index.layout(), " on index tensor"); + + TORCH_CHECK(output.scalar_type() == input.scalar_type(), + "output and input scalar type must match. but got different types: ", output.scalar_type(), " and ", input.scalar_type()); + TORCH_CHECK(index.scalar_type() == kLong, "index must be an int64 tensor"); + + TensorArg output_arg{ output, "output", 1 }; + TensorArg input_arg{ input, "input", 2 }; + TensorArg index_arg{ index, "index", 3 }; + checkAllSameGPU("take", {output_arg, input_arg, index_arg}); + + TORCH_CHECK(input.dim() < MAX_CUTORCH_DIMS, CUTORCH_DIM_WARNING); + TORCH_CHECK(output.dim() < MAX_CUTORCH_DIMS, CUTORCH_DIM_WARNING); + TORCH_CHECK(index.dim() < MAX_CUTORCH_DIMS, CUTORCH_DIM_WARNING); + + TORCH_CHECK(!(input.numel() == 0 && index.numel() != 0), "tried to take from an empty tensor"); + + output.resize_(index.sizes()); + + AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Bool, at::ScalarType::Half, input.scalar_type(), "take_cuda", [&] { + dispatchTakePut(input, output, index); + }); +} + +Tensor take_cuda(const Tensor& self, const Tensor& index) { + auto out = at::empty(index.sizes(), self.options()); + take_out_cuda_template(out, self, index); + return out; +} + +Tensor& take_out_cuda(Tensor& out, const Tensor& self, const Tensor& index) { + take_out_cuda_template(out, self, index); + return out; +} + REGISTER_DISPATCH(index_stub, &index_kernel); REGISTER_DISPATCH(index_put_stub, &index_put_kernel); diff --git a/aten/src/ATen/native/cuda/LinearAlgebra.cu b/aten/src/ATen/native/cuda/LinearAlgebra.cu index c78029d6a7e0..76f5c0a99efe 100644 --- a/aten/src/ATen/native/cuda/LinearAlgebra.cu +++ b/aten/src/ATen/native/cuda/LinearAlgebra.cu @@ -178,120 +178,6 @@ Tensor& addmm__cuda(Tensor& self, const Tensor& mat1, const Tensor& mat2, return self; } -template -void addr_impl_ger_cuda(Tensor &out, const Tensor &self, - const Tensor& vec1, const Tensor& vec2, - scalar_t alpha, scalar_t beta) { - static_assert(std::is_same::value || - std::is_same::value, - "addr_impl_ger_cuda: only float and double are supported"); - if (&out != &self) { - at::native::resize_as_(out, self); - at::native::copy_(out, self); - } - if (beta == 0.0) { - at::native::zero_(out); - } - if (beta != 1.0) { - at::native::mul_(out, beta); - } - if (out.stride(0) == 1) { - at::cuda::blas::ger( - vec1.size(0), vec2.size(0), alpha, - vec1.data_ptr(), vec1.stride(0), - vec2.data_ptr(), 
vec2.stride(0), - out.data_ptr(), out.stride(1) - ); - } else if (out.stride(1) == 1) { - at::cuda::blas::ger( - vec2.size(0), vec1.size(0), alpha, - vec2.data_ptr(), vec2.stride(0), - vec1.data_ptr(), vec1.stride(0), - out.data_ptr(), out.stride(0) - ); - } else { - Tensor cr = out.clone(); - at::cuda::blas::ger( - vec2.size(0), vec1.size(0), alpha, - vec2.data_ptr(), vec2.stride(0), - vec1.data_ptr(), vec1.stride(0), - out.data_ptr(), out.stride(0) - ); - out.set_(cr); - } -} - -template -void addr_impl_cuda(Tensor &out, const Tensor &self, - const Tensor& vec1, const Tensor& vec2, - scalar_t alpha, scalar_t beta) { - // currently no Hger/SgerEx in Cublas. - Tensor vec2T = vec2.reshape({1, vec2.size(0)}); - Tensor vec1M = vec1.reshape({vec1.size(0), 1}); - addmm_out_cuda(out, self, vec1M, vec2T, beta, alpha); -} -template<> -void addr_impl_cuda(Tensor &out, const Tensor &self, - const Tensor& vec1, const Tensor& vec2, - float alpha, float beta) { - addr_impl_ger_cuda(out, self, vec1, vec2, alpha, beta); -} -template<> -void addr_impl_cuda(Tensor &out, const Tensor &self, - const Tensor& vec1, const Tensor& vec2, - double alpha, double beta) { - addr_impl_ger_cuda(out, self, vec1, vec2, alpha, beta); -} - -Tensor& addr_out_cuda(Tensor &out, const Tensor& self, - const Tensor& vec1, const Tensor& vec2, - Scalar beta, Scalar alpha) { - TORCH_CHECK(vec1.dim() == 1 && vec2.dim() == 1, - "vec1 and vec2 should be 1-dimensional vectors. Got dimensions ", - vec1.dim(), " and ", vec2.dim()); - - Tensor self_; - if (&out != &self) { - std::tie(self_) = expand_size(self, {vec1.size(0), vec2.size(0)}, "addr"); - } else { - self_ = self; - } - - TORCH_CHECK(out.device() == self_.device() && - out.device() == vec1.device() && - out.device() == vec2.device(), - "Expected all tensors to be on the same device. 
Found: ", - out.device(), ", ", self_.device(), ", ", - vec1.device(), " and ", vec2.device()); - TORCH_CHECK(self_.dim() == 2, - "2D tensor expected, got ", self_.dim(), "D tensor for input"); - TORCH_CHECK(self_.size(0) == vec1.size(0) && self_.size(1) == vec2.size(0), - "size mismatch", - ", input: ", self_.sizes(), - ", v1: ", vec1.sizes(), - ", v2: ", vec2.sizes()); - AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, self_.scalar_type(), "addr_out_cuda", [&] { - addr_impl_cuda(out, self_, vec1, vec2, - alpha.to(), beta.to()); - }); - return out; -} - -Tensor& addr__cuda(Tensor& self, - const Tensor& vec1, const Tensor& vec2, - Scalar beta, Scalar alpha) { - addr_out_cuda(self, self, vec1, vec2, beta, alpha); - return self; -} - -Tensor addr_cuda(const Tensor& self, - const Tensor& vec1, const Tensor& vec2, - Scalar beta, Scalar alpha) { - Tensor out = at::empty({0}, self.options()); - addr_out_cuda(out, self, vec1, vec2, beta, alpha); - return out; -} - Tensor& addbmm_out_cuda(Tensor& out, const Tensor& self, const Tensor& batch1, const Tensor& batch2, Scalar beta, Scalar alpha) { diff --git a/aten/src/ATen/native/cuda/Loops.cuh b/aten/src/ATen/native/cuda/Loops.cuh index bb913dc0ec9e..412f6b70c2c5 100644 --- a/aten/src/ATen/native/cuda/Loops.cuh +++ b/aten/src/ATen/native/cuda/Loops.cuh @@ -140,7 +140,6 @@ struct BUnaryFunctor { template void gpu_kernel_with_scalars(TensorIterator& iter, const func_t& f) { - ASSERT_HOST_DEVICE_LAMBDA(func_t); TORCH_INTERNAL_ASSERT(iter.ntensors() == 3); using traits = function_traits; diff --git a/aten/src/ATen/native/cuda/MultiTensorApply.cuh b/aten/src/ATen/native/cuda/MultiTensorApply.cuh index f82a0d9a58c8..f0f8f97fabb1 100644 --- a/aten/src/ATen/native/cuda/MultiTensorApply.cuh +++ b/aten/src/ATen/native/cuda/MultiTensorApply.cuh @@ -26,6 +26,7 @@ __device__ __forceinline__ void load_store(T* dst, T* src, int dst_offset, int s // TensorListMetadata has to be < 4KB - the limit for kernel launch argument static constexpr int depth_to_max_tensors[5] = {110, 64, 48, 36, 30}; static constexpr int depth_to_max_blocks[5] = {320, 320, 320, 320, 320}; +static constexpr int depth_to_max_tensors_scalarlist[5] = {96, 64, 48, 36, 30}; template struct TensorListMetadata { @@ -35,25 +36,95 @@ template struct TensorListMetadata int block_to_chunk[depth_to_max_blocks[n-1]]; }; +template struct TensorListScalarListMetadata +{ + void* addresses[n][depth_to_max_tensors_scalarlist[n-1]]; + int sizes[depth_to_max_tensors_scalarlist[n-1]]; + scalar_vals_t scalar_vals[depth_to_max_tensors_scalarlist[n-1]]; + unsigned char block_to_tensor[depth_to_max_blocks[n-1]]; + int block_to_chunk[depth_to_max_blocks[n-1]]; +}; + template C10_LAUNCH_BOUNDS_1(kBlockSize) -__global__ void +__global__ void multi_tensor_apply_kernel( T tensorListMeta, U callable, ArgTypes... args) { // Hand the chunk information to the user-supplied functor to process however it likes. - callable(kChunkSize, tensorListMeta, args...); + callable(kChunkSize, tensorListMeta, args...); } template void multi_tensor_apply( std::vector>& tensor_lists, + at::ArrayRef scalars, T callable, ArgTypes... 
args) { TORCH_CHECK(tensor_lists.size() == depth, "Number of tensor lists has to match the depth."); const cuda::OptionalCUDAGuard device_guard(device_of(tensor_lists[0][0])); + size_t n_tensors = tensor_lists[0].size(); + using scalar_vals_t = typename T::opmath_t; + TensorListScalarListMetadata tensorListMeta; + + int loc_block_info = 0; + int loc_tensor_info = 0; + for(size_t t = 0; t < n_tensors; t++) { + + tensorListMeta.scalar_vals[loc_tensor_info] = scalars[t]; + + tensorListMeta.sizes[loc_tensor_info] = tensor_lists[0][t].numel(); + for (int d = 0; d < depth; d++) { + tensorListMeta.addresses[d][loc_tensor_info] = tensor_lists[d][t].data_ptr(); + } + loc_tensor_info++; + + int chunks = (tensor_lists[0][t].numel() + kChunkSize - 1)/kChunkSize; + for (int chunk = 0; chunk < chunks; chunk++) { + tensorListMeta.block_to_tensor[loc_block_info] = loc_tensor_info - 1; + tensorListMeta.block_to_chunk[loc_block_info] = chunk; + loc_block_info++; + bool tensors_full = (loc_tensor_info == depth_to_max_tensors_scalarlist[depth-1] && + chunk == chunks - 1); + bool blocks_full = (loc_block_info == depth_to_max_blocks[depth-1]); + bool last_chunk = (t == n_tensors - 1 && chunk == chunks - 1); + + if (tensors_full || blocks_full || last_chunk) { + multi_tensor_apply_kernel<<>>( + tensorListMeta, + callable, + args...); + + AT_CUDA_CHECK(cudaGetLastError()); + + // Reset. + loc_block_info = 0; + if(chunk == chunks - 1) { + loc_tensor_info = 0; + } + else { + tensorListMeta.sizes[0] = tensorListMeta.sizes[loc_tensor_info-1]; + tensorListMeta.scalar_vals[0] = tensorListMeta.scalar_vals[loc_tensor_info-1]; + for(int d = 0; d < depth; d++) { + tensorListMeta.addresses[d][0] = tensorListMeta.addresses[d][loc_tensor_info-1]; + } + loc_tensor_info = 1; + } + } + } + } + } + + +template +void multi_tensor_apply( + std::vector>& tensor_lists, + T callable, + ArgTypes... args) { + TORCH_CHECK(tensor_lists.size() == depth, "Number of tensor lists has to match the depth."); + const cuda::OptionalCUDAGuard device_guard(device_of(tensor_lists[0][0])); size_t n_tensors = tensor_lists[0].size(); TensorListMetadata tensorListMeta; @@ -88,7 +159,7 @@ void multi_tensor_apply( // Reset. 
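// The packing loop above queues (tensor, chunk) pairs into a single kernel-argument struct and
// launches whenever a capacity limit is hit; after a launch, a partially processed tensor is
// carried over into slot 0. A compact restatement of the flush test, with illustrative limits
// (the exact kChunkSize and per-depth limits are the constants defined in this header):
bool should_launch(int queued_tensors, int queued_blocks,
                   bool last_chunk_of_tensor, bool last_chunk_overall,
                   int max_tensors /* e.g. 96 for depth 1 with a scalar list */,
                   int max_blocks  /* e.g. 320 */) {
  bool tensors_full = (queued_tensors == max_tensors) && last_chunk_of_tensor;
  bool blocks_full  = (queued_blocks == max_blocks);
  return tensors_full || blocks_full || last_chunk_overall;
}
// e.g. assuming kChunkSize = 65536, a 1,000,000-element tensor contributes
// (1000000 + 65536 - 1) / 65536 = 16 chunks, i.e. 16 queued blocks.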
loc_block_info = 0; if(chunk == chunks - 1) { - loc_tensor_info = 0; + loc_tensor_info = 0; } else { tensorListMeta.sizes[0] = tensorListMeta.sizes[loc_tensor_info-1]; diff --git a/aten/src/ATen/native/cuda/NaiveConvolutionTranspose2d.cu b/aten/src/ATen/native/cuda/NaiveConvolutionTranspose2d.cu index 10138f4bced0..13149759926d 100644 --- a/aten/src/ATen/native/cuda/NaiveConvolutionTranspose2d.cu +++ b/aten/src/ATen/native/cuda/NaiveConvolutionTranspose2d.cu @@ -684,10 +684,7 @@ void slow_conv_transpose2d_acc_grad_parameters_cuda_template( // Matrix mulitply per output: input_n = input.select(0, elt); - if (kernel_height == 1 && kernel_width == 1) { - // for 1x1 column skip im2col step - columns.copy_(grad_output_n); - } else { + if (kernel_height != 1 || kernel_width != 1) { // Extract columns: im2col( at::cuda::getCurrentCUDAStream(), diff --git a/aten/src/ATen/native/cuda/PointwiseOpsKernel.cu b/aten/src/ATen/native/cuda/PointwiseOpsKernel.cu index 501ef90477da..33162b3d5271 100644 --- a/aten/src/ATen/native/cuda/PointwiseOpsKernel.cu +++ b/aten/src/ATen/native/cuda/PointwiseOpsKernel.cu @@ -26,17 +26,18 @@ void addcdiv_cuda_kernel(TensorIterator& iter, Scalar value) { }); } -void smooth_l1_backward_cuda_kernel(TensorIterator& iter, Scalar norm) { - AT_DISPATCH_ALL_TYPES_AND(kHalf, iter.dtype(), "smooth_l1_backward_cuda", [&]() { +void smooth_l1_backward_cuda_kernel(TensorIterator& iter, Scalar norm, double beta) { + AT_DISPATCH_ALL_TYPES_AND(kHalf, iter.dtype(), "smooth_l1_backward_cuda", [&iter, &norm, beta] { auto norm_val = norm.to(); - gpu_kernel(iter, [norm_val]GPU_LAMBDA(scalar_t input, scalar_t target, scalar_t grad_output) -> scalar_t { + scalar_t beta_val(beta); + gpu_kernel(iter, [norm_val, beta_val]GPU_LAMBDA(scalar_t input, scalar_t target, scalar_t grad_output) -> scalar_t { const auto x = input - target; - if (x < scalar_t(-1)) + if (x < -beta_val) return -norm_val * grad_output; - else if (x > scalar_t(1)) + else if (x > beta_val) return norm_val * grad_output; else - return norm_val * x * grad_output; + return norm_val * x * grad_output / beta_val; }); }); } diff --git a/aten/src/ATen/native/cuda/ReduceMinMaxKernel.cu b/aten/src/ATen/native/cuda/ReduceMinMaxKernel.cu index 83d11ed9f9e1..cb070e15f191 100644 --- a/aten/src/ATen/native/cuda/ReduceMinMaxKernel.cu +++ b/aten/src/ATen/native/cuda/ReduceMinMaxKernel.cu @@ -13,20 +13,32 @@ namespace at { namespace native { +template +struct MaxNanFunctor { + __device__ __forceinline__ acc_t operator()(acc_t a, acc_t b) const { + return (THCNumerics::isnan(a) || a > b) ? a : b; + } +}; + template void max_values_kernel_cuda_impl(TensorIterator& iter) { gpu_reduce_kernel( - iter, func_wrapper ([]GPU_LAMBDA(acc_t a, acc_t b) -> acc_t { - return (THCNumerics::isnan(a) || a > b) ? a : b; - }), at::numeric_limits::lower_bound()); + iter, func_wrapper (MaxNanFunctor()), + at::numeric_limits::lower_bound()); } +template +struct MinNanFunctor { + __device__ __forceinline__ acc_t operator()(acc_t a, acc_t b) const { + return (THCNumerics::isnan(a) || a < b) ? a : b; + } +}; + template void min_values_kernel_cuda_impl(TensorIterator& iter) { gpu_reduce_kernel( - iter, func_wrapper ([]GPU_LAMBDA(acc_t a, acc_t b) -> acc_t { - return (THCNumerics::isnan(a) || a < b) ? 
a : b; - }), at::numeric_limits::upper_bound()); + iter, func_wrapper (MinNanFunctor()), + at::numeric_limits::upper_bound()); } void max_values_kernel_cuda(TensorIterator& iter) { diff --git a/aten/src/ATen/native/cuda/UnaryFractionKernels.cu b/aten/src/ATen/native/cuda/UnaryFractionKernels.cu index eb9250befd56..6046bc9a1f01 100644 --- a/aten/src/ATen/native/cuda/UnaryFractionKernels.cu +++ b/aten/src/ATen/native/cuda/UnaryFractionKernels.cu @@ -114,7 +114,7 @@ __host__ __device__ static inline c10::complex nearbyint_wrapper(c10::co } void round_kernel_cuda(TensorIterator& iter) { - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(ScalarType::Half, iter.dtype(), "round_cuda", [&]() { + AT_DISPATCH_FLOATING_TYPES_AND(ScalarType::Half, iter.dtype(), "round_cuda", [&]() { gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { // We do not use std::round because we would like to round midway numbers to the nearest even integer. return nearbyint_wrapper(a); diff --git a/aten/src/ATen/native/cuda/UnaryOpsKernel.cu b/aten/src/ATen/native/cuda/UnaryOpsKernel.cu index 1067d7c61bc5..5b545471fb34 100644 --- a/aten/src/ATen/native/cuda/UnaryOpsKernel.cu +++ b/aten/src/ATen/native/cuda/UnaryOpsKernel.cu @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -180,6 +181,32 @@ void clamp_max_kernel_cuda(TensorIterator& iter, Scalar max_value) { }); } +void nan_to_num_kernel_cuda( + TensorIterator& iter, + c10::optional nan, + c10::optional pos_inf, + c10::optional neg_inf) { + AT_DISPATCH_FLOATING_TYPES_AND_HALF(iter.dtype(), "nan_to_num_cuda", [&]() { + scalar_t nan_replacement = static_cast(nan.value_or(0.)); + scalar_t pos_inf_replacement = pos_inf.has_value() + ? static_cast(pos_inf.value()) + : std::numeric_limits::max(); + scalar_t neg_inf_replacement = neg_inf.has_value() + ? static_cast(neg_inf.value()) + : std::numeric_limits::lowest(); + gpu_kernel(iter, [=] GPU_LAMBDA(scalar_t a) -> scalar_t { + return ( + at::_isnan(a) + ? nan_replacement + : (a == std::numeric_limits::infinity() + ? pos_inf_replacement + : (a == -std::numeric_limits::infinity() + ? neg_inf_replacement + : a))); + }); + }); +} + void kaiser_window_kernel_cuda(TensorIterator& iter, int64_t window_length, double beta){ AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.dtype(), "kaiser_window_cuda", [&](){ AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "kaiser_window_cuda", [&] { @@ -206,6 +233,7 @@ REGISTER_DISPATCH(erfinv_stub, &erfinv_kernel_cuda); REGISTER_DISPATCH(clamp_stub, &clamp_kernel_cuda); REGISTER_DISPATCH(clamp_min_stub, &clamp_min_kernel_cuda); REGISTER_DISPATCH(clamp_max_stub, &clamp_max_kernel_cuda); +REGISTER_DISPATCH(nan_to_num_stub, &nan_to_num_kernel_cuda); REGISTER_DISPATCH(kaiser_window_stub, &kaiser_window_kernel_cuda); } // namespace native diff --git a/aten/src/ATen/native/cuda/UnarySignKernels.cu b/aten/src/ATen/native/cuda/UnarySignKernels.cu index 3d90089556be..aae3906575f9 100644 --- a/aten/src/ATen/native/cuda/UnarySignKernels.cu +++ b/aten/src/ATen/native/cuda/UnarySignKernels.cu @@ -11,21 +11,19 @@ namespace at { namespace native { void logical_not_kernel_cuda(TensorIterator& iter) { - // error check -- this is just ensuring we don't dispatch on types that aren't in ALL_TYPES_AND2(...) + // error check -- this is just ensuring we don't dispatch on types that aren't in ALL_TYPES_AND_COMPLEX_AND3(...) // so we don't have to maintain a separate list or to do double dispatch. 
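// nan_to_num_kernel_cuda above maps every element through a fixed replacement rule:
// NaN -> nan, +inf -> pos_inf (default: the dtype's max), -inf -> neg_inf (default: the
// dtype's lowest). The per-element rule, as a host-side float sketch:
#include <cmath>
#include <limits>

float nan_to_num_scalar(float a, float nan_repl, float posinf_repl, float neginf_repl) {
  if (std::isnan(a))                                 return nan_repl;
  if (a ==  std::numeric_limits<float>::infinity())  return posinf_repl;
  if (a == -std::numeric_limits<float>::infinity())  return neginf_repl;
  return a;
}
// defaults used by the kernel: nan_repl = 0, posinf_repl = numeric_limits<scalar_t>::max(),
// neginf_repl = numeric_limits<scalar_t>::lowest()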
- AT_DISPATCH_ALL_TYPES_AND2(kBool, kHalf, iter.dtype(0), "logical_not_cuda", [&]() {}); + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kHalf, kBFloat16, iter.dtype(0), "logical_not_cuda", [&]() {}); - AT_DISPATCH_ALL_TYPES_AND2(kBool, kHalf, iter.dtype(1), "logical_not_cuda", [&]() { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kHalf, kBFloat16, iter.dtype(1), "logical_not_cuda", [&]() { gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> bool { return !a; }); }); } void neg_kernel_cuda(TensorIterator& iter) { AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "neg_cuda", [&]() { - AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "neg_cuda", [&] { - gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { - return -a; - }); + gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { + return -a; }); }); } @@ -51,9 +49,26 @@ void signbit_kernel_cuda(TensorIterator& iter){ }); } +template +__host__ __device__ static inline c10::complex sgn_wrapper(c10::complex z) { + if (z == c10::complex(0, 0)) { + return c10::complex(0, 0); + } else { + return z / std::abs(z); + } +} + +void sgn_kernel_cuda(TensorIterator& iter){ + AT_DISPATCH_COMPLEX_TYPES(iter.dtype(), "sgn_cuda", [&]() { + gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { + return sgn_wrapper(a); + }); + }); +} REGISTER_DISPATCH(logical_not_stub, &logical_not_kernel_cuda); REGISTER_DISPATCH(neg_stub, &neg_kernel_cuda); REGISTER_DISPATCH(sign_stub, &sign_kernel_cuda); REGISTER_DISPATCH(signbit_stub, &signbit_kernel_cuda); +REGISTER_DISPATCH(sgn_stub, &sgn_kernel_cuda); }} // namespace at::native diff --git a/aten/src/ATen/native/cudnn/Conv.cpp b/aten/src/ATen/native/cudnn/Conv.cpp index 4ddd533ec8f8..4524af2fe244 100644 --- a/aten/src/ATen/native/cudnn/Conv.cpp +++ b/aten/src/ATen/native/cudnn/Conv.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -100,6 +101,10 @@ std::tuple cudnn_convolution_transpose_backward( // if(dataType == CUDNN_DATA_HALF) // AT_CUDNN_CHECK(cudnnSetConvolutionMathType(mut_desc(), CUDNN_TENSOR_OP_MATH)); // +// Update: AT_CUDNN_CHECK is updated with AT_CUDNN_CHECK_WITH_SHAPES, which +// automatically prints tensor shapes and convolution parameters if there is +// a cuDNN exception thrown. +// // When cudnnSetConvolutionMathType is called before cudnnGet/cudnnFind, it informs // cudnnGet/cudnnFind to iterate/take into account both tensor core and non-tensor-core algos. // If you don't call cudnnSetConvolutionMathType before calling cudnnGet/cudnnFind, @@ -220,6 +225,19 @@ struct ConvolutionParams // forward and backward, so you can reuse the benchmark entry, }; +std::ostream& operator<<(std::ostream & out, const ConvolutionParams& params) { + out << "ConvolutionParams \n" + << " data_type = " << cudnnTypeToString(params.dataType) << "\n" + << " padding = " << ArrayRef{params.padding} << "\n" + << " stride = " << ArrayRef{params.stride} << "\n" + << " dilation = " << ArrayRef{params.dilation} << "\n" + << " groups = " << params.groups << "\n" + << " deterministic = " << (params.deterministic ? "true" : "false") << "\n" + << " allow_tf32 = " << (params.allow_tf32 ? "true" : "false") << "\n"; + + return out; +} + // NB: This can't be a constructor, because then ConvolutionParams // would not be a POD anymore. // TODO: Use TensorGeometry here instead of the entire Tensor, which we @@ -268,6 +286,61 @@ struct ConvolutionArgs { } }; +std::string repro_from_args(const ConvolutionArgs& args) { + auto pybool = [](bool b) -> const char* { return b ? 
"True" : "False"; }; + std::string partial_dtype; + switch (args.params.dataType) { + case CUDNN_DATA_FLOAT: partial_dtype = "float"; break; + case CUDNN_DATA_DOUBLE: partial_dtype = "double"; break; + case CUDNN_DATA_HALF: partial_dtype = "half"; break; + default: partial_dtype = "unsupported"; + } + const std::string full_dtype = "torch." + partial_dtype; + const int out_channels = args.weight.sizes()[0]; + const int in_channels = args.weight.sizes()[1] * args.params.groups; + const size_t dim = args.input.sizes().size(); + const std::string channels_last_xd = dim == 4 ? "channels_last" : "channels_last_3d"; + const std::string to_channels_last = args.input.suggest_memory_format() == at::MemoryFormat::ChannelsLast \ + ? ".to(memory_format=torch." + channels_last_xd + ")" : ""; + + std::ostringstream ss; + ss << "You can try to repro this exception using the following code snippet. "; + ss << "If that doesn't trigger the error, please include your original repro script when reporting this issue.\n\n"; + ss << "import torch\n"; + ss << "torch.backends.cuda.matmul.allow_tf32 = " << pybool(at::globalContext().allowTF32CuBLAS()) << "\n"; + ss << "torch.backends.cudnn.benchmark = " << pybool(at::globalContext().benchmarkCuDNN()) << "\n"; + ss << "torch.backends.cudnn.deterministic = " << pybool(args.params.deterministic) << "\n"; + ss << "torch.backends.cudnn.allow_tf32 = " << pybool(args.params.allow_tf32) << "\n"; + ss << "data = torch.randn(" << args.input.sizes() << ", dtype=" << full_dtype << ", "; + ss << "device='cuda', requires_grad=True)" << to_channels_last << "\n"; + ss << "net = torch.nn.Conv" << dim-2 << "d(" << in_channels << ", " << out_channels << ", "; + ss << "kernel_size=" << args.weight.sizes().slice(2) << ", "; + ss << "padding=" << ArrayRef(args.params.padding, dim-2) << ", "; + ss << "stride=" << ArrayRef(args.params.stride, dim-2) << ", "; + ss << "dilation=" << ArrayRef(args.params.dilation, dim-2) << ", "; + ss << "groups=" << args.params.groups << ")\n"; + ss << "net = net.cuda()." 
<< partial_dtype << "()" << to_channels_last << "\n"; + ss << "out = net(data)\n"; + ss << "out.backward(torch.randn_like(out))\n"; + ss << "torch.cuda.synchronize()\n\n"; + + return ss.str(); +} + +std::ostream& operator<<(std::ostream & out, const ConvolutionArgs& args) { + out << repro_from_args(args) // already has a trailing newline + << args.params // already has a trailing newline + << "input: " << args.idesc // already has a trailing newline + << "output: " << args.odesc // already has a trailing newline + << "weight: " << args.wdesc // already has a trailing newline + << "Pointer addresses: " << "\n" + << " input: " << args.input.data_ptr() << "\n" + << " output: " << args.output.data_ptr() << "\n" + << " weight: " << args.weight.data_ptr() << "\n"; + + return out; +} + // --------------------------------------------------------------------- // // Benchmarking @@ -457,7 +530,7 @@ struct algorithm_search { int perf_count; std::unique_ptr perf_results(new perf_t[num_algos]); if (!benchmark) { - AT_CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm_v7( + AT_CUDNN_CHECK_WITH_SHAPES(cudnnGetConvolutionForwardAlgorithm_v7( args.handle, args.idesc.desc(), args.wdesc.desc(), @@ -465,11 +538,11 @@ struct algorithm_search { args.odesc.desc(), num_algos, &perf_count, - perf_results.get())); + perf_results.get()), args); } else { size_t max_ws_size = getMaxWorkspaceSize(args, algos, num_algos); Workspace ws(max_ws_size); - AT_CUDNN_CHECK(cudnnFindConvolutionForwardAlgorithmEx( + AT_CUDNN_CHECK_WITH_SHAPES(cudnnFindConvolutionForwardAlgorithmEx( args.handle, args.idesc.desc(), args.input.data_ptr(), args.wdesc.desc(), args.weight.data_ptr(), @@ -479,7 +552,7 @@ struct algorithm_search { &perf_count, perf_results.get(), ws.data, - ws.size)); + ws.size), args); // Free the cached blocks in our caching allocator. They are // needed here because the above benchmarking uses a huge amount of memory, @@ -493,14 +566,14 @@ struct algorithm_search { const ConvolutionArgs& args, algo_t algo, size_t* workspaceSize) { - AT_CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize( + AT_CUDNN_CHECK_WITH_SHAPES(cudnnGetConvolutionForwardWorkspaceSize( args.handle, args.idesc.desc(), args.wdesc.desc(), args.cdesc.desc(), args.odesc.desc(), algo, - workspaceSize)); + workspaceSize), args); } }; @@ -527,7 +600,7 @@ struct algorithm_search { int perf_count; std::unique_ptr perf_results(new perf_t[num_algos]); if (!benchmark) { - AT_CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm_v7( + AT_CUDNN_CHECK_WITH_SHAPES(cudnnGetConvolutionBackwardDataAlgorithm_v7( args.handle, args.wdesc.desc(), args.odesc.desc(), @@ -535,11 +608,11 @@ struct algorithm_search { args.idesc.desc(), num_algos, &perf_count, - perf_results.get())); + perf_results.get()), args); } else { size_t max_ws_size = getMaxWorkspaceSize(args, algos, num_algos); Workspace ws(max_ws_size); - AT_CUDNN_CHECK(cudnnFindConvolutionBackwardDataAlgorithmEx( + AT_CUDNN_CHECK_WITH_SHAPES(cudnnFindConvolutionBackwardDataAlgorithmEx( args.handle, args.wdesc.desc(), args.weight.data_ptr(), args.odesc.desc(), args.output.data_ptr(), @@ -549,7 +622,7 @@ struct algorithm_search { &perf_count, perf_results.get(), ws.data, - ws.size)); + ws.size), args); // Free the cached blocks in our caching allocator. 
They are // needed here because the above benchmarking uses a huge amount of memory, @@ -563,14 +636,14 @@ struct algorithm_search { const ConvolutionArgs& args, cudnnConvolutionBwdDataAlgo_t algo, size_t* workspaceSize) { - AT_CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize( + AT_CUDNN_CHECK_WITH_SHAPES(cudnnGetConvolutionBackwardDataWorkspaceSize( args.handle, args.wdesc.desc(), args.odesc.desc(), args.cdesc.desc(), args.idesc.desc(), algo, - workspaceSize)); + workspaceSize), args); } }; @@ -599,7 +672,7 @@ struct algorithm_search { std::unique_ptr perf_results(new perf_t[num_algos]); int perf_count; if (!benchmark) { - AT_CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm_v7( + AT_CUDNN_CHECK_WITH_SHAPES(cudnnGetConvolutionBackwardFilterAlgorithm_v7( args.handle, args.idesc.desc(), args.odesc.desc(), @@ -607,11 +680,11 @@ struct algorithm_search { args.wdesc.desc(), num_algos, &perf_count, - perf_results.get())); + perf_results.get()), args); } else { size_t max_ws_size = getMaxWorkspaceSize(args, algos, num_algos); Workspace ws(max_ws_size); - AT_CUDNN_CHECK(cudnnFindConvolutionBackwardFilterAlgorithmEx( + AT_CUDNN_CHECK_WITH_SHAPES(cudnnFindConvolutionBackwardFilterAlgorithmEx( args.handle, args.idesc.desc(), args.input.data_ptr(), args.odesc.desc(), args.output.data_ptr(), @@ -621,7 +694,7 @@ struct algorithm_search { &perf_count, perf_results.get(), ws.data, - ws.size)); + ws.size), args); // Free the cached blocks in our caching allocator. They are // needed here because the above benchmarking uses a huge amount of memory, @@ -633,14 +706,14 @@ struct algorithm_search { static void getWorkspaceSize(const ConvolutionArgs& args, algo_t algo, size_t* workspaceSize) { - AT_CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize( + AT_CUDNN_CHECK_WITH_SHAPES(cudnnGetConvolutionBackwardFilterWorkspaceSize( args.handle, args.idesc.desc(), args.odesc.desc(), args.cdesc.desc(), args.wdesc.desc(), algo, - workspaceSize)); + workspaceSize), args); } }; @@ -850,17 +923,18 @@ void raw_cudnn_convolution_forward_out_32bit( // whether to use Tensor core kernels or not // See Note [behavior of cudnnFind and cudnnGet] ASSERT_CORRECT_PRECISION(fwdAlgPerf.mathType); - AT_CUDNN_CHECK(cudnnSetConvolutionMathType(args.cdesc.mut_desc(), fwdAlgPerf.mathType)); + AT_CUDNN_CHECK_WITH_SHAPES(cudnnSetConvolutionMathType(args.cdesc.mut_desc(), fwdAlgPerf.mathType), args); Constant one(dataType, 1); Constant zero(dataType, 0); - AT_CUDNN_CHECK(cudnnConvolutionForward( - args.handle, - &one, args.idesc.desc(), input.data_ptr(), - args.wdesc.desc(), weight.data_ptr(), - args.cdesc.desc(), fwdAlgPerf.algo, workspace.data_ptr(), fwdAlgPerf.memory, - &zero, args.odesc.desc(), output.data_ptr())); + AT_CUDNN_CHECK_WITH_SHAPES(cudnnConvolutionForward( + args.handle, + &one, args.idesc.desc(), input.data_ptr(), + args.wdesc.desc(), weight.data_ptr(), + args.cdesc.desc(), fwdAlgPerf.algo, workspace.data_ptr(), fwdAlgPerf.memory, + &zero, args.odesc.desc(), output.data_ptr()), + args, "Forward algorithm: ", static_cast(fwdAlgPerf.algo), "\n"); } ); } @@ -986,17 +1060,22 @@ void raw_cudnn_convolution_backward_input_out_32bit( // whether to use Tensor core kernels or not // See Note [behavior of cudnnFind and cudnnGet] ASSERT_CORRECT_PRECISION(bwdDataAlgPerf.mathType); - AT_CUDNN_CHECK(cudnnSetConvolutionMathType(args.cdesc.mut_desc(), bwdDataAlgPerf.mathType)); + AT_CUDNN_CHECK_WITH_SHAPES(cudnnSetConvolutionMathType(args.cdesc.mut_desc(), bwdDataAlgPerf.mathType), args); Constant one(dataType, 1); 
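// AT_CUDNN_CHECK_WITH_SHAPES, used throughout this file, forwards extra context -- the
// ConvolutionArgs printout, pointer addresses, the chosen algorithm -- into the error message
// when a cuDNN call fails. A hypothetical wrapper of the same shape (an illustration only,
// not the real macro from ATen's cuDNN exception helpers):
#include <cudnn.h>
#include <sstream>
#include <string>
#include <c10/util/Exception.h>   // TORCH_CHECK

template <typename... Args>
std::string join_context(const Args&... args) {
  std::ostringstream oss;
  (oss << ... << args);  // C++17 fold over operator<<, e.g. streaming a ConvolutionArgs
  return oss.str();
}

#define CHECK_CUDNN_WITH_CONTEXT(expr, ...)                          \
  do {                                                               \
    cudnnStatus_t status_ = (expr);                                  \
    TORCH_CHECK(status_ == CUDNN_STATUS_SUCCESS,                     \
                "cuDNN error: ", cudnnGetErrorString(status_), "\n", \
                join_context(__VA_ARGS__));                          \
  } while (0)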
Constant zero(dataType, 0); - AT_CUDNN_CHECK(cudnnConvolutionBackwardData( + AT_CUDNN_CHECK_WITH_SHAPES(cudnnConvolutionBackwardData( args.handle, &one, args.wdesc.desc(), weight.data_ptr(), args.odesc.desc(), grad_output.data_ptr(), args.cdesc.desc(), bwdDataAlgPerf.algo, workspace.data_ptr(), bwdDataAlgPerf.memory, - &zero, args.idesc.desc(), grad_input.data_ptr())); + &zero, args.idesc.desc(), grad_input.data_ptr()), + args, + "Additional pointer addresses: \n", + " grad_output: ", grad_output.data_ptr(), "\n", + " grad_input: ", grad_input.data_ptr(), "\n", + "Backward data algorithm: ", static_cast(bwdDataAlgPerf.algo), "\n"); } ); } @@ -1148,17 +1227,22 @@ void raw_cudnn_convolution_backward_weight_out_32bit( // whether to use Tensor core kernels or not // See Note [behavior of cudnnFind and cudnnGet] ASSERT_CORRECT_PRECISION(bwdFilterAlgPerf.mathType); - AT_CUDNN_CHECK(cudnnSetConvolutionMathType(args.cdesc.mut_desc(), bwdFilterAlgPerf.mathType)); + AT_CUDNN_CHECK_WITH_SHAPES(cudnnSetConvolutionMathType(args.cdesc.mut_desc(), bwdFilterAlgPerf.mathType), args); Constant one(dataType, 1); Constant zero(dataType, 0); - AT_CUDNN_CHECK(cudnnConvolutionBackwardFilter( + AT_CUDNN_CHECK_WITH_SHAPES(cudnnConvolutionBackwardFilter( args.handle, &one, args.idesc.desc(), input.data_ptr(), args.odesc.desc(), grad_output.data_ptr(), args.cdesc.desc(), bwdFilterAlgPerf.algo, workspace.data_ptr(), bwdFilterAlgPerf.memory, - &zero, args.wdesc.desc(), grad_weight.data_ptr())); + &zero, args.wdesc.desc(), grad_weight.data_ptr()), + args, + "Additional pointer addresses: \n", + " grad_output: ", grad_output.data_ptr(), "\n", + " grad_weight: ", grad_weight.data_ptr(), "\n", + "Backward filter algorithm: ", static_cast(bwdFilterAlgPerf.algo), "\n"); } ); } diff --git a/aten/src/ATen/native/cudnn/RNN.cpp b/aten/src/ATen/native/cudnn/RNN.cpp index 5be7d6eea8ea..aa99490deb2d 100644 --- a/aten/src/ATen/native/cudnn/RNN.cpp +++ b/aten/src/ATen/native/cudnn/RNN.cpp @@ -587,25 +587,60 @@ namespace { } } - cudnnRNNAlgo_t get_algo(const RNNDescriptorParams& rnn, const TensorDescriptorListParams& tensors, const Tensor input){ - cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); - const int64_t bsize = tensors.mini_batch; - //excluding Turing from using persistent rnn. 
- if (prop->major == 7 && prop->minor != 5 && getCudnnDataType(input) == CUDNN_DATA_HALF && !tensors.is_input_packed()) { - if (rnn.num_layers == 1 && rnn.hidden_size <= 1024 && rnn.num_directions() == 1 && - rnn.hidden_size % 128 == 0 && tensors.input_size % 128 == 0){ - //technically, batch size should be multiple of 8, but there are quite a few multiple-of-8 batchsizes that give bad perf, - //weed them out - if ((bsize % 16 == 0 && bsize != 80 && bsize !=112) || bsize == 8){ - if ((tensors.seq_length >=40 && bsize <=128) || - (tensors.seq_length >=20 && bsize <=96) || - (tensors.seq_length >=10 && bsize <=32)) { - return CUDNN_RNN_ALGO_PERSIST_STATIC; - } - } - } + inline bool use_persist_common_heuristics(const RNNDescriptorParams& rnn, + const TensorDescriptorListParams& tensors) { + return rnn.num_layers == 1 && + rnn.hidden_size <= 1024 && + rnn.num_directions() == 1 && + rnn.hidden_size % 128 == 0 && + tensors.input_size % 128 == 0; + } + + inline bool use_persist_device_heuristics(const RNNDescriptorParams& rnn, + const TensorDescriptorListParams& tensors) { + auto bsize = tensors.mini_batch; + cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); + if (prop->major == 7) { + if (prop->minor == 5) { + // Excludes Turing from using persistent rnn. + return false; + } else { + // technically, batch size should be multiple of 8, but there are quite a few multiple-of-8 batchsizes that give bad perf, + // weed them out + return ((bsize % 16 == 0 && bsize != 80 && bsize !=112) || bsize == 8) && + ((tensors.seq_length >=40 && bsize <=128) || + (tensors.seq_length >=20 && bsize <=96) || + (tensors.seq_length >=10 && bsize <=32)); + } + } else if (prop->major >= 8) { + // Based on tests by Vasily Volkov and xwang233. Vasily only tried bsize <= 128, + // so conservatively enable persistence for bsize <= 128 only. + // TODO: Run more tests for bsize > 128. + if (rnn.mode == CUDNN_GRU) { + // Persistent GRU performance is flakier than other RNN types. Exclude them for now. + // TODO: Write a more refined GRU heuristic. + return false; + } else if (rnn.mode == CUDNN_LSTM) { + // Persistent LSTMs are comparable to or better than non-persistent for bsize <= 128. + return bsize <= 128; + } else { + // Persistent RNN_RELU and TANH show poor performance when bsize >= 96 AND hidden size >= 896. + return (bsize <= 128) && (bsize < 96 || rnn.hidden_size < 896); } - return CUDNN_RNN_ALGO_STANDARD; + } else { + return false; + } + } + + cudnnRNNAlgo_t get_algo(const RNNDescriptorParams& rnn, const TensorDescriptorListParams& tensors, const Tensor input) { + if (getCudnnDataType(input) == CUDNN_DATA_HALF && + !tensors.is_input_packed()) { + if (use_persist_common_heuristics(rnn, tensors) && + use_persist_device_heuristics(rnn, tensors)) { + return CUDNN_RNN_ALGO_PERSIST_STATIC; + } + } + return CUDNN_RNN_ALGO_STANDARD; } cudnnDataType_t promote_rnn_math_type(cudnnDataType_t dtype) { diff --git a/aten/src/ATen/native/group_norm.cpp b/aten/src/ATen/native/group_norm.cpp index 229e54a9ce62..beb4d940363e 100644 --- a/aten/src/ATen/native/group_norm.cpp +++ b/aten/src/ATen/native/group_norm.cpp @@ -133,5 +133,29 @@ Tensor group_norm( DEFINE_DISPATCH(GroupNormKernel); DEFINE_DISPATCH(GroupNormBackwardKernel); +std::tuple math_group_norm( + const at::Tensor& input, const at::Tensor& weight, + const at::Tensor& bias, int64_t N, int64_t C, int64_t HxW, + int64_t group, double eps) { + auto input_shape = input.sizes(); + at::Tensor input_reshaped = input.view({1, N * group, N ? 
-1 : 1}); + auto outputs = at::native_batch_norm( + input_reshaped, /*weight=*/{}, /*bias=*/{}, /*running_mean=*/{}, + /*running_var=*/{}, /*training=*/true, /*momentum=*/0, eps); + at::Tensor out = std::get<0>(outputs); + out = out.view(input_shape); + std::vector affine_param_shape(input.dim(), 1); + affine_param_shape[1] = C; + if (weight.defined() && bias.defined()) { + out = bias.view(affine_param_shape).addcmul(out, weight.view(affine_param_shape), 1); + } else if (weight.defined()) { + out = out.mul(weight.view(affine_param_shape)); + } else if (bias.defined()) { + out = out.add(bias.view(affine_param_shape)); + } + at::Tensor mean = std::get<1>(outputs).view({N, group}); + at::Tensor rstd = std::get<2>(outputs).view({N, group}); + return std::make_tuple(out, mean, rstd); +} } // namespace native } // namespace at diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 6541e45b3230..c27cb4083ac2 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -167,13 +167,13 @@ - func: _fused_dropout(Tensor self, float p, Generator? generator=None) -> (Tensor, Tensor) variants: function dispatch: - CUDA: fused_dropout_cuda + CUDA: fused_dropout_cuda - func: _masked_scale(Tensor self, Tensor mask, float scale) -> Tensor use_c10_dispatcher: full variants: function dispatch: - CUDA: masked_scale_cuda + CUDA: masked_scale_cuda - func: _sobol_engine_draw(Tensor quasi, int n, Tensor sobolstate, int dimension, int num_generated, ScalarType? dtype) -> (Tensor, Tensor) use_c10_dispatcher: full @@ -226,6 +226,8 @@ variants: function, method - func: abs.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: abs_out # Note [Adding an alias] # To add an alias do the following: @@ -268,6 +270,8 @@ variants: function, method - func: angle.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: angle_out - func: view_as_real(Tensor(a) self) -> Tensor(a) use_c10_dispatcher: full @@ -277,6 +281,17 @@ use_c10_dispatcher: full variants: function +- func: sgn(Tensor self) -> Tensor + use_c10_dispatcher: full + variants: function, method + +- func: sgn_(Tensor(a!) self) -> Tensor(a!) + variants: method + +- func: sgn.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: sgn_out + - func: real(Tensor(a) self) -> Tensor(a) use_c10_dispatcher: full variants: function @@ -290,6 +305,8 @@ variants: function, method - func: conj.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: conj_out - func: _conj(Tensor self) -> Tensor use_c10_dispatcher: full @@ -304,6 +321,8 @@ variants: function, method - func: acos.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: acos_out # arccos, alias of acos - func: arccos(Tensor self) -> Tensor @@ -378,12 +397,18 @@ - func: addmv(Tensor self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: addmv - func: addmv_(Tensor(a!) self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: addmv_ - func: addmv.out(Tensor self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: addmv_out - func: _addmv_impl_(Tensor(a!) self, Tensor self2, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) 
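// math_group_norm above implements group norm by reusing native_batch_norm: viewing the
// (N, C, *) input as (1, N*group, -1) turns every (sample, group) pair into one batch-norm
// "channel", so batch norm's per-channel statistics are exactly group norm's per-group
// statistics. Condensed restatement in ATen calls (assumes N > 0 and both affine parameters
// defined; the registered kernel additionally returns the per-group mean/rstd):
#include <ATen/ATen.h>
#include <vector>

at::Tensor group_norm_via_batch_norm(const at::Tensor& x, const at::Tensor& weight,
                                     const at::Tensor& bias, int64_t group, double eps) {
  const auto N = x.size(0);
  const auto C = x.size(1);
  auto y = std::get<0>(at::native_batch_norm(
      x.reshape({1, N * group, -1}), /*weight=*/{}, /*bias=*/{},
      /*running_mean=*/{}, /*running_var=*/{}, /*training=*/true, /*momentum=*/0, eps));
  y = y.reshape(x.sizes());
  std::vector<int64_t> affine_shape(x.dim(), 1);
  affine_shape[1] = C;
  return y.mul(weight.reshape(affine_shape)).add(bias.reshape(affine_shape));
}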
use_c10_dispatcher: full @@ -412,8 +437,12 @@ - func: all.dim(Tensor self, int dim, bool keepdim=False) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: all - func: all.out(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: all_out - func: all.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor variants: function, method @@ -427,8 +456,12 @@ - func: any.dim(Tensor self, int dim, bool keepdim=False) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: any - func: any.out(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: any_out - func: any.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor variants: function, method @@ -480,6 +513,8 @@ variants: function, method - func: acosh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: acosh_out # arccosh, alias for acosh - func: arccosh(Tensor self) -> Tensor @@ -501,6 +536,8 @@ variants: function, method - func: asinh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: asinh_out # arcsinh, alias for asinh - func: arcsinh(Tensor self) -> Tensor @@ -522,6 +559,8 @@ variants: function, method - func: atanh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: atanh_out # arctanh, alias for atanh - func: arctanh(Tensor self) -> Tensor @@ -553,8 +592,14 @@ - func: asin_(Tensor(a!) self) -> Tensor(a!) use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: asin_ + SparseCPU, SparseCUDA: asin_sparse_ - func: asin.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: asin_out + SparseCPU, SparseCUDA: asin_out_sparse # arcsin, alias of asin - func: arcsin(Tensor self) -> Tensor @@ -576,6 +621,8 @@ variants: function, method - func: atan.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: atan_out # arctan, alias of atan - func: arctan(Tensor self) -> Tensor @@ -661,12 +708,18 @@ - func: bernoulli.out(Tensor self, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) variants: function + dispatch: + CPU, CUDA: bernoulli_out - func: bernoulli_.Tensor(Tensor(a!) self, Tensor p, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: bernoulli_ - func: bernoulli_.float(Tensor(a!) self, float p=0.5, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: bernoulli_ # This out-of-place version isn't used explicitly, but needed by jit. # There is no default valid on `p` here because it would introduce ambiguity @@ -871,6 +924,8 @@ variants: function, method - func: clamp.out(Tensor self, Scalar? min=None, Scalar? max=None, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: clamp_out - func: clamp_max(Tensor self, Scalar max) -> Tensor use_c10_dispatcher: full @@ -881,6 +936,8 @@ variants: function, method - func: clamp_max.out(Tensor self, Scalar max, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: clamp_max_out - func: clamp_min(Tensor self, Scalar min) -> Tensor use_c10_dispatcher: full @@ -891,6 +948,8 @@ variants: function, method - func: clamp_min.out(Tensor self, Scalar min, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: clamp_min_out # clip is an alias for clamp - func: clip(Tensor self, Scalar? min=None, Scalar? max=None) -> Tensor @@ -911,12 +970,16 @@ variants: function - func: complex.out(Tensor real, Tensor imag, *, Tensor(a!) 
out) -> Tensor(a!) + dispatch: + CPU, CUDA: complex_out - func: polar(Tensor abs, Tensor angle) -> Tensor use_c10_dispatcher: full variants: function - func: polar.out(Tensor abs, Tensor angle, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: polar_out - func: constant_pad_nd(Tensor self, int[] pad, Scalar value=0) -> Tensor use_c10_dispatcher: full @@ -990,6 +1053,8 @@ variants: function, method - func: cos.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: cos_out - func: cosh(Tensor self) -> Tensor use_c10_dispatcher: full @@ -1000,6 +1065,8 @@ variants: function, method - func: cosh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: cosh_out - func: cosine_embedding_loss(Tensor input1, Tensor input2, Tensor target, float margin=0.0, int reduction=Mean) -> Tensor use_c10_dispatcher: full @@ -1007,6 +1074,8 @@ - func: count_nonzero.dim_IntList(Tensor self, int[] dim) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: count_nonzero - func: count_nonzero(Tensor self, int? dim=None) -> Tensor use_c10_dispatcher: full @@ -1183,7 +1252,7 @@ - func: _ctc_loss(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank=0, bool zero_infinity=False) -> (Tensor, Tensor) use_c10_dispatcher: full dispatch: - CPU: ctc_loss_cpu + CPU: ctc_loss_cpu CUDA: ctc_loss_gpu - func: _ctc_loss_backward(Tensor grad, Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, Tensor neg_log_likelihood, Tensor log_alpha, int blank, bool zero_infinity=False) -> Tensor @@ -1449,6 +1518,8 @@ variants: function, method - func: erf.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: erf_out - func: erfc(Tensor self) -> Tensor use_c10_dispatcher: full @@ -1459,6 +1530,8 @@ variants: function, method - func: erfc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: erfc_out - func: exp(Tensor self) -> Tensor use_c10_dispatcher: full @@ -1469,6 +1542,8 @@ variants: function, method - func: exp.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: exp_out - func: exp2(Tensor self) -> Tensor use_c10_dispatcher: full @@ -1479,6 +1554,8 @@ variants: function, method - func: exp2.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: exp2_out - func: expm1(Tensor self) -> Tensor use_c10_dispatcher: full @@ -1593,6 +1670,8 @@ variants: function, method - func: frac.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: frac_out - func: full.names(int[] size, Scalar fill_value, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor device_guard: False @@ -1611,6 +1690,8 @@ CPU: from_file - func: gcd.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: gcd_out - func: gcd(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full @@ -1621,6 +1702,8 @@ variants: function, method - func: lcm.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: lcm_out - func: lcm(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full @@ -1716,6 +1799,7 @@ use_c10_dispatcher: full dispatch: CPU, CUDA: native_group_norm + Math: math_group_norm - func: native_group_norm_backward(Tensor grad_out, Tensor input, Tensor mean, Tensor rstd, Tensor? 
weight, int N, int C, int HxW, int group, bool[3] output_mask) -> (Tensor, Tensor, Tensor) use_c10_dispatcher: full @@ -1759,6 +1843,8 @@ - func: index.Tensor(Tensor self, Tensor?[] indices) -> Tensor variants: function, method + dispatch: + CPU, CUDA: index # NB: This function is special-cased in tools/autograd/gen_variable_type.py # NB: The following functions are declared in aten/src/ATen/templates/TensorBody.h and defined in aten/src/ATen/TensorIndexing.cpp: # - Tensor Tensor::index(ArrayRef indices) @@ -1791,6 +1877,8 @@ - func: _index_put_impl_(Tensor(a!) self, Tensor?[] indices, Tensor values, bool accumulate=False, bool unsafe=False) -> Tensor(a!) variants: function + dispatch: + CPU, CUDA: _index_put_impl_ - func: instance_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool use_input_stats, float momentum, float eps, bool cudnn_enabled) -> Tensor use_c10_dispatcher: full @@ -1893,6 +1981,16 @@ CPU: layer_norm_backward_cpu CUDA: layer_norm_backward_cuda +- func: nan_to_num(Tensor self, float? nan=None, float? posinf=None, float? neginf=None) -> Tensor + use_c10_dispatcher: full + variants: function, method + +- func: nan_to_num_(Tensor(a!) self, float? nan=None, float? posinf=None, float? neginf=None) -> Tensor(a!) + use_c10_dispatcher: full + variants: function, method + +- func: nan_to_num.out(Tensor self, float? nan=None, float? posinf=None, float? neginf=None, *, Tensor(a!) out) -> Tensor(a!) + - func: linear(Tensor input, Tensor weight, Tensor? bias=None) -> Tensor use_c10_dispatcher: full python_module: nn @@ -1988,12 +2086,16 @@ CPU, CUDA: log2_out - func: logaddexp.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: logaddexp_out - func: logaddexp(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function - func: logaddexp2.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: logaddexp2_out - func: logaddexp2(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full @@ -2086,6 +2188,8 @@ - func: matrix_exp(Tensor self) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: matrix_exp - func: matrix_exp_backward(Tensor self, Tensor grad) -> Tensor use_c10_dispatcher: full @@ -2115,6 +2219,8 @@ variants: function, method - func: max.dim_max(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) max, Tensor(b!) max_values) -> (Tensor(a!) values, Tensor(b!) indices) + dispatch: + CPU, CUDA: max_out - func: max.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) variants: function, method @@ -2131,6 +2237,8 @@ variants: function, method - func: amax.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) 
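The nan_to_num entries registered above add a NumPy-style cleanup op: NaN is replaced by nan (default 0), +inf by posinf, and -inf by neginf, with the dtype's extreme finite values as defaults when the optionals are omitted. A small usage sketch, assuming the at::nan_to_num binding generated from that schema; the input construction is illustrative:

#include <ATen/ATen.h>

void nan_to_num_example() {
  at::Tensor x = at::arange(-2, 3).to(at::kFloat);  // {-2, -1, 0, 1, 2}
  at::Tensor t = at::log(x);                        // {nan, nan, -inf, 0, ~0.693}
  // Explicit replacements: NaN -> 0, +inf -> 1e6, -inf -> -1e6.
  at::Tensor cleaned = at::nan_to_num(t, /*nan=*/0.0, /*posinf=*/1e6, /*neginf=*/-1e6);
}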
+ dispatch: + CPU, CUDA: amax_out # Return: (Tensor output, Tensor indices) - func: max_pool1d_with_indices(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor) @@ -2152,6 +2260,11 @@ dispatch: MkldnnCPU: mkldnn_max_pool3d +- func: quantized_max_pool1d(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False) -> Tensor + use_c10_dispatcher: full + dispatch: + QuantizedCPU: quantized_max_pool1d + - func: quantized_max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor use_c10_dispatcher: full dispatch: @@ -2202,6 +2315,8 @@ variants: function, method - func: min.dim_min(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) min, Tensor(b!) min_indices) -> (Tensor(a!) values, Tensor(b!) indices) + dispatch: + CPU, CUDA: min_out - func: min.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) variants: function, method @@ -2213,6 +2328,8 @@ variants: function, method - func: amin.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: amin_out - func: mkldnn_convolution(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] stride, int[] dilation, int groups) -> Tensor use_c10_dispatcher: full @@ -2333,6 +2450,8 @@ - func: mode(Tensor self, int dim=-1, bool keepdim=False) -> (Tensor values, Tensor indices) use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: mode - func: mode.values(Tensor self, int dim=-1, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) @@ -2528,18 +2647,26 @@ - func: _cdist_forward(Tensor x1, Tensor x2, float p, int? compute_mode) -> Tensor use_c10_dispatcher: full + dispatch: + CPU, CUDA: _cdist_forward - func: _cdist_backward(Tensor grad, Tensor x1, Tensor x2, float p, Tensor cdist) -> Tensor use_c10_dispatcher: full + dispatch: + CPU, CUDA: _cdist_backward - func: pdist(Tensor self, float p=2) -> Tensor use_c10_dispatcher: full - func: _pdist_forward(Tensor self, float p=2) -> Tensor use_c10_dispatcher: full + dispatch: + CPU, CUDA: _pdist_forward - func: _pdist_backward(Tensor grad, Tensor self, float p, Tensor pdist) -> Tensor use_c10_dispatcher: full + dispatch: + CPU, CUDA: _pdist_backward - func: cosine_similarity(Tensor x1, Tensor x2, int dim=1, float eps=1e-08) -> Tensor use_c10_dispatcher: full @@ -2708,6 +2835,8 @@ variants: function, method - func: reciprocal.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: reciprocal_out - func: neg(Tensor self) -> Tensor use_c10_dispatcher: full @@ -2716,8 +2845,14 @@ - func: neg_(Tensor(a!) self) -> Tensor(a!) use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: neg_ + SparseCPU, SparseCUDA: neg_sparse_ - func: neg.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
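The amax/amin entries wired up above are the value-only reductions; unlike max.dim_max/min.dim_min they do not materialize an index tensor. A short sketch of the distinction, assuming the functional at::amax/at::max bindings:

#include <ATen/ATen.h>
#include <tuple>

void amax_vs_max() {
  at::Tensor x = at::randn({4, 5});
  at::Tensor vals = at::amax(x, /*dim=*/{1});                    // shape [4], values only
  auto vals_and_idx = at::max(x, /*dim=*/1, /*keepdim=*/false);  // (values, indices)
  TORCH_CHECK(at::equal(vals, std::get<0>(vals_and_idx)));
}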
+ dispatch: + CPU, CUDA: neg_out + SparseCPU, SparseCUDA: neg_out_sparse # Alias for neg - func: negative(Tensor self) -> Tensor @@ -2835,10 +2970,14 @@ - func: hardshrink(Tensor self, Scalar lambd=0.5) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: hardshrink - func: hardshrink_backward(Tensor grad_out, Tensor self, Scalar lambd) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: hardshrink_backward - func: rsqrt(Tensor self) -> Tensor use_c10_dispatcher: full @@ -2888,6 +3027,8 @@ - func: silu.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) python_module: nn + dispatch: + CPU, CUDA: silu_out - func: silu_backward(Tensor grad_output, Tensor self) -> Tensor use_c10_dispatcher: full @@ -2909,6 +3050,8 @@ MkldnnCPU: mkldnn_sigmoid_ - func: sigmoid.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: sigmoid_out - func: logit(Tensor self, float? eps=None) -> Tensor use_c10_dispatcher: full @@ -2923,6 +3066,8 @@ CPU, CUDA: logit_ - func: logit.out(Tensor self, float? eps=None, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: logit_out - func: sin(Tensor self) -> Tensor use_c10_dispatcher: full @@ -2945,6 +3090,8 @@ variants: function, method - func: sinh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: sinh_out # Returns a copy of this `Variable` that is detached from its autograd graph. # This method is OK to call if the `Variable` is a view. @@ -3119,27 +3266,39 @@ - func: sum(Tensor self, *, ScalarType? dtype=None) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: sum - func: sum.dim_IntList(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: sum - func: sum.dim_DimnameList(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor variants: function, method - func: sum.IntList_out(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: sum_out - func: sum.DimnameList_out(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) - func: nansum(Tensor self, *, ScalarType? dtype=None) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: nansum - func: nansum.dim_IntList(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: nansum - func: nansum.IntList_out(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: nansum_out - func: sum_to_size(Tensor self, int[] size) -> Tensor use_c10_dispatcher: full @@ -3155,6 +3314,8 @@ variants: function, method - func: sqrt.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
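nansum, whose CPU/CUDA dispatch is added above, reduces like sum but treats NaN entries as zero instead of letting them poison the result. A brief sketch, assuming the at::nansum binding; the sqrt of a negative value is only a convenient way to produce a NaN:

#include <ATen/ATen.h>

void nansum_example() {
  at::Tensor x = at::arange(-1, 3).to(at::kFloat).sqrt();  // {nan, 0, 1, ~1.414}
  at::Tensor s = at::sum(x);      // NaN: one element is NaN
  at::Tensor ns = at::nansum(x);  // ~2.414: the NaN is treated as 0
}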
+ dispatch: + CPU, CUDA: sqrt_out - func: square(Tensor self) -> Tensor use_c10_dispatcher: full @@ -3167,23 +3328,33 @@ - func: std(Tensor self, bool unbiased=True) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: std - func: std.dim(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: std - func: std_mean(Tensor self, bool unbiased=True) -> (Tensor, Tensor) use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: std_mean - func: std_mean.dim(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: std_mean - func: std_mean.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) variants: function - func: std.out(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: std_out - func: std.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor variants: function, method @@ -3193,12 +3364,18 @@ - func: prod(Tensor self, *, ScalarType? dtype=None) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: prod - func: prod.dim_int(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: prod - func: prod.int_out(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: prod_out - func: prod.dim_Dimname(Tensor self, Dimname dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor variants: function, method @@ -3224,6 +3401,8 @@ variants: function, method - func: tan.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: tan_out - func: tanh(Tensor self) -> Tensor use_c10_dispatcher: full @@ -3237,6 +3416,8 @@ variants: function, method - func: tanh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: tanh_out - func: tensordot(Tensor self, Tensor other, int[] dims_self, int[] dims_other) -> Tensor use_c10_dispatcher: full @@ -3350,6 +3531,8 @@ variants: function, method - func: trunc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: trunc_out # Alias for trunc - func: fix(Tensor self) -> Tensor @@ -3428,12 +3611,18 @@ - func: var(Tensor self, bool unbiased=True) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: var - func: var.dim(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor use_c10_dispatcher: full variants: function, method + dispatch: + CPU, CUDA: var - func: var.out(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) 
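std_mean, whose CPU/CUDA dispatch is added above (var_mean follows just below), returns the statistic together with the mean from a single call rather than two separate reductions. A small consistency sketch, assuming the functional at::std_mean/at::std/at::mean bindings:

#include <ATen/ATen.h>
#include <tuple>

void std_mean_example() {
  at::Tensor x = at::randn({1000});
  auto sm = at::std_mean(x, /*unbiased=*/true);  // (std, mean) from one call
  TORCH_CHECK(at::allclose(std::get<0>(sm), at::std(x)));
  TORCH_CHECK(at::allclose(std::get<1>(sm), at::mean(x)));
}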
+ dispatch: + CPU, CUDA: var_out - func: var.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor variants: function, method @@ -3443,10 +3632,14 @@ - func: var_mean(Tensor self, bool unbiased=True) -> (Tensor, Tensor) use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: var_mean - func: var_mean.dim(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: var_mean - func: var_mean.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) variants: function @@ -3482,6 +3675,8 @@ - func: _s_where(Tensor condition, Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: _s_where - func: norm_except_dim(Tensor v, int pow=2, int dim=0) -> Tensor use_c10_dispatcher: full @@ -3584,8 +3779,8 @@ - func: _sparse_sum_backward(Tensor grad, Tensor self, int[] dim) -> Tensor use_c10_dispatcher: full dispatch: - SparseCPU: _sparse_sum_backward_cpu - SparseCUDA: _sparse_sum_backward_cuda + SparseCPU: _sparse_sum_backward_cpu + SparseCUDA: _sparse_sum_backward_cuda - func: _sparse_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor use_c10_dispatcher: full @@ -3598,11 +3793,13 @@ use_c10_dispatcher: full dispatch: SparseCPU: softmax_sparse_cpu + SparseCUDA: softmax_sparse_cuda - func: _sparse_softmax_backward_data(Tensor grad_output, Tensor output, int dim, Tensor self) -> Tensor use_c10_dispatcher: full dispatch: SparseCPU: softmax_backward_sparse_cpu + SparseCUDA: softmax_backward_sparse_cuda - func: _sparse_log_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor use_c10_dispatcher: full @@ -3615,11 +3812,13 @@ use_c10_dispatcher: full dispatch: SparseCPU: log_softmax_sparse_cpu + SparseCUDA: log_softmax_sparse_cuda - func: _sparse_log_softmax_backward_data(Tensor grad_output, Tensor output, int dim, Tensor self) -> Tensor use_c10_dispatcher: full dispatch: SparseCPU: log_softmax_backward_sparse_cpu + SparseCUDA: log_softmax_backward_sparse_cuda - func: norm.ScalarOpt_dtype(Tensor self, Scalar? p, *, ScalarType dtype) -> Tensor use_c10_dispatcher: full @@ -3638,8 +3837,12 @@ variants: function, method - func: norm.dtype_out(Tensor self, Scalar? p, int[1] dim, bool keepdim, *, ScalarType dtype, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: norm_out - func: norm.out(Tensor self, Scalar? p, int[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: norm_out - func: norm.names_ScalarOpt_dim_dtype(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor variants: function, method @@ -3748,6 +3951,8 @@ - func: rsub.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: rsub - func: heaviside.out(Tensor self, Tensor values, *, Tensor(a!) out) -> Tensor(a!) 
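rsub, which gains a shared CPU/CUDA dispatch entry above, is subtraction with the operands swapped: rsub(self, other, alpha) computes other - alpha * self. A one-line sketch, assuming the at::rsub binding:

#include <ATen/ATen.h>

void rsub_example() {
  at::Tensor a = at::ones({3});
  at::Tensor b = at::full({3}, 5.0f);
  at::Tensor r = at::rsub(a, b, /*alpha=*/2);  // 5 - 2*1 == 3
  TORCH_CHECK(at::allclose(r, at::full({3}, 3.0f)));
}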
dispatch: @@ -4197,6 +4402,8 @@ - func: fake_quantize_per_tensor_affine(Tensor self, float scale, int zero_point, int quant_min, int quant_max) -> Tensor use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: fake_quantize_per_tensor_affine - func: fake_quantize_per_tensor_affine_backward(Tensor grad, Tensor self, float scale, int zero_point, int quant_min, int quant_max) -> Tensor use_c10_dispatcher: full @@ -4205,6 +4412,8 @@ - func: _fake_quantize_learnable_per_tensor_affine(Tensor self, Tensor scale, Tensor zero_point, int quant_min, int quant_max) -> Tensor use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: _fake_quantize_learnable_per_tensor_affine - func: _fake_quantize_learnable_per_tensor_affine_backward(Tensor grad, Tensor self, Tensor scale, Tensor zero_point, int quant_min, int quant_max) -> (Tensor, Tensor, Tensor) use_c10_dispatcher: full @@ -4213,6 +4422,8 @@ - func: fake_quantize_per_channel_affine(Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max) -> Tensor use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: fake_quantize_per_channel_affine - func: fake_quantize_per_channel_affine_backward(Tensor grad, Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max) -> Tensor use_c10_dispatcher: full @@ -4221,6 +4432,8 @@ - func: _fake_quantize_learnable_per_channel_affine(Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max) -> Tensor use_c10_dispatcher: full variants: function + dispatch: + CPU, CUDA: _fake_quantize_learnable_per_channel_affine - func: _fake_quantize_learnable_per_channel_affine_backward(Tensor grad, Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max) -> (Tensor, Tensor, Tensor) use_c10_dispatcher: full @@ -4234,6 +4447,10 @@ use_c10_dispatcher: full variants: function +- func: choose_qparams_optimized(Tensor input, int numel, int n_bins, float ratio, int bit_width) -> (Tensor, Tensor) + use_c10_dispatcher: full + variants: function + # to(Device) must not exist because all constructors of Device also works for # TensorOptions. Otherwise, an ambiguity error is thrown. # See NOTE [ TensorOptions Constructors ]. @@ -4787,6 +5004,8 @@ - func: atan2_(Tensor(a!) self, Tensor other) -> Tensor(a!) use_c10_dispatcher: full variants: method + dispatch: + CPU, CUDA: atan2_ - func: tril_(Tensor(a!) self, int diagonal=0) -> Tensor(a!) use_c10_dispatcher: full @@ -4805,6 +5024,8 @@ - func: digamma_(Tensor(a!) self) -> Tensor(a!) use_c10_dispatcher: full variants: method + dispatch: + CPU, CUDA: digamma_ - func: polygamma_(Tensor(a!) self, int n) -> Tensor(a!) use_c10_dispatcher: full @@ -4894,27 +5115,43 @@ - func: random_.from(Tensor(a!) self, int from, int? to, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: random_ - func: random_.to(Tensor(a!) self, int to, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: random_ - func: random_(Tensor(a!) self, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: random_ - func: uniform_(Tensor(a!) self, float from=0, float to=1, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: uniform_ - func: cauchy_(Tensor(a!) self, float median=0, float sigma=1, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: cauchy_ - func: log_normal_(Tensor(a!) self, float mean=1, float std=2, *, Generator? 
generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: log_normal_ - func: exponential_(Tensor(a!) self, float lambd=1, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: exponential_ - func: geometric_(Tensor(a!) self, float p, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: geometric_ # wrappers for TH functions @@ -4933,10 +5170,14 @@ device_guard: False - func: cross.out(Tensor self, Tensor other, int? dim=None, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: cross_out - func: cross(Tensor self, Tensor other, int? dim=None) -> Tensor use_c10_dispatcher: full variants: method, function + dispatch: + CPU, CUDA: cross - func: triu.out(Tensor self, int diagonal=0, *, Tensor(a!) out) -> Tensor(a!) dispatch: @@ -5271,15 +5512,15 @@ - func: take.out(Tensor self, Tensor index, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: legacy::cpu::_th_take_out - CUDA: legacy::cuda::_th_take_out + CPU: take_out_cpu + CUDA: take_out_cuda - func: take(Tensor self, Tensor index) -> Tensor use_c10_dispatcher: full variants: method, function dispatch: - CPU: legacy::cpu::_th_take - CUDA: legacy::cuda::_th_take + CPU: take_cpu + CUDA: take_cuda - func: take_backward(Tensor grad, Tensor input, Tensor index) -> Tensor use_c10_dispatcher: full @@ -5368,6 +5609,8 @@ use_c10_dispatcher: full - func: addcmul.out(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: addcmul_out - func: addcmul(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor use_c10_dispatcher: full @@ -5378,6 +5621,8 @@ variants: method - func: addcdiv.out(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: addcdiv_out - func: addcdiv(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor use_c10_dispatcher: full @@ -5597,12 +5842,18 @@ CPU, CUDA: lgamma - func: digamma.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: digamma_out - func: digamma(Tensor self) -> Tensor use_c10_dispatcher: full variants: method, function + dispatch: + CPU, CUDA: digamma - func: polygamma.out(int n, Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: polygamma_out - func: polygamma(int n, Tensor self) -> Tensor use_c10_dispatcher: full @@ -5635,6 +5886,8 @@ variants: function, method - func: i0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: i0_out - func: sign(Tensor self) -> Tensor use_c10_dispatcher: full @@ -5662,10 +5915,14 @@ variants: method, function - func: atan2.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: atan2_out - func: atan2(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function + dispatch: + CPU, CUDA: atan2 - func: lerp.Scalar_out(Tensor self, Tensor end, Scalar weight, *, Tensor(a!) out) -> Tensor(a!) dispatch: @@ -5728,19 +5985,27 @@ CUDA: fmod_cuda - func: hypot.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: hypot_out - func: hypot(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function + dispatch: + CPU, CUDA: hypot - func: hypot_(Tensor(a!) self, Tensor other) -> Tensor(a!) variants: method - func: nextafter.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) 
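The take kernels ported off the legacy TH implementations above index the input as if it were flattened to 1-D, regardless of its actual shape. A short sketch, assuming the at::take binding:

#include <ATen/ATen.h>

void take_example() {
  at::Tensor t = at::arange(12).reshape({3, 4});  // values 0..11
  at::Tensor idx = at::arange(3) * 5;             // flat positions {0, 5, 10}
  at::Tensor out = at::take(t, idx);              // {0, 5, 10}
}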
+ dispatch: + CPU, CUDA: nextafter_out - func: nextafter(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function + dispatch: + CPU, CUDA: nextafter - func: nextafter_(Tensor(a!) self, Tensor other) -> Tensor(a!) variants: method @@ -5782,8 +6047,12 @@ - func: maximum(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function + dispatch: + CPU, CUDA: maximum - func: maximum.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: maximum_out # binary max, alias of maximum # NOTE: max is not an alias for maximum, since there is also unary max @@ -5796,8 +6065,12 @@ - func: minimum(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function + dispatch: + CPU, CUDA: minimum - func: minimum.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: minimum_out # binary min, alias for minimum # NOTE: min is not an alias for minimum, since there is also unary min @@ -5878,6 +6151,8 @@ - func: all(Tensor self) -> Tensor use_c10_dispatcher: full variants: method, function + dispatch: + CPU, CUDA: all - func: any(Tensor self) -> Tensor use_c10_dispatcher: full @@ -5953,18 +6228,32 @@ - func: normal_(Tensor(a!) self, float mean=0, float std=1, *, Generator? generator=None) -> Tensor(a!) variants: method + dispatch: + CPU, CUDA: normal_ - func: normal.Tensor_float_out(Tensor mean, float std=1, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: normal_out - func: normal.Tensor_float(Tensor mean, float std=1, *, Generator? generator=None) -> Tensor + dispatch: + CPU, CUDA: normal - func: normal.float_Tensor_out(float mean, Tensor std, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: normal_out - func: normal.float_Tensor(float mean, Tensor std, *, Generator? generator=None) -> Tensor + dispatch: + CPU, CUDA: normal - func: normal.Tensor_Tensor_out(Tensor mean, Tensor std, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) + dispatch: + CPU, CUDA: normal_out - func: normal.Tensor_Tensor(Tensor mean, Tensor std, *, Generator? generator=None) -> Tensor + dispatch: + CPU, CUDA: normal - func: normal.float_float(float mean, float std, int[] size, *, Generator? generator=None, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor @@ -5974,23 +6263,6 @@ use_c10_dispatcher: full variants: method, function -- func: _addr(Tensor self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full - dispatch: - CPU: legacy::cpu::_th_addr - CUDA: addr_cuda - -- func: _addr_(Tensor(a!) self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) - use_c10_dispatcher: full - dispatch: - CPU: legacy::cpu::_th_addr_ - CUDA: addr__cuda - -- func: _addr.out(Tensor self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) - dispatch: - CPU: legacy::cpu::_th_addr_out - CUDA: addr_out_cuda - - func: _index_copy_(Tensor(a!) self, int dim, Tensor index, Tensor source) -> Tensor(a!) use_c10_dispatcher: full dispatch: @@ -6029,11 +6301,12 @@ dispatch: CPU: legacy::cpu::_th_std -- func: _amp_non_finite_check_and_unscale_(Tensor(a!) self, Tensor(b!) found_inf, Tensor inv_scale) -> () +- func: _amp_foreach_non_finite_check_and_unscale_(Tensor(a!)[] self, Tensor(b!) 
found_inf, Tensor inv_scale) -> () use_c10_dispatcher: full + device_guard: False variants: function dispatch: - CUDA: _amp_non_finite_check_and_unscale_cuda_ + CUDA: _amp_foreach_non_finite_check_and_unscale_cuda_ - func: _amp_update_scale(Tensor(a!) growth_tracker, Tensor current_scale, Tensor found_inf, float scale_growth_factor, float scale_backoff_factor, int growth_interval) -> Tensor use_c10_dispatcher: full @@ -6063,6 +6336,7 @@ CUDA: foreach_tensor_add_scalar_kernel_cuda - func: _foreach_add_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () + use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6070,6 +6344,7 @@ CUDA: foreach_tensor_add_scalar_kernel_cuda_ - func: _foreach_sub.Scalar(Tensor[] tensors, Scalar scalar) -> Tensor[] + use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6077,6 +6352,7 @@ CUDA: foreach_tensor_sub_scalar_kernel_cuda - func: _foreach_sub_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () + use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6084,6 +6360,7 @@ CUDA: foreach_tensor_sub_scalar_kernel_cuda_ - func: _foreach_mul.Scalar(Tensor[] tensors, Scalar scalar) -> Tensor[] + use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6091,6 +6368,7 @@ CUDA: foreach_tensor_mul_scalar_kernel_cuda - func: _foreach_mul_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () + use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6098,6 +6376,7 @@ CUDA: foreach_tensor_mul_scalar_kernel_cuda_ - func: _foreach_div.Scalar(Tensor[] tensors, Scalar scalar) -> Tensor[] + use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6105,34 +6384,39 @@ CUDA: foreach_tensor_div_scalar_kernel_cuda - func: _foreach_div_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () + use_c10_dispatcher: full device_guard: False variants: function dispatch: CPU: foreach_tensor_div_scalar_kernel_slow_ CUDA: foreach_tensor_div_scalar_kernel_cuda_ -- func: _foreach_add.List(Tensor[] tensors1, Tensor[] tensors2, Scalar alpha=1) -> Tensor[] +- func: _foreach_add.List(Tensor[] tensors1, Tensor[] tensors2, *, Scalar alpha=1) -> Tensor[] + use_c10_dispatcher: full device_guard: False variants: function dispatch: CPU: foreach_tensor_add_list_kernel_slow CUDA: foreach_tensor_add_list_kernel_cuda -- func: _foreach_add_.List(Tensor(a!)[] self, Tensor[] other, Scalar alpha=1) -> () +- func: _foreach_add_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> () + use_c10_dispatcher: full device_guard: False variants: function dispatch: CPU: foreach_tensor_add_list_kernel_slow_ CUDA: foreach_tensor_add_list_kernel_cuda_ -- func: _foreach_sub.List(Tensor[] tensors1, Tensor[] tensors2, Scalar alpha=1) -> Tensor[] +- func: _foreach_sub.List(Tensor[] tensors1, Tensor[] tensors2, *, Scalar alpha=1) -> Tensor[] + use_c10_dispatcher: full device_guard: False variants: function dispatch: CPU: foreach_tensor_sub_list_kernel_slow CUDA: foreach_tensor_sub_list_kernel_cuda -- func: _foreach_sub_.List(Tensor(a!)[] self, Tensor[] other, Scalar alpha=1) -> () +- func: _foreach_sub_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> () + use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6140,6 +6424,7 @@ CUDA: foreach_tensor_sub_list_kernel_cuda_ - func: _foreach_mul.List(Tensor[] tensors1, Tensor[] tensors2) -> Tensor[] + use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6147,13 +6432,15 @@ CUDA: 
foreach_tensor_mul_list_kernel_cuda - func: _foreach_mul_.List(Tensor(a!)[] self, Tensor[] other) -> () + use_c10_dispatcher: full device_guard: False variants: function dispatch: CPU: foreach_tensor_mul_list_kernel_slow_ CUDA: foreach_tensor_mul_list_kernel_cuda_ -- func: _foreach_div.List(Tensor(a!)[] self, Tensor[] other) -> Tensor[] +- func: _foreach_div.List(Tensor[] tensors1, Tensor[] tensors2) -> Tensor[] + use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6161,13 +6448,79 @@ CUDA: foreach_tensor_div_list_kernel_cuda - func: _foreach_div_.List(Tensor(a!)[] self, Tensor[] other) -> () + use_c10_dispatcher: full device_guard: False variants: function dispatch: CPU: foreach_tensor_div_list_kernel_slow_ CUDA: foreach_tensor_div_list_kernel_cuda_ +- func: _foreach_add_scalar_list(Tensor[] tensors, float[] scalars) -> Tensor[] + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_add_scalarlist_kernel_slow + CUDA: foreach_tensor_add_scalarlist_kernel_cuda + +- func: _foreach_add_scalar_list_(Tensor(a!)[] self, float[] scalars) -> () + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_add_scalarlist_kernel_slow_ + CUDA: foreach_tensor_add_scalarlist_kernel_cuda_ + +- func: _foreach_sub_scalar_list(Tensor[] tensors, float[] scalars) -> Tensor[] + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_sub_scalarlist_kernel_slow + CUDA: foreach_tensor_sub_scalarlist_kernel_cuda + +- func: _foreach_sub_scalar_list_(Tensor(a!)[] self, float[] scalars) -> () + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_sub_scalarlist_kernel_slow_ + CUDA: foreach_tensor_sub_scalarlist_kernel_cuda_ + +- func: _foreach_div_scalar_list(Tensor[] tensors, float[] scalars) -> Tensor[] + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_div_scalarlist_kernel_slow + CUDA: foreach_tensor_div_scalarlist_kernel_cuda + +- func: _foreach_div_scalar_list_(Tensor(a!)[] self, float[] scalars) -> () + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_div_scalarlist_kernel_slow_ + CUDA: foreach_tensor_div_scalarlist_kernel_cuda_ + +- func: _foreach_mul_scalar_list(Tensor[] tensors, float[] scalars) -> Tensor[] + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_mul_scalarlist_kernel_slow + CUDA: foreach_tensor_mul_scalarlist_kernel_cuda + +- func: _foreach_mul_scalar_list_(Tensor(a!)[] self, float[] scalars) -> () + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_mul_scalarlist_kernel_slow_ + CUDA: foreach_tensor_mul_scalarlist_kernel_cuda_ + - func: _foreach_exp(Tensor[] tensors) -> Tensor[] + use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6175,6 +6528,7 @@ CUDA: foreach_tensor_exp_cuda - func: _foreach_exp_(Tensor(a!)[] self) -> () + use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6182,6 +6536,7 @@ CUDA: foreach_tensor_exp_cuda_ - func: _foreach_sqrt(Tensor[] tensors) -> Tensor[] + use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6189,6 +6544,7 @@ CUDA: foreach_tensor_sqrt_cuda - func: _foreach_sqrt_(Tensor(a!)[] self) -> () + use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ 
-6196,6 +6552,7 @@ CUDA: foreach_tensor_sqrt_cuda_ - func: _foreach_addcdiv_(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> () + use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6203,6 +6560,7 @@ CUDA: foreach_tensor_addcdiv_cuda_ - func: _foreach_addcmul_(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> () + use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6210,6 +6568,7 @@ CUDA: foreach_tensor_addcmul_cuda_ - func: _foreach_addcdiv(Tensor[] input, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> Tensor[] + use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6217,6 +6576,7 @@ CUDA: foreach_tensor_addcdiv_cuda - func: _foreach_addcmul(Tensor[] input, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> Tensor[] + use_c10_dispatcher: full device_guard: False variants: function dispatch: @@ -6272,10 +6632,14 @@ - func: mse_loss.out(Tensor self, Tensor target, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!) python_module: nn + dispatch: + CPU, CUDA: mse_loss_out - func: mse_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: mse_loss - func: mse_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn @@ -6429,23 +6793,25 @@ CPU: nll_loss2d_backward_cpu CUDA: legacy::cuda::_thnn_nll_loss2d_backward -- func: smooth_l1_loss.out(Tensor self, Tensor target, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!) +- func: smooth_l1_loss.out(Tensor self, Tensor target, int reduction=Mean, float beta=1.0, *, Tensor(a!) out) -> Tensor(a!) python_module: nn dispatch: CPU: smooth_l1_loss_out CUDA: smooth_l1_loss_out -- func: smooth_l1_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor +- func: smooth_l1_loss(Tensor self, Tensor target, int reduction=Mean, float beta=1.0) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: smooth_l1_loss -- func: smooth_l1_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, *, Tensor(a!) grad_input) -> Tensor(a!) +- func: smooth_l1_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, float beta, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn dispatch: CPU: smooth_l1_loss_backward_out CUDA: smooth_l1_loss_backward_out -- func: smooth_l1_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction) -> Tensor +- func: smooth_l1_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction, float beta) -> Tensor use_c10_dispatcher: full python_module: nn @@ -6465,10 +6831,14 @@ - func: elu.out(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1, *, Tensor(a!) out) -> Tensor(a!) python_module: nn + dispatch: + CPU, CUDA: elu_out - func: elu(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: elu - func: elu_backward.grad_input(Tensor grad_output, Scalar alpha, Scalar scale, Scalar input_scale, Tensor output, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn @@ -6478,6 +6848,8 @@ - func: elu_backward(Tensor grad_output, Scalar alpha, Scalar scale, Scalar input_scale, Tensor output) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: elu_backward - func: elu_(Tensor(a!) 
self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor(a!) use_c10_dispatcher: full @@ -6511,6 +6883,8 @@ - func: hardsigmoid.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) python_module: nn + dispatch: + CPU, CUDA: hardsigmoid_out - func: hardsigmoid(Tensor self) -> Tensor use_c10_dispatcher: full @@ -6522,6 +6896,8 @@ - func: hardsigmoid_(Tensor(a!) self) -> Tensor(a!) use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: hardsigmoid_ - func: hardsigmoid_backward(Tensor grad_output, Tensor self) -> Tensor use_c10_dispatcher: full @@ -6550,6 +6926,8 @@ - func: hardtanh_backward(Tensor grad_output, Tensor self, Scalar min_val, Scalar max_val) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: hardtanh_backward - func: hardtanh_(Tensor(a!) self, Scalar min_val=-1, Scalar max_val=1) -> Tensor(a!) use_c10_dispatcher: full @@ -6560,14 +6938,20 @@ - func: hardswish.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) python_module: nn + dispatch: + CPU, CUDA: hardswish_out - func: hardswish(Tensor self) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: hardswish - func: hardswish_(Tensor(a!) self) -> Tensor(a!) use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: hardswish_ - func: hardswish_backward(Tensor grad_output, Tensor self) -> Tensor use_c10_dispatcher: full @@ -6586,11 +6970,13 @@ python_module: nn dispatch: CPU, CUDA: leaky_relu - QuantizedCPU: heaky_relu_quantized_cpu + QuantizedCPU: leaky_relu_quantized_cpu - func: leaky_relu_backward(Tensor grad_output, Tensor self, Scalar negative_slope, bool self_is_result) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: leaky_relu_backward - func: leaky_relu_(Tensor(a!) self, Scalar negative_slope=0.01) -> Tensor(a!) use_c10_dispatcher: full @@ -6656,10 +7042,14 @@ - func: softplus.out(Tensor self, Scalar beta=1, Scalar threshold=20, *, Tensor(a!) out) -> Tensor(a!) python_module: nn + dispatch: + CPU, CUDA: softplus_out - func: softplus(Tensor self, Scalar beta=1, Scalar threshold=20) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: softplus - func: softplus_backward.grad_input(Tensor grad_output, Tensor self, Scalar beta, Scalar threshold, Tensor output, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn @@ -6669,13 +7059,19 @@ - func: softplus_backward(Tensor grad_output, Tensor self, Scalar beta, Scalar threshold, Tensor output) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: softplus_backward - func: softshrink.out(Tensor self, Scalar lambd=0.5, *, Tensor(a!) out) -> Tensor(a!) python_module: nn + dispatch: + CPU, CUDA: softshrink_out - func: softshrink(Tensor self, Scalar lambd=0.5) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: softshrink - func: softshrink_backward.grad_input(Tensor grad_output, Tensor self, Scalar lambd, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn @@ -6685,6 +7081,8 @@ - func: softshrink_backward(Tensor grad_output, Tensor self, Scalar lambd) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: softshrink_backward - func: adaptive_avg_pool2d.out(Tensor self, int[2] output_size, *, Tensor(a!) out) -> Tensor(a!) 
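The beta parameter threaded through the smooth_l1_loss schemas above controls where the loss switches from quadratic to linear: elementwise it is 0.5*d*d/beta for |d| < beta and |d| - 0.5*beta otherwise, with d = input - target. A reference sketch of that rule (not the dispatched kernel itself); the helper name is illustrative and beta is assumed positive:

#include <ATen/ATen.h>

at::Tensor smooth_l1_reference(const at::Tensor& input,
                               const at::Tensor& target,
                               double beta) {
  at::Tensor d = at::abs(input - target);
  // Quadratic inside the beta window, linear outside; mean reduction.
  return at::where(d < beta, 0.5 * d * d / beta, d - 0.5 * beta).mean();
}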
python_module: nn @@ -7447,6 +7845,8 @@ - func: sigmoid_backward(Tensor grad_output, Tensor output) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: sigmoid_backward - func: logit_backward.grad_input(Tensor grad_output, Tensor self, float? eps=None, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn @@ -7456,6 +7856,8 @@ - func: logit_backward(Tensor grad_output, Tensor self, float? eps=None) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: logit_backward - func: tanh_backward.grad_input(Tensor grad_output, Tensor output, *, Tensor(a!) grad_input) -> Tensor(a!) python_module: nn @@ -7465,6 +7867,8 @@ - func: tanh_backward(Tensor grad_output, Tensor output) -> Tensor use_c10_dispatcher: full python_module: nn + dispatch: + CPU, CUDA: tanh_backward # What's a thnn_conv_ versus a slow_conv_? # @@ -7787,6 +8191,46 @@ use_c10_dispatcher: full variants: function +- func: fft_fftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor + python_module: fft + use_c10_dispatcher: full + variants: function + +- func: fft_ifftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor + python_module: fft + use_c10_dispatcher: full + variants: function + +- func: fft_rfftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor + python_module: fft + use_c10_dispatcher: full + variants: function + +- func: fft_irfftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor + python_module: fft + use_c10_dispatcher: full + variants: function + +- func: fft_fftfreq(int n, float d=1.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + python_module: fft + use_c10_dispatcher: full + variants: function + +- func: fft_rfftfreq(int n, float d=1.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + python_module: fft + use_c10_dispatcher: full + variants: function + +- func: fft_fftshift(Tensor self, int[1]? dim=None) -> Tensor + python_module: fft + use_c10_dispatcher: full + variants: function + +- func: fft_ifftshift(Tensor self, int[1]? dim=None) -> Tensor + python_module: fft + use_c10_dispatcher: full + variants: function + - func: fft(Tensor self, int signal_ndim, bool normalized=False) -> Tensor use_c10_dispatcher: full variants: function, method diff --git a/aten/src/ATen/native/quantized/QTensor.cpp b/aten/src/ATen/native/quantized/QTensor.cpp index 5efec6420906..8ae92a0d3bec 100644 --- a/aten/src/ATen/native/quantized/QTensor.cpp +++ b/aten/src/ATen/native/quantized/QTensor.cpp @@ -232,5 +232,96 @@ std::tuple _choose_qparams_per_tensor( return std::make_tuple(q_params.scale, q_params.zero_point); } +float calculate_quant_loss( + const float* input, + int numel, + float xmin, + float xmax, + float* q_input, + int bit_width) { + xmin = static_cast(xmin); + float data_range = xmax - xmin; + float qmax = (1 << bit_width) - 1; + float scale = data_range == 0 + ? 1.0 + : static_cast(static_cast(data_range / qmax)); + float inverse_scale = scale == 0 ? 
1.0f : 1.0f / scale; + + float norm = 0.0f; + int i = 0; + + // TODO add FBGEMM kernel + // #ifdef USE_FBGEMM + // #endif + + // remainder loop + for (; i < numel; i++) { + q_input[i] = std::max( + 0.0f, std::min(nearbyint((input[i] - xmin) * inverse_scale), qmax)); + q_input[i] = q_input[i] * scale + xmin; + norm += (input[i] - q_input[i]) * (input[i] - q_input[i]); + } + return std::sqrt(norm); +} + +/* + Helper function to find the best min/max for a tensor to calculate qparams. + It uses a greedy approach to nudge the min and max and calculate the l2 norm + and tries to minimize the quant error by doing `torch.norm(x-fake_quant(x,s,z))` + Returns the optimized xmax and xmin value of the tensor. +*/ +std::tuple choose_qparams_optimized( + const at::Tensor& input_tensor, + int64_t numel, + const int64_t n_bins, + const double ratio, + int64_t bit_width) { + + const float* input_row = input_tensor.data_ptr(); + float xmin = *std::min_element(input_row, input_row + numel); + float xmax = *std::max_element(input_row, input_row + numel); + + float stepsize = (xmax - xmin) / n_bins; + int min_bins = n_bins * (1.0 - (float) ratio); + const float* input = input_tensor.contiguous().data_ptr(); + std::vector q_input(numel); + + float loss = + calculate_quant_loss(input, numel, xmin, xmax, q_input.data(), bit_width); + float best_loss = loss; + + float cur_min = xmin; + float cur_max = xmax; + float cur_loss = loss; + + float thr = min_bins * stepsize; + while (cur_min + thr < cur_max) { + // move left + float loss1 = calculate_quant_loss( + input, numel, cur_min + stepsize, cur_max, q_input.data(), bit_width); + // move right + float loss2 = calculate_quant_loss( + input, numel, cur_min, cur_max - stepsize, q_input.data(), bit_width); + if (cur_loss < loss1 && cur_loss < loss2 && cur_loss < best_loss) { + // found a local optima + best_loss = cur_loss; + xmin = cur_min; + xmax = cur_max; + } + if (loss1 < loss2) { + cur_min = cur_min + stepsize; + cur_loss = loss1; + } else { + cur_max = cur_max - stepsize; + cur_loss = loss2; + } + } + + at::Tensor xmax_tensor = at::empty({1}); + at::Tensor xmin_tensor = at::empty({1}); + xmax_tensor[0] = xmax; + xmin_tensor[0] = xmin; + return std::make_tuple(xmax_tensor, xmin_tensor); +} } // namespace native } // namespace at diff --git a/aten/src/ATen/native/quantized/affine_quantizer.cpp b/aten/src/ATen/native/quantized/affine_quantizer.cpp index cbf116d741e3..1d0aed1174aa 100644 --- a/aten/src/ATen/native/quantized/affine_quantizer.cpp +++ b/aten/src/ATen/native/quantized/affine_quantizer.cpp @@ -17,6 +17,8 @@ DEFINE_DISPATCH(quantize_tensor_per_channel_float_qparams_stub); DEFINE_DISPATCH(dequantize_tensor_per_tensor_affine_stub); DEFINE_DISPATCH(dequantize_tensor_per_channel_affine_stub); DEFINE_DISPATCH(dequantize_tensor_per_channel_float_qparams_stub); +DEFINE_DISPATCH(quantize_tensor_per_tensor_affine_sub_byte_stub); +DEFINE_DISPATCH(dequantize_tensor_per_tensor_affine_sub_byte_stub); namespace { @@ -55,7 +57,8 @@ void checkQuantizedTensor(const std::string& fn_name, Tensor t) { fn_name, " expects a ", caffe2::TypeMeta::Make(), - " Tensor"); + " Tensor, got ", + t.scalar_type()); } template @@ -103,13 +106,21 @@ Tensor quantize_tensor_per_tensor_affine( checkSameDevice(fn_name, rtensor, qtensor); checkSameSize(fn_name, qtensor, rtensor); - AT_DISPATCH_QINT_TYPES(qtensor.scalar_type(), fn_name, [&]() { + AT_DISPATCH_QINT_AND_SUB_BYTE_TYPES(qtensor.scalar_type(), fn_name, [&]() { checkQuantizedTensor(fn_name, qtensor); checkZeroPoint(fn_name, 
zero_point); }); - quantize_tensor_per_tensor_affine_stub( + // Temporary solution to pack the tensor if dtype is torch.quint4x2 + // Can move this into the fbgemm::Quantize op. + if (qtensor.scalar_type() == at::ScalarType::QUInt4x2) { + quantize_tensor_per_tensor_affine_sub_byte_stub( + rtensor.device().type(), rtensor, qtensor, scale, zero_point); + } + else { + quantize_tensor_per_tensor_affine_stub( rtensor.device().type(), rtensor, qtensor, scale, zero_point); + } return qtensor; } @@ -163,7 +174,7 @@ Tensor quantize_tensor_per_channel_float_qparams( checkSameDevice(fn_name, rtensor, qtensor); checkSameSize(fn_name, qtensor, rtensor); - AT_DISPATCH_QINT_TYPES(qtensor.scalar_type(), fn_name, [&]() { + AT_DISPATCH_QINT_AND_SUB_BYTE_TYPES(qtensor.scalar_type(), fn_name, [&]() { checkQuantizedTensor(fn_name, qtensor); }); @@ -195,13 +206,18 @@ Tensor dequantize_tensor_per_tensor_affine( checkSameDevice(fn_name, rtensor, qtensor); checkSameSize(fn_name, qtensor, rtensor); - AT_DISPATCH_QINT_TYPES(qtensor.scalar_type(), fn_name, [&]() { + AT_DISPATCH_QINT_AND_SUB_BYTE_TYPES(qtensor.scalar_type(), fn_name, [&]() { checkQuantizedTensor(fn_name, qtensor); checkZeroPoint(fn_name, zero_point); }); - dequantize_tensor_per_tensor_affine_stub( - qtensor.device().type(), qtensor, rtensor, scale, zero_point); + if (qtensor.scalar_type() == at::ScalarType::QUInt4x2) { + dequantize_tensor_per_tensor_affine_sub_byte_stub( + qtensor.device().type(), qtensor, rtensor, scale, zero_point); + } else { + dequantize_tensor_per_tensor_affine_stub( + qtensor.device().type(), qtensor, rtensor, scale, zero_point); + } return rtensor; } @@ -253,7 +269,7 @@ Tensor dequantize_tensor_per_channel_float_qparams( checkSameDevice(fn_name, rtensor, qtensor); checkSameSize(fn_name, qtensor, rtensor); - AT_DISPATCH_QINT_TYPES(qtensor.scalar_type(), fn_name, [&]() { + AT_DISPATCH_QINT_AND_SUB_BYTE_TYPES(qtensor.scalar_type(), fn_name, [&]() { checkQuantizedTensor(fn_name, qtensor); }); @@ -394,17 +410,13 @@ CAFFE2_API float dequantize_val(double scale, int64_t zero_point, T value) { * Note: For the case of embedding quantization we will set zero_point * to (-Xmin/scale), where Xmin is the min value in input tensor row. */ -template -T quantize_val_float_qparams(float scale, float zero_point, float value) { - int64_t qvalue; +int quantize_val_float_qparams(float scale, float zero_point, float value, int qmin, int qmax) { + int qvalue; - // TODO make sure qmax and qmin for dtypes other than int8, uint8 is correctly defined. - constexpr int64_t qmin = std::numeric_limits::min(); - constexpr int64_t qmax = std::numeric_limits::max(); float inv_scale = scale == 0 ? 
1.0f : 1.0f / scale; qvalue = lrintf(value * inv_scale + zero_point); qvalue = std::max(qmin, std::min(qvalue, qmax)); - return static_cast(qvalue); + return qvalue; } template @@ -491,11 +503,5 @@ requantize_from_int(double, int64_t, int64_t); template CAFFE2_API qint32 requantize_from_int(double, int64_t, int64_t); -template CAFFE2_API qint8 -quantize_val_float_qparams(float scale, float zero_point, float value); -template CAFFE2_API quint8 -quantize_val_float_qparams(float scale, float zero_point, float value); -template CAFFE2_API qint32 -quantize_val_float_qparams(float scale, float zero_point, float value); } // namespace native } // namespace at diff --git a/aten/src/ATen/native/quantized/affine_quantizer.h b/aten/src/ATen/native/quantized/affine_quantizer.h index 862a36f5f61a..670b119652cd 100644 --- a/aten/src/ATen/native/quantized/affine_quantizer.h +++ b/aten/src/ATen/native/quantized/affine_quantizer.h @@ -77,6 +77,12 @@ using dequantize_tensor_per_channel_float_qparams_fn = void (*)( Tensor zero_points, int64_t axis); +using quantize_tensor_per_tensor_affine_sub_byte_fn = + void (*)(Tensor rtensor, Tensor qtensor, float scale, float zero_point); + +using dequantize_tensor_per_tensor_affine_sub_byte_fn = + void (*)(Tensor qtensor, Tensor rtensor, float scale, float zero_point); + DECLARE_DISPATCH( quantize_tensor_per_tensor_affine_fn, quantize_tensor_per_tensor_affine_stub); @@ -97,6 +103,13 @@ DECLARE_DISPATCH( dequantize_tensor_per_channel_float_qparams_fn, dequantize_tensor_per_channel_float_qparams_stub); +DECLARE_DISPATCH( + quantize_tensor_per_tensor_affine_sub_byte_fn, + quantize_tensor_per_tensor_affine_sub_byte_stub); + +DECLARE_DISPATCH( + dequantize_tensor_per_tensor_affine_sub_byte_fn, + dequantize_tensor_per_tensor_affine_sub_byte_stub); // Quantize a float value into a uint value given scale and zero_point template @@ -145,8 +158,7 @@ template CAFFE2_API DST_T requantize_from_int(double multiplier, int64_t zero_point, int64_t src); -template -CAFFE2_API T quantize_val_float_qparams(float scale, float zero_point, float value); +int quantize_val_float_qparams(float scale, float zero_point, float value, int qmin, int qmax); } // namespace native } // namespace at diff --git a/aten/src/ATen/native/quantized/cpu/int_repr_quant.cpp b/aten/src/ATen/native/quantized/cpu/int_repr_quant.cpp index 65036302e6ef..29e7a9b259bb 100644 --- a/aten/src/ATen/native/quantized/cpu/int_repr_quant.cpp +++ b/aten/src/ATen/native/quantized/cpu/int_repr_quant.cpp @@ -10,17 +10,29 @@ namespace native { // format of the output the same as input Tensor int_repr_quantized_cpu(const Tensor& self) { Tensor dst; - AT_DISPATCH_QINT_TYPES(self.scalar_type(), "int_repr", [&]() { - dst = at::empty( - self.sizes(), - self.options().dtype(UNDERLYING_TYPE), - self.suggest_memory_format()); - auto iter = TensorIteratorConfig() - .check_all_same_dtype(false) - .add_output(dst) - .add_input(self) - .build(); - cpu_kernel(iter, [](scalar_t value) -> underlying_t { return value.val_; }); + AT_DISPATCH_QINT_AND_SUB_BYTE_TYPES(self.scalar_type(), "int_repr", [&]() { + if (bit_width == 4) { + int64_t out_size = std::ceil(self.numel() * 0.5); + dst = at::empty( + {out_size}, + self.options().dtype(UNDERLYING_TYPE), + self.suggest_memory_format()); + const underlying_t* qdata = reinterpret_cast(self.data_ptr()); + for (int64_t i = 0; i < dst.numel(); ++i) { + dst[i] = static_cast(qdata[i]); + } + } else { + dst = at::empty( + self.sizes(), + self.options().dtype(UNDERLYING_TYPE), + self.suggest_memory_format()); 
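int_repr_quantized_cpu above sizes its output so that two 4-bit values share one underlying byte (out_size = ceil(numel * 0.5)). A standalone sketch of that sizing rule; the helper name is illustrative:

#include <cstdint>

// Bytes needed to store `numel` packed values of `bit_width` bits (bit_width divides 8).
int64_t packed_bytes(int64_t numel, int bit_width) {
  const int elem_per_byte = 8 / bit_width;             // 2 for 4-bit, 1 for 8-bit
  return (numel + elem_per_byte - 1) / elem_per_byte;  // ceil division
}
// e.g. packed_bytes(5, 4) == 3, packed_bytes(5, 8) == 5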
+ auto iter = TensorIteratorConfig() + .check_all_same_dtype(false) + .add_output(dst) + .add_input(self) + .build(); + cpu_kernel(iter, [](scalar_t value) -> underlying_t { return value.val_; }); + } }); return dst; } diff --git a/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp b/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp index ddde74b61d52..a65e9f00f1d8 100644 --- a/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp +++ b/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp @@ -2592,7 +2592,8 @@ void dequantize_per_channel_affine_kernel( Tensor rtensor, Tensor scales, Tensor zero_points, - int64_t axis) { + int64_t axis, + int bit_width=8) { // For contiguous tensors, e.g. NCHW, arbitrary axis can be used. // For channels_last/3d however axis == 0 or 1. @@ -2611,6 +2612,7 @@ void dequantize_per_channel_affine_kernel( check_tensor_memory_format(qtensor, rtensor); const auto* qd = qtensor.data_ptr(); float* rd = rtensor.data_ptr(); + const auto elem_per_byte = 8 / bit_width; if (axis == 1 && (rtensor.is_contiguous(MemoryFormat::ChannelsLast) || rtensor.is_contiguous(MemoryFormat::ChannelsLast3d))) { for (auto b = 0; b < batches; ++b) { @@ -2619,8 +2621,12 @@ void dequantize_per_channel_affine_kernel( auto i = b * channel * elements_per_channel + e * channel + c; // We need to convert the qint8 value to float to ensure the // subtraction subexpression returns a float - rd[i] = (static_cast(qd[i].val_) - zero_points_data[c]) * - scales_data[c]; + auto qvalue = qd[i / elem_per_byte].val_; + if (bit_width < 8) { + qvalue >>= (i % elem_per_byte) * bit_width; + qvalue &= (1 << bit_width) - 1; + } + rd[i] = (static_cast(qvalue) - zero_points_data[c]) * scales_data[c]; } } } @@ -2632,8 +2638,12 @@ void dequantize_per_channel_affine_kernel( c * elements_per_channel + e; // We need to convert the qint8 value to float to ensure the // subtraction subexpression returns a float - rd[i] = (static_cast(qd[i].val_) - zero_points_data[c]) * - scales_data[c]; + auto qvalue = qd[i / elem_per_byte].val_; + if (bit_width < 8) { + qvalue >>= (i % elem_per_byte) * bit_width; + qvalue &= (1 << bit_width) - 1; + } + rd[i] = (static_cast(qvalue) - zero_points_data[c]) * scales_data[c]; } } } @@ -2667,7 +2677,7 @@ void quantize_tensor_per_channel_float_qparams_cpu( TORCH_CHECK(rtensor.is_contiguous() || (axis <=1), "If tensor is channels_last contig then per channel quantization " "is supported only for axis = 0 or 1."); - AT_DISPATCH_QINT_TYPES( + AT_DISPATCH_QINT_AND_SUB_BYTE_TYPES( qtensor.scalar_type(), "quantize_tensor_per_channel_float_qparams_cpu", [&]() { int64_t batches = size_to_dim_(axis, rtensor.sizes()); int64_t elements_per_channel = @@ -2677,15 +2687,22 @@ void quantize_tensor_per_channel_float_qparams_cpu( auto zero_points_data = zero_points.data_ptr(); check_tensor_memory_format(rtensor, qtensor); const float* rdata = rtensor.data_ptr(); - auto qdata = qtensor.data_ptr(); + auto qdata = reinterpret_cast(qtensor.data_ptr()); + const auto elem_per_byte = CHAR_BIT / bit_width; + int qvalue = 0; if (axis == 1 && (rtensor.is_contiguous(MemoryFormat::ChannelsLast) || rtensor.is_contiguous(MemoryFormat::ChannelsLast3d))) { for (auto b = 0; b < batches; ++b) { for (auto e = 0; e < elements_per_channel; ++e) { for (auto c = 0; c < channel; ++c) { auto i = b * channel * elements_per_channel + e * channel + c; - qdata[i] = quantize_val_float_qparams( - scales_data[c], zero_points_data[c], rdata[i]); + qvalue = quantize_val_float_qparams( + 
scales_data[c], zero_points_data[c], rdata[i], quant_min, quant_max); + if (i % elem_per_byte == 0) { + qdata[i / elem_per_byte] = static_cast(qvalue); + } else { + qdata[i / elem_per_byte] |= static_cast(qvalue << ((i % elem_per_byte) * bit_width)); + } } } } @@ -2695,8 +2712,13 @@ void quantize_tensor_per_channel_float_qparams_cpu( for (auto e = 0; e < elements_per_channel; ++e) { auto i = b * channel * elements_per_channel + c * elements_per_channel + e; - qdata[i] = quantize_val_float_qparams( - scales_data[c], zero_points_data[c], rdata[i]); + qvalue = quantize_val_float_qparams( + scales_data[c], zero_points_data[c], rdata[i], quant_min, quant_max); + if (i % elem_per_byte == 0) { + qdata[i / elem_per_byte] = static_cast(qvalue); + } else { + qdata[i / elem_per_byte] |= static_cast(qvalue << ((i % elem_per_byte) * bit_width)); + } } } } @@ -2710,12 +2732,66 @@ void dequantize_tensor_per_channel_float_qparams_cpu( Tensor scales, Tensor zero_points, int64_t axis) { - AT_DISPATCH_QINT_TYPES( + AT_DISPATCH_QINT_AND_SUB_BYTE_TYPES( qtensor.scalar_type(), "dequantize_tensor_per_channel_float_qparams_cpu", [&]() { - dequantize_per_channel_affine_kernel(qtensor, rtensor, scales, zero_points, axis); + dequantize_per_channel_affine_kernel(qtensor, rtensor, scales, zero_points, axis, bit_width); }); } +void quantize_tensor_per_tensor_affine_sub_byte_cpu( + Tensor rtensor, + Tensor qtensor, + float scale, + float zero_point) { + // TODO Use fbgemm kernel to pack values + AT_DISPATCH_QINT_AND_SUB_BYTE_TYPES( + qtensor.scalar_type(), "quantize_tensor_per_tensor_affine_sub_byte_cpu", [&]() { + check_tensor_memory_format(rtensor, qtensor); + const float* const rdata = rtensor.data_ptr(); + auto qdata = reinterpret_cast(qtensor.data_ptr()); + auto numel = rtensor.numel(); + const auto elem_per_byte = CHAR_BIT / bit_width; + for (int i = 0; i < numel; ++i) { + float inv_scale = scale == 0 ? 1.0f : 1.0f / scale; + int qvalue = lrintf(std::nearbyint(rdata[i] * inv_scale) + zero_point); + qvalue = std::max(quant_min, std::min(qvalue, quant_max)); + + // We pack sub_byte values and align them to a byte. + // Eg. for 4-bits Index 0 is packed in the lower 4-bits + // and index 1 is packed in the upper 4-bits. 
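As a sanity check of the packing scheme the comment above describes (even indices in the low bits of a byte, odd indices in the high bits), the following self-contained sketch packs and unpacks an array of 4-bit values with the same index and shift arithmetic; the helper names and test data are illustrative only:

#include <cassert>
#include <cstdint>
#include <vector>

constexpr int kBitWidth = 4;
constexpr int kElemPerByte = 8 / kBitWidth;  // two values per byte

// Pack already-quantized values (each in [0, 15]) two per byte.
std::vector<uint8_t> pack4(const std::vector<int>& q) {
  std::vector<uint8_t> out((q.size() + kElemPerByte - 1) / kElemPerByte, 0);
  for (size_t i = 0; i < q.size(); ++i) {
    if (i % kElemPerByte == 0) {
      out[i / kElemPerByte] = static_cast<uint8_t>(q[i]);                 // lower 4 bits
    } else {
      out[i / kElemPerByte] |= static_cast<uint8_t>(q[i] << kBitWidth);   // upper 4 bits
    }
  }
  return out;
}

// Recover value i with the same shift-and-mask the dequantize kernels use.
int unpack4(const std::vector<uint8_t>& packed, size_t i) {
  int v = packed[i / kElemPerByte];
  v >>= (i % kElemPerByte) * kBitWidth;
  return v & ((1 << kBitWidth) - 1);
}

int main() {
  std::vector<int> q = {3, 12, 7, 0, 15};
  auto packed = pack4(q);  // 3 bytes for 5 values
  for (size_t i = 0; i < q.size(); ++i) {
    assert(unpack4(packed, i) == q[i]);
  }
}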
+ if (i % elem_per_byte == 0) { + qdata[i / elem_per_byte] = static_cast(qvalue); + } else { + qdata[i / elem_per_byte] |= static_cast(qvalue << ((i % elem_per_byte) * bit_width)); + } + } // for numel + }); +} + +void dequantize_tensor_per_tensor_affine_sub_byte_cpu( + Tensor qtensor, + Tensor rtensor, + float scale, + float zero_point) { + // TODO Use fbgemm kernel to pack values + AT_DISPATCH_QINT_AND_SUB_BYTE_TYPES( + qtensor.scalar_type(), "dequantize_tensor_per_tensor_affine_sub_byte_cpu", [&]() { + check_tensor_memory_format(rtensor, qtensor); + auto rdata = rtensor.data_ptr(); + const underlying_t* qdata = reinterpret_cast(qtensor.data_ptr()); + auto numel = rtensor.numel(); + const auto elem_per_byte = CHAR_BIT / bit_width; + + for (int i = 0; i < numel; ++i) { + underlying_t qvalue = qdata[i / elem_per_byte]; + qvalue >>= (i % elem_per_byte) * bit_width; + qvalue &= (1 << bit_width) - 1; + rdata[i] = (static_cast(qvalue) - zero_point) * scale; + } + }); + +} + } // namespace REGISTER_DISPATCH(dequantize_tensor_per_channel_affine_stub, @@ -2773,6 +2849,13 @@ REGISTER_DISPATCH( REGISTER_DISPATCH(quantized_normalize_stub, &quantized_normalize_kernel); REGISTER_DISPATCH(qupsample_bilinear2d_nhwc_stub, &qupsample_bilinear2d_nhwc_kernel); +REGISTER_DISPATCH( + quantize_tensor_per_tensor_affine_sub_byte_stub, + &quantize_tensor_per_tensor_affine_sub_byte_cpu); +REGISTER_DISPATCH( + dequantize_tensor_per_tensor_affine_sub_byte_stub, + &dequantize_tensor_per_tensor_affine_sub_byte_cpu); + } // namespace native } // namespace at diff --git a/aten/src/ATen/native/quantized/cpu/qadd.cpp b/aten/src/ATen/native/quantized/cpu/qadd.cpp index 22db20eeedb6..a12718502dd1 100644 --- a/aten/src/ATen/native/quantized/cpu/qadd.cpp +++ b/aten/src/ATen/native/quantized/cpu/qadd.cpp @@ -266,29 +266,29 @@ Tensor qadd_scalar_tensor_out(Tensor qa, Tensor b, Tensor out) { } TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) { - m.impl("add", TORCH_FN(qadd)); - m.impl("add.out", TORCH_FN(qadd_out)); - m.impl("add.Scalar", TORCH_FN(qadd_scalar)); - m.impl("add.Scalar_out", TORCH_FN(qadd_scalar_out)); - m.impl("add_relu", TORCH_FN(qadd)); - m.impl("add_relu.out", TORCH_FN(qadd_out)); - m.impl("add_relu.Scalar", TORCH_FN(qadd_scalar)); - m.impl("add_relu.Scalar_out", TORCH_FN(qadd_scalar_out)); + m.impl(TORCH_SELECTIVE_NAME("quantized::add"), TORCH_FN(qadd)); + m.impl(TORCH_SELECTIVE_NAME("quantized::add.out"), TORCH_FN(qadd_out)); + m.impl(TORCH_SELECTIVE_NAME("quantized::add.Scalar"), TORCH_FN(qadd_scalar)); + m.impl(TORCH_SELECTIVE_NAME("quantized::add.Scalar_out"), TORCH_FN(qadd_scalar_out)); + m.impl(TORCH_SELECTIVE_NAME("quantized::add_relu"), TORCH_FN(qadd)); + m.impl(TORCH_SELECTIVE_NAME("quantized::add_relu.out"), TORCH_FN(qadd_out)); + m.impl(TORCH_SELECTIVE_NAME("quantized::add_relu.Scalar"), TORCH_FN(qadd_scalar)); + m.impl(TORCH_SELECTIVE_NAME("quantized::add_relu.Scalar_out"), TORCH_FN(qadd_scalar_out)); // deprecated functions, kept for backward compatibility - m.impl("add_out", TORCH_FN(qadd_out)); - m.impl("add_relu_out", TORCH_FN(qadd_out)); - m.impl("add_scalar", TORCH_FN(qadd_scalar)); - m.impl("add_scalar_relu", TORCH_FN(qadd_scalar)); - m.impl("add_scalar_out", TORCH_FN(qadd_scalar_out)); - m.impl("add_scalar_relu_out", TORCH_FN(qadd_scalar_out)); - m.impl("add_scalar.Tensor", TORCH_FN(qadd_scalar_tensor)); - m.impl("add_scalar_relu.Tensor", TORCH_FN(qadd_scalar_tensor)); - m.impl("add_scalar_out.Tensor", TORCH_FN(qadd_scalar_tensor_out)); - m.impl("add_scalar_relu_out.Tensor", 
TORCH_FN(qadd_scalar_tensor_out)); + m.impl(TORCH_SELECTIVE_NAME("quantized::add_out"), TORCH_FN(qadd_out)); + m.impl(TORCH_SELECTIVE_NAME("quantized::add_relu_out"), TORCH_FN(qadd_out)); + m.impl(TORCH_SELECTIVE_NAME("quantized::add_scalar"), TORCH_FN(qadd_scalar)); + m.impl(TORCH_SELECTIVE_NAME("quantized::add_scalar_relu"), TORCH_FN(qadd_scalar)); + m.impl(TORCH_SELECTIVE_NAME("quantized::add_scalar_out"), TORCH_FN(qadd_scalar_out)); + m.impl(TORCH_SELECTIVE_NAME("quantized::add_scalar_relu_out"), TORCH_FN(qadd_scalar_out)); + m.impl(TORCH_SELECTIVE_NAME("quantized::add_scalar.Tensor"), TORCH_FN(qadd_scalar_tensor)); + m.impl(TORCH_SELECTIVE_NAME("quantized::add_scalar_relu.Tensor"), TORCH_FN(qadd_scalar_tensor)); + m.impl(TORCH_SELECTIVE_NAME("quantized::add_scalar_out.Tensor"), TORCH_FN(qadd_scalar_tensor_out)); + m.impl(TORCH_SELECTIVE_NAME("quantized::add_scalar_relu_out.Tensor"), TORCH_FN(qadd_scalar_tensor_out)); } TORCH_LIBRARY_IMPL(_quantized, QuantizedCPU, m) { - m.impl("add", TORCH_FN(qadd)); + m.impl(TORCH_SELECTIVE_NAME("_quantized::add"), TORCH_FN(qadd)); } } // namespace diff --git a/aten/src/ATen/native/quantized/cpu/qbatch_norm.cpp b/aten/src/ATen/native/quantized/cpu/qbatch_norm.cpp index effafcacc76e..b053940abba2 100644 --- a/aten/src/ATen/native/quantized/cpu/qbatch_norm.cpp +++ b/aten/src/ATen/native/quantized/cpu/qbatch_norm.cpp @@ -378,14 +378,14 @@ Tensor quantized_batch_norm( } TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) { - m.impl("batch_norm", TORCH_FN(q_batch_norm_impl)); - m.impl("batch_norm_relu", TORCH_FN(q_batch_norm_impl)); - m.impl("batch_norm1d", TORCH_FN(q_batch_norm1d_impl)); - m.impl("batch_norm1d_relu", TORCH_FN(q_batch_norm1d_impl)); - m.impl("batch_norm2d", TORCH_FN(q_batch_norm2d_impl)); - m.impl("batch_norm2d_relu", TORCH_FN(q_batch_norm2d_impl)); - m.impl("batch_norm3d", TORCH_FN(q_batch_norm3d_impl)); - m.impl("batch_norm3d_relu", TORCH_FN(q_batch_norm3d_impl)); + m.impl(TORCH_SELECTIVE_NAME("quantized::batch_norm"), TORCH_FN(q_batch_norm_impl)); + m.impl(TORCH_SELECTIVE_NAME("quantized::batch_norm_relu"), TORCH_FN(q_batch_norm_impl)); + m.impl(TORCH_SELECTIVE_NAME("quantized::batch_norm1d"), TORCH_FN(q_batch_norm1d_impl)); + m.impl(TORCH_SELECTIVE_NAME("quantized::batch_norm1d_relu"), TORCH_FN(q_batch_norm1d_impl)); + m.impl(TORCH_SELECTIVE_NAME("quantized::batch_norm2d"), TORCH_FN(q_batch_norm2d_impl)); + m.impl(TORCH_SELECTIVE_NAME("quantized::batch_norm2d_relu"), TORCH_FN(q_batch_norm2d_impl)); + m.impl(TORCH_SELECTIVE_NAME("quantized::batch_norm3d"), TORCH_FN(q_batch_norm3d_impl)); + m.impl(TORCH_SELECTIVE_NAME("quantized::batch_norm3d_relu"), TORCH_FN(q_batch_norm3d_impl)); } } // namespace native diff --git a/aten/src/ATen/native/quantized/cpu/qclamp.cpp b/aten/src/ATen/native/quantized/cpu/qclamp.cpp index a70016307785..3a8b647d320f 100644 --- a/aten/src/ATen/native/quantized/cpu/qclamp.cpp +++ b/aten/src/ATen/native/quantized/cpu/qclamp.cpp @@ -140,7 +140,7 @@ Tensor& hardtanh_quantized_cpu_( } TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) { - m.impl("clamp", TORCH_FN(clamp_quantized_cpu)); + m.impl(TORCH_SELECTIVE_NAME("quantized::clamp"), TORCH_FN(clamp_quantized_cpu)); } } // namespace native diff --git a/aten/src/ATen/native/quantized/cpu/qconcat.cpp b/aten/src/ATen/native/quantized/cpu/qconcat.cpp index 0656f40e3554..ca08c365d83d 100644 --- a/aten/src/ATen/native/quantized/cpu/qconcat.cpp +++ b/aten/src/ATen/native/quantized/cpu/qconcat.cpp @@ -102,10 +102,10 @@ Tensor qcat_out(const c10::List& qxs, int64_t dim, Tensor out) 
{ } // namespace TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) { - m.impl("cat", TORCH_FN(qcat)); - m.impl("cat_relu", TORCH_FN(qcat)); - m.impl("cat_out", TORCH_FN(qcat_out)); - m.impl("cat_relu_out", TORCH_FN(qcat_out)); + m.impl(TORCH_SELECTIVE_NAME("quantized::cat"), TORCH_FN(qcat)); + m.impl(TORCH_SELECTIVE_NAME("quantized::cat_relu"), TORCH_FN(qcat)); + m.impl(TORCH_SELECTIVE_NAME("quantized::cat_out"), TORCH_FN(qcat_out)); + m.impl(TORCH_SELECTIVE_NAME("quantized::cat_relu_out"), TORCH_FN(qcat_out)); } Tensor cat_quantized_cpu(TensorList qxs, int64_t dim) { diff --git a/aten/src/ATen/native/quantized/cpu/qconv.cpp b/aten/src/ATen/native/quantized/cpu/qconv.cpp index 12563eb36d44..cb232a5d20c3 100644 --- a/aten/src/ATen/native/quantized/cpu/qconv.cpp +++ b/aten/src/ATen/native/quantized/cpu/qconv.cpp @@ -277,6 +277,12 @@ at::Tensor PackedConvWeight::apply_impl( : "quantized::conv"; TORCH_CHECK( fbgemm::fbgemmSupportedCPU(), "Your CPU does not support FBGEMM."); + TORCH_CHECK( + !transpose(), + "FBGEMM currently does NOT support transposed convolution. ", + "Meanwhile you have multiple options: 1) Replace the ConvTranspose with ", + "the 'dequant->conv_tranpose->quant'; 2) Change the current qengine to " + "QNNPACK using 'torch.backends.quantized.engine = \"qnnpack\"'."); ConvDimChecks( act.ndimension(), stride().size(), padding().size(), output_padding().size(), dilation().size(), func_name, transpose()); @@ -850,30 +856,30 @@ class QConvInt8ForBC final { }; TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) { - m.impl("conv1d", QConv1dInt8::run); - m.impl("conv1d_relu", QConv1dInt8::run); - m.impl("conv2d.new", QConvInt8<2, false>::run); - m.impl("conv2d_relu.new", QConvInt8<2, true>::run); - m.impl("conv3d.new", QConvInt8<3, false>::run); - m.impl("conv3d_relu.new", QConvInt8<3, true>::run); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv1d"), QConv1dInt8::run); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv1d_relu"), QConv1dInt8::run); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d.new"), QConvInt8<2, false>::run); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d_relu.new"), QConvInt8<2, true>::run); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv3d.new"), QConvInt8<3, false>::run); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv3d_relu.new"), QConvInt8<3, true>::run); // for backward compatibility - m.impl("conv2d", QConvInt8ForBC<2, false>::run); - m.impl("conv2d_relu", QConvInt8ForBC<2, true>::run); - m.impl("conv3d", QConvInt8ForBC<3, false>::run); - m.impl("conv3d_relu", QConvInt8ForBC<3, true>::run); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d"), QConvInt8ForBC<2, false>::run); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d_relu"), QConvInt8ForBC<2, true>::run); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv3d"), QConvInt8ForBC<3, false>::run); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv3d_relu"), QConvInt8ForBC<3, true>::run); // transpose - m.impl("conv_transpose1d", QConv1dInt8::run); - m.impl("conv_transpose2d", QConvInt8<2, false>::run); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose1d"), QConv1dInt8::run); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose2d"), QConvInt8<2, false>::run); } TORCH_LIBRARY_IMPL(_quantized, QuantizedCPU, m) { - m.impl("conv2d", QConvInt8<2, false>::run); - m.impl("conv2d_relu", QConvInt8<2, true>::run); + m.impl(TORCH_SELECTIVE_NAME("_quantized::conv2d"), QConvInt8<2, false>::run); + m.impl(TORCH_SELECTIVE_NAME("_quantized::conv2d_relu"), QConvInt8<2, true>::run); // transpose - m.impl("conv_transpose1d", 
QConv1dInt8::run); - m.impl("conv_transpose2d", QConvInt8<2, false>::run); + m.impl(TORCH_SELECTIVE_NAME("_quantized::conv_transpose1d"), QConv1dInt8::run); + m.impl(TORCH_SELECTIVE_NAME("_quantized::conv_transpose2d"), QConvInt8<2, false>::run); } } // namespace diff --git a/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp b/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp index 4387b255dfe1..7bf84c9d5646 100644 --- a/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp +++ b/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp @@ -415,21 +415,21 @@ class QConv1dPackWeightInt8 final { TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) { // Conv // conv_prepack is deprecated, please use conv2d_prepack for 2D conv. - m.impl("conv_prepack", TORCH_FN(QConvPackWeightInt8<2>::run_conv)); - m.impl("conv1d_prepack", TORCH_FN(QConv1dPackWeightInt8::run_conv)); - m.impl("conv2d_prepack", TORCH_FN(QConvPackWeightInt8<2>::run_conv)); - m.impl("conv3d_prepack", TORCH_FN(QConvPackWeightInt8<3>::run_conv)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_prepack"), TORCH_FN(QConvPackWeightInt8<2>::run_conv)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv1d_prepack"), TORCH_FN(QConv1dPackWeightInt8::run_conv)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d_prepack"), TORCH_FN(QConvPackWeightInt8<2>::run_conv)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv3d_prepack"), TORCH_FN(QConvPackWeightInt8<3>::run_conv)); // ConvTranspose - m.impl("conv_transpose1d_prepack", TORCH_FN(QConv1dPackWeightInt8::run_deconv)); - m.impl("conv_transpose2d_prepack", TORCH_FN(QConvPackWeightInt8<2>::run_deconv)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose1d_prepack"), TORCH_FN(QConv1dPackWeightInt8::run_deconv)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose2d_prepack"), TORCH_FN(QConvPackWeightInt8<2>::run_deconv)); } TORCH_LIBRARY_IMPL(_quantized, QuantizedCPU, m) { // Conv - m.impl("conv2d_prepack", TORCH_FN(QConvPackWeightInt8<2>::run_conv)); + m.impl(TORCH_SELECTIVE_NAME("_quantized::conv2d_prepack"), TORCH_FN(QConvPackWeightInt8<2>::run_conv)); // ConvTranspose - m.impl("conv_transpose1d_prepack", TORCH_FN(QConv1dPackWeightInt8::run_deconv)); - m.impl("conv_transpose2d_prepack", TORCH_FN(QConvPackWeightInt8<2>::run_deconv)); + m.impl(TORCH_SELECTIVE_NAME("_quantized::conv_transpose1d_prepack"), TORCH_FN(QConv1dPackWeightInt8::run_deconv)); + m.impl(TORCH_SELECTIVE_NAME("_quantized::conv_transpose2d_prepack"), TORCH_FN(QConvPackWeightInt8<2>::run_deconv)); } } // namespace diff --git a/aten/src/ATen/native/quantized/cpu/qconv_unpack.cpp b/aten/src/ATen/native/quantized/cpu/qconv_unpack.cpp index 9e8a103cb17c..0886fdc7342e 100644 --- a/aten/src/ATen/native/quantized/cpu/qconv_unpack.cpp +++ b/aten/src/ATen/native/quantized/cpu/qconv_unpack.cpp @@ -243,36 +243,36 @@ class QConvTranspose final { TORCH_LIBRARY_IMPL(quantized, CatchAll, m) { // conv_unpack is deprecated, please use conv2d_unpack for 2D conv. 
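The registration hunks in this file and in the files that follow all apply the same migration: bare operator-name strings become TORCH_SELECTIVE_NAME and schema strings become TORCH_SELECTIVE_SCHEMA, so selective builds can record which quantized ops are used and strip the rest. A minimal sketch of the pattern, with a hypothetical operator quantized::my_op and kernel standing in for the real ones:

#include <ATen/ATen.h>
#include <torch/library.h>

// Hypothetical kernel; stands in for the quantized kernels registered in this patch.
at::Tensor my_op_cpu(at::Tensor qx) {
  return qx;
}

TORCH_LIBRARY_FRAGMENT(quantized, m) {
  // The schema is wrapped so a selective build can record the op as used.
  m.def(TORCH_SELECTIVE_SCHEMA("quantized::my_op(Tensor qx) -> Tensor"));
}

TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) {
  // The impl side now also names the op with its full namespace::name string.
  m.impl(TORCH_SELECTIVE_NAME("quantized::my_op"), TORCH_FN(my_op_cpu));
}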
- m.impl("conv_unpack", TORCH_FN(QConvUnpackWeightsInt8<2>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_unpack"), TORCH_FN(QConvUnpackWeightsInt8<2>::run)); // We use conv2d_unpack to be consistent with conv3d_unpack - m.impl("conv1d_unpack", TORCH_FN(QConv1dUnpackWeightsInt8::run)); - m.impl("conv2d_unpack", TORCH_FN(QConvUnpackWeightsInt8<2>::run)); - m.impl("conv3d_unpack", TORCH_FN(QConvUnpackWeightsInt8<3>::run)); - - m.impl("conv2d_stride", TORCH_FN(QConvStride<2>::run)); - m.impl("conv2d_padding", TORCH_FN(QConvPadding<2>::run)); - m.impl("conv2d_output_padding", TORCH_FN(QConvOutputPadding<2>::run)); - m.impl("conv2d_dilation", TORCH_FN(QConvDilation<2>::run)); - m.impl("conv2d_groups", TORCH_FN(QConvGroups<2>::run)); - m.impl("conv2d_transpose", TORCH_FN(QConvTranspose<2>::run)); - - m.impl("conv3d_stride", TORCH_FN(QConvStride<3>::run)); - m.impl("conv3d_padding", TORCH_FN(QConvPadding<3>::run)); - m.impl("conv3d_output_padding", TORCH_FN(QConvOutputPadding<3>::run)); - m.impl("conv3d_dilation", TORCH_FN(QConvDilation<3>::run)); - m.impl("conv3d_groups", TORCH_FN(QConvGroups<3>::run)); - m.impl("conv3d_transpose", TORCH_FN(QConvTranspose<3>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv1d_unpack"), TORCH_FN(QConv1dUnpackWeightsInt8::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d_unpack"), TORCH_FN(QConvUnpackWeightsInt8<2>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv3d_unpack"), TORCH_FN(QConvUnpackWeightsInt8<3>::run)); + + m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d_stride"), TORCH_FN(QConvStride<2>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d_padding"), TORCH_FN(QConvPadding<2>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d_output_padding"), TORCH_FN(QConvOutputPadding<2>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d_dilation"), TORCH_FN(QConvDilation<2>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d_groups"), TORCH_FN(QConvGroups<2>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d_transpose"), TORCH_FN(QConvTranspose<2>::run)); + + m.impl(TORCH_SELECTIVE_NAME("quantized::conv3d_stride"), TORCH_FN(QConvStride<3>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv3d_padding"), TORCH_FN(QConvPadding<3>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv3d_output_padding"), TORCH_FN(QConvOutputPadding<3>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv3d_dilation"), TORCH_FN(QConvDilation<3>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv3d_groups"), TORCH_FN(QConvGroups<3>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv3d_transpose"), TORCH_FN(QConvTranspose<3>::run)); // ConvTranspose is the same, however, we want to have different name. 
- m.impl("conv_transpose1d_unpack", TORCH_FN(QConv1dUnpackWeightsInt8::run)); - m.impl("conv_transpose2d_unpack", TORCH_FN(QConvUnpackWeightsInt8<2>::run)); - - m.impl("conv_transpose2d_stride", TORCH_FN(QConvStride<2>::run)); - m.impl("conv_transpose2d_padding", TORCH_FN(QConvPadding<2>::run)); - m.impl("conv_transpose2d_output_padding", TORCH_FN(QConvOutputPadding<2>::run)); - m.impl("conv_transpose2d_dilation", TORCH_FN(QConvDilation<2>::run)); - m.impl("conv_transpose2d_groups", TORCH_FN(QConvGroups<2>::run)); - m.impl("conv_transpose2d_transpose", TORCH_FN(QConvTranspose<2>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose1d_unpack"), TORCH_FN(QConv1dUnpackWeightsInt8::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose2d_unpack"), TORCH_FN(QConvUnpackWeightsInt8<2>::run)); + + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose2d_stride"), TORCH_FN(QConvStride<2>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose2d_padding"), TORCH_FN(QConvPadding<2>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose2d_output_padding"), TORCH_FN(QConvOutputPadding<2>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose2d_dilation"), TORCH_FN(QConvDilation<2>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose2d_groups"), TORCH_FN(QConvGroups<2>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::conv_transpose2d_transpose"), TORCH_FN(QConvTranspose<2>::run)); } } // namespace diff --git a/aten/src/ATen/native/quantized/cpu/qelu.cpp b/aten/src/ATen/native/quantized/cpu/qelu.cpp index 92b635471e78..e873506026e6 100644 --- a/aten/src/ATen/native/quantized/cpu/qelu.cpp +++ b/aten/src/ATen/native/quantized/cpu/qelu.cpp @@ -24,8 +24,8 @@ Tensor quantized_celu(const Tensor& qx, double output_scale, int64_t output_zero } TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) { - m.impl("elu", quantized_elu); - m.impl("celu", quantized_celu); + m.impl(TORCH_SELECTIVE_NAME("quantized::elu"), quantized_elu); + m.impl(TORCH_SELECTIVE_NAME("quantized::celu"), quantized_celu); } }} // namespace at::native diff --git a/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp b/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp index da494936aad7..cb82d9aee469 100644 --- a/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp +++ b/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp @@ -451,14 +451,12 @@ class QEmbedding final { TORCH_LIBRARY_IMPL(quantized, CPU, m) { // Function that works on TorchBind packed weights. - m.impl("embedding_bag_byte", TORCH_FN(QEmbeddingBag<8>::run)); - m.impl("embedding_byte", TORCH_FN(QEmbedding<8>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::embedding_bag_byte"), TORCH_FN(QEmbeddingBag<8>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::embedding_byte"), TORCH_FN(QEmbedding<8>::run)); // Functions that work on at::Tensor packed weight. 
- m.impl( - "embedding_bag_byte_rowwise_offsets", embedding_bag_byte_rowwise_offsets); - m.impl( - "embedding_bag_4bit_rowwise_offsets", embedding_bag_4bit_rowwise_offsets); + m.impl(TORCH_SELECTIVE_NAME("quantized::embedding_bag_byte_rowwise_offsets"), embedding_bag_byte_rowwise_offsets); + m.impl(TORCH_SELECTIVE_NAME("quantized::embedding_bag_4bit_rowwise_offsets"), embedding_bag_4bit_rowwise_offsets); } } // namespace } // namespace native diff --git a/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp b/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp index dc1f26345e62..e94f0be0d802 100644 --- a/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp +++ b/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp @@ -104,8 +104,6 @@ Tensor qembeddingbag_byte_prepack(const Tensor& weight) { embedding_rows, embedding_cols + 8}; // extra 8 bytes to store FP scale and zero_point per row. - size_t output_columns = output_shape[1]; - constexpr float kEpsilon = 1e-8f; // Allocate output packed weights auto output = at::empty( @@ -114,6 +112,12 @@ Tensor qembeddingbag_byte_prepack(const Tensor& weight) { weight_contig.suggest_memory_format()); auto* output_data = output.data_ptr(); +#ifdef USE_FBGEMM + fbgemm::FloatToFused8BitRowwiseQuantizedSBFloat( + weight_data, embedding_rows, embedding_cols, output_data); +#else + size_t output_columns = output_shape[1]; + constexpr float kEpsilon = 1e-8f; for (std::size_t row = 0; row < embedding_rows; ++row) { const float* input_row = weight_data + row * embedding_cols; std::uint8_t* output_row = output_data + row * output_columns; @@ -134,10 +138,15 @@ Tensor qembeddingbag_byte_prepack(const Tensor& weight) { lrintf((input_row[col] - minimum_element) * inverse_scale); } // embedding_cols } // embedding_rows +#endif // USE_FBGEMM + return output; } -Tensor _qembeddingbag_nbit_prepack_helper(const Tensor& weight, int BIT_RATE) { +Tensor _qembeddingbag_nbit_prepack_helper( + const Tensor& weight, + int bit_width, + bool optimized_qparams) { int64_t embedding_rows = weight.size(0); int64_t embedding_cols = weight.size(1); @@ -145,16 +154,16 @@ Tensor _qembeddingbag_nbit_prepack_helper(const Tensor& weight, int BIT_RATE) { const auto weight_data = weight.data_ptr(); TORCH_CHECK( - BIT_RATE == 4 || BIT_RATE == 2, - "BIT_RATE must be either 2 or 4 to use 'qembeddingbag_nbit_prepack'." - "For 8bit, consider using 'embedding_bag_byte_prepack'."); + bit_width == 4 || bit_width == 2, + "bit_width must be either 2 or 4 to use 'qembeddingbag_nbit_prepack'." + "For 8bit, consider using 'embedding_bag_byte_prepack'."); - int NUM_ELEM_PER_BYTE = 8 / BIT_RATE; + int NUM_ELEM_PER_BYTE = 8 / bit_width; TORCH_CHECK( weight_contig.size(weight.dim() - 1) % NUM_ELEM_PER_BYTE == 0, - "qembeddingbag_" + c10::to_string(BIT_RATE) + - "bit_prepack only works for the number of columns a multiple of " - + c10::to_string(NUM_ELEM_PER_BYTE)); + "qembeddingbag_" + c10::to_string(bit_width) + + "bit_prepack only works for the number of columns a multiple of " + + c10::to_string(NUM_ELEM_PER_BYTE)); // The "fused" representation stores the scale and bias with the // row-wise quantized data in one tensor. 
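To make the fused layout mentioned in the comment above concrete: each prepacked row holds the packed sub-byte values followed by a 2-byte fp16 scale and a 2-byte fp16 zero point. A small sketch of the per-row size arithmetic, assuming that layout (the helper name and example sizes are illustrative):

#include <cstdint>
#include <cstdio>

// Width in bytes of one prepacked row: ceil(cols / elems-per-byte) packed
// values, then an fp16 scale and an fp16 zero_point (2 bytes each).
int64_t fused_row_bytes(int64_t embedding_cols, int bit_width) {
  const int64_t num_elem_per_byte = 8 / bit_width;
  const int64_t packed = (embedding_cols + num_elem_per_byte - 1) / num_elem_per_byte;
  return packed + 2 * static_cast<int64_t>(sizeof(uint16_t));  // fp16 is 2 bytes
}

int main() {
  // 128 columns at 4 bits: 64 packed bytes + 4 bytes of qparams = 68.
  std::printf("%lld\n", (long long)fused_row_bytes(128, 4));
  // 128 columns at 2 bits: 32 packed bytes + 4 bytes of qparams = 36.
  std::printf("%lld\n", (long long)fused_row_bytes(128, 2));
}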
@@ -172,55 +181,75 @@ Tensor _qembeddingbag_nbit_prepack_helper(const Tensor& weight, int BIT_RATE) { weight_contig.options().dtype(at::kByte), weight_contig.suggest_memory_format()); auto* output_data = output.data_ptr(); - const auto output_columns = output.size(output.dim() - 1); - - for (int row = 0; row < embedding_rows; ++row) { - const float* input_row = weight_data + row * embedding_cols; - std::uint8_t* output_row = output_data + row * output_columns; - float Xmin = *std::min_element(input_row, input_row + embedding_cols); - float Xmax = *std::max_element(input_row, input_row + embedding_cols); - - Xmin = static_cast(Xmin); - const float range = Xmax - Xmin; - - // Set scale to 1.0f for the corner case of Xmax == Xmin . - // Any non-zero scale would work because during quantization - // (X - Xmin) / scale will be 0 for all X unless scale is 0. - at::Half scale = range == 0 ? 1.0f : range / ((1 << BIT_RATE) - 1); - float inverse_scale = scale == 0 ? 1.0f : 1.0f / scale; - if (scale == 0 || std::isinf(inverse_scale)) { - // Corner case handling when Xmax == Xmin - // Any scale would work because X - Xmin will be 0 for all X - scale = 1.0f; - inverse_scale = 1.0f; - } - - // Update the scale and zero_point of each row. - at::Half* output_row_scale_zp = reinterpret_cast( - output_row + - (embedding_cols + NUM_ELEM_PER_BYTE - 1) / NUM_ELEM_PER_BYTE); - - output_row_scale_zp[0] = scale; - output_row_scale_zp[1] = Xmin; - - // Pack the weight values. - for (int col = 0; col < embedding_cols; ++col) { - float X = input_row[col]; - std::uint8_t quantized = std::max( - 0, - std::min( - lrintf((X - Xmin) * inverse_scale), (1 << BIT_RATE) - 1)); - // We pack 2 4-bit values in a byte. Index 0 is packed in the lower 4-bits - // and index 1 is packed in the upper 4-bits. - if (col % NUM_ELEM_PER_BYTE == 0) { - output_row[col / NUM_ELEM_PER_BYTE] = quantized; +#ifdef USE_FBGEMM + if (!optimized_qparams) { + fbgemm::FloatToFusedNBitRowwiseQuantizedSBHalf( + bit_width, weight_data, embedding_rows, embedding_cols, output_data); + } else { +#endif // USE_FBGEMM + const auto output_columns = output.size(output.dim() - 1); + + for (int row = 0; row < embedding_rows; ++row) { + const float* input_row = weight_data + row * embedding_cols; + std::uint8_t* output_row = output_data + row * output_columns; + + float Xmin, Xmax; + if (optimized_qparams) { + at::Tensor xmax_tensor, xmin_tensor; + std::tie(xmax_tensor, xmin_tensor) = at::choose_qparams_optimized( + weight_contig[row], embedding_cols, 200, 0.16, bit_width); + TORCH_CHECK( + xmax_tensor.numel() == 1 && xmin_tensor.numel() == 1, + "Expected choose_qparams_optimized to return min/max tensors of size 1"); + Xmax = xmax_tensor.item(); + Xmin = xmin_tensor.item(); } else { - output_row[col / NUM_ELEM_PER_BYTE] |= - (quantized << ((col % NUM_ELEM_PER_BYTE) * BIT_RATE)); + Xmin = *std::min_element(input_row, input_row + embedding_cols); + Xmax = *std::max_element(input_row, input_row + embedding_cols); } - } // embedding_cols - } // embedding_rows + Xmin = static_cast(Xmin); + float range = Xmax - Xmin; + // Set scale to 1.0f for the corner case of Xmax == Xmin . + // Any non-zero scale would work because during quantization + // (X - Xmin) / scale will be 0 for all X unless scale is 0. + at::Half scale = range == 0 ? 1.0f : range / ((1 << bit_width) - 1); + float inverse_scale = scale == 0 ? 
1.0f : 1.0f / scale; + if (scale == 0 || std::isinf(inverse_scale)) { + // Corner case handling when Xmax == Xmin + // Any scale would work because X - Xmin will be 0 for all X + scale = 1.0f; + inverse_scale = 1.0f; + } + // Update the scale and zero_point of each row. + at::Half* output_row_scale_zp = reinterpret_cast( + output_row + + (embedding_cols + NUM_ELEM_PER_BYTE - 1) / NUM_ELEM_PER_BYTE); + + output_row_scale_zp[0] = scale; + output_row_scale_zp[1] = Xmin; + + // Pack the weight values. + for (int col = 0; col < embedding_cols; ++col) { + float X = input_row[col]; + std::uint8_t quantized = std::max( + 0, + std::min( + lrintf((X - Xmin) * inverse_scale), (1 << bit_width) - 1)); + // We pack 2 4-bit values in a byte. Index 0 is packed in the lower + // 4-bits and index 1 is packed in the upper 4-bits. + if (col % NUM_ELEM_PER_BYTE == 0) { + output_row[col / NUM_ELEM_PER_BYTE] = quantized; + } else { + output_row[col / NUM_ELEM_PER_BYTE] |= + (quantized << ((col % NUM_ELEM_PER_BYTE) * bit_width)); + } + } // embedding_cols + } // embedding_rows +#ifdef USE_FBGEMM + } +#endif // USE_FBGEMM + return output; } @@ -231,8 +260,11 @@ Tensor _qembeddingbag_nbit_prepack_helper(const Tensor& weight, int BIT_RATE) { // To later de-quantize values, the scale (range / 15) and zero_point // are stored alongside the data. More precisely, each row first has quantized // values, and then 2-byte fp16 scale and 2-byte zero_offset. -Tensor qembeddingbag_4bit_prepack(const Tensor& weight) { - return _qembeddingbag_nbit_prepack_helper(weight, 4 /*BIT_RATE*/); +Tensor qembeddingbag_4bit_prepack( + const Tensor& weight, + bool optimized_qparams) { + return _qembeddingbag_nbit_prepack_helper( + weight, 4 /*bit_width*/, optimized_qparams); } // Applies 2-bit row-wise quantization by determining the range @@ -243,8 +275,11 @@ Tensor qembeddingbag_4bit_prepack(const Tensor& weight) { // are stored alongside the data. More precisely, each row first has quantized // values, and then 2-byte fp16 scale and 2-byte zero_offset. // TODO() - Add 2Bit Embedding Lookup operator. 
-Tensor qembeddingbag_2bit_prepack(const Tensor& weight) { - return _qembeddingbag_nbit_prepack_helper(weight, 2 /*BIT_RATE*/); +Tensor qembeddingbag_2bit_prepack( + const Tensor& weight, + bool optimized_qparams) { + return _qembeddingbag_nbit_prepack_helper( + weight, 2 /*bit_width*/, optimized_qparams); } class QEmbeddingPackWeights final { @@ -255,13 +290,13 @@ class QEmbeddingPackWeights final { }; TORCH_LIBRARY_IMPL(quantized, CPU, m) { - m.impl("embedding_bag_byte_prepack", qembeddingbag_byte_prepack); - m.impl("embedding_bag_4bit_prepack", qembeddingbag_4bit_prepack); - m.impl("embedding_bag_2bit_prepack", qembeddingbag_2bit_prepack); + m.impl(TORCH_SELECTIVE_NAME("quantized::embedding_bag_byte_prepack"), TORCH_FN(qembeddingbag_byte_prepack)); + m.impl(TORCH_SELECTIVE_NAME("quantized::embedding_bag_4bit_prepack"), TORCH_FN(qembeddingbag_4bit_prepack)); + m.impl(TORCH_SELECTIVE_NAME("quantized::embedding_bag_2bit_prepack"), TORCH_FN(qembeddingbag_2bit_prepack)); } TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) { - m.impl("embedding_bag_prepack", TORCH_FN(QEmbeddingPackWeights::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::embedding_bag_prepack"), TORCH_FN(QEmbeddingPackWeights::run)); } } // namespace diff --git a/aten/src/ATen/native/quantized/cpu/qembeddingbag_unpack.cpp b/aten/src/ATen/native/quantized/cpu/qembeddingbag_unpack.cpp index 72d42c61d0e5..ca3d9dc71c7e 100644 --- a/aten/src/ATen/native/quantized/cpu/qembeddingbag_unpack.cpp +++ b/aten/src/ATen/native/quantized/cpu/qembeddingbag_unpack.cpp @@ -73,6 +73,10 @@ Tensor qembeddingbag_byte_unpack(const Tensor& packed_weight) { packed_weight.suggest_memory_format()); float* output_data = output.data_ptr(); +#ifdef USE_FBGEMM + fbgemm::Fused8BitRowwiseQuantizedSBFloatToFloat( + input, input_rows, input_columns, output_data); +#else for (std::size_t row = 0; row < input_rows; ++row) { const std::uint8_t* input_row = input + row * input_columns; const float* input_row_scale_zp = @@ -84,14 +88,17 @@ Tensor qembeddingbag_byte_unpack(const Tensor& packed_weight) { input_row[col] * input_row_scale_zp[0] + input_row_scale_zp[1]; } // output_columns } // input_rows +#endif // USE_FBGEMM return output; } -Tensor _qembeddingbag_nbit_unpack_helper(const Tensor& packed_weight, int BIT_RATE) { +Tensor _qembeddingbag_nbit_unpack_helper( + const Tensor& packed_weight, + int BIT_RATE) { const auto input_rows = packed_weight.size(0); const auto input_columns = packed_weight.size(1); const auto* input_data = packed_weight.data_ptr(); - int NUM_ELEM_PER_BYTE = 8/BIT_RATE; + int NUM_ELEM_PER_BYTE = 8 / BIT_RATE; // The last 4 bytes per row are two fp16 scale and zero_point. // The rest of input_columns is the number of values in the original row. 
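Going the other way, the unpack helper below recovers the original column count from the fused row width described in its comment, and note that in this format dequantization is additive (scale * q + zero_point, where zero_point is the row minimum), unlike the (q - zero_point) * scale form used by the affine kernels earlier in the patch. A short sketch of the column arithmetic, valid when the column count is a multiple of the elements-per-byte value, which the prepack path enforces:

#include <cstdint>
#include <cstdio>

// Original (unpacked) column count of a fused n-bit row that is `input_columns`
// bytes wide: drop the trailing fp16 scale and fp16 zero_point, then expand the
// packed bytes back into individual values.
int64_t unpacked_cols(int64_t input_columns, int bit_width) {
  const int64_t num_elem_per_byte = 8 / bit_width;
  return (input_columns - 2 * static_cast<int64_t>(sizeof(uint16_t))) * num_elem_per_byte;
}

int main() {
  std::printf("%lld\n", (long long)unpacked_cols(68, 4));  // 128, inverting the 4-bit example above
  std::printf("%lld\n", (long long)unpacked_cols(36, 2));  // 128
}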
@@ -105,6 +112,10 @@ Tensor _qembeddingbag_nbit_unpack_helper(const Tensor& packed_weight, int BIT_RA packed_weight.options().dtype(kFloat), packed_weight.suggest_memory_format()); float* output_data = output.data_ptr(); +#ifdef USE_FBGEMM + fbgemm::FusedNBitRowwiseQuantizedSBHalfToFloat( + BIT_RATE, input_data, input_rows, input_columns, output_data); +#else auto output_columns = output_dimensions[1]; for (size_t row = 0; row < input_rows; ++row) { float* output_row = output_data + row * output_columns; @@ -122,6 +133,8 @@ Tensor _qembeddingbag_nbit_unpack_helper(const Tensor& packed_weight, int BIT_RA output_row[col] = scale * quantized + zero_point; } // output_columns } // input_rows +#endif // USE_FBGEMM + return output; } @@ -158,15 +171,15 @@ class QEmbeddingUnpackWeights final { }; TORCH_LIBRARY_IMPL(quantized, CPU, m) { - m.impl("embedding_bag_byte_unpack", qembeddingbag_byte_unpack); - m.impl("embedding_bag_4bit_unpack", qembeddingbag_4bit_unpack); - m.impl("embedding_bag_2bit_unpack", qembeddingbag_2bit_unpack); + m.impl(TORCH_SELECTIVE_NAME("quantized::embedding_bag_byte_unpack"), qembeddingbag_byte_unpack); + m.impl(TORCH_SELECTIVE_NAME("quantized::embedding_bag_4bit_unpack"), qembeddingbag_4bit_unpack); + m.impl(TORCH_SELECTIVE_NAME("quantized::embedding_bag_2bit_unpack"), qembeddingbag_2bit_unpack); } TORCH_LIBRARY_IMPL(quantized, CatchAll, m) { // Unpack the packed embedding_bag weights using TorchBind custom class. // TODO extend to support 4-bit qtensor. - m.impl("embedding_bag_unpack", TORCH_FN(QEmbeddingUnpackWeights::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::embedding_bag_unpack"), TORCH_FN(QEmbeddingUnpackWeights::run)); } } // namespace diff --git a/aten/src/ATen/native/quantized/cpu/qhardswish.cpp b/aten/src/ATen/native/quantized/cpu/qhardswish.cpp index f0dbd644b2be..064b88a8c91f 100644 --- a/aten/src/ATen/native/quantized/cpu/qhardswish.cpp +++ b/aten/src/ATen/native/quantized/cpu/qhardswish.cpp @@ -85,7 +85,7 @@ Tensor quantized_hardswish(const Tensor& qx, double output_scale, int64_t output } TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) { - m.impl("hardswish", TORCH_FN(quantized_hardswish)); + m.impl(TORCH_SELECTIVE_NAME("quantized::hardswish"), TORCH_FN(quantized_hardswish)); } }} // namespace at::native diff --git a/aten/src/ATen/native/quantized/cpu/qlinear.cpp b/aten/src/ATen/native/quantized/cpu/qlinear.cpp index fdc6d1dd4d8b..a7b4f4b74357 100644 --- a/aten/src/ATen/native/quantized/cpu/qlinear.cpp +++ b/aten/src/ATen/native/quantized/cpu/qlinear.cpp @@ -397,12 +397,12 @@ class QLinearInt8 final { }; TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) { - m.impl("linear", TORCH_FN(QLinearInt8::run)); - m.impl("linear_relu", TORCH_FN(QLinearInt8::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::linear"), TORCH_FN(QLinearInt8::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::linear_relu"), TORCH_FN(QLinearInt8::run)); } TORCH_LIBRARY_IMPL(_quantized, QuantizedCPU, m) { - m.impl("linear", TORCH_FN(QLinearInt8::run)); + m.impl(TORCH_SELECTIVE_NAME("_quantized::linear"), TORCH_FN(QLinearInt8::run)); } } // namespace diff --git a/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp b/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp index 2accf060deab..af2d7749ee50 100644 --- a/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp +++ b/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp @@ -455,13 +455,13 @@ class QLinearDynamicFp16 final { }; TORCH_LIBRARY_IMPL(quantized, CPU, m) { - m.impl("linear_dynamic", TORCH_FN(QLinearDynamicInt8::run)); - 
m.impl("linear_relu_dynamic", TORCH_FN(QLinearDynamicInt8::run)); - m.impl("linear_dynamic_fp16", TORCH_FN(QLinearDynamicFp16::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::linear_dynamic"), TORCH_FN(QLinearDynamicInt8::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::linear_relu_dynamic"), TORCH_FN(QLinearDynamicInt8::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::linear_dynamic_fp16"), TORCH_FN(QLinearDynamicFp16::run)); } TORCH_LIBRARY_IMPL(_quantized, CPU, m) { - m.impl("linear_dynamic", TORCH_FN(QLinearDynamicInt8::run)); + m.impl(TORCH_SELECTIVE_NAME("_quantized::linear_dynamic"), TORCH_FN(QLinearDynamicInt8::run)); } } // namespace diff --git a/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp b/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp index ee4b6ee2aaf6..23912f87d123 100644 --- a/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp +++ b/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp @@ -318,22 +318,22 @@ class QLinearPackWeightFp16Legacy final { }; TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) { - m.impl("linear_prepack", TORCH_FN(QLinearPackWeightInt8::run)); - m.impl("linear_prepack_legacy", TORCH_FN(QLinearPackWeightInt8Legacy::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::linear_prepack"), TORCH_FN(QLinearPackWeightInt8::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::linear_prepack_legacy"), TORCH_FN(QLinearPackWeightInt8Legacy::run)); } TORCH_LIBRARY_IMPL(quantized, CPU, m) { - m.impl("linear_prepack_fp16", TORCH_FN(QLinearPackWeightFp16::run)); - m.impl("linear_prepack_fp16_legacy", TORCH_FN(QLinearPackWeightFp16Legacy::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::linear_prepack_fp16"), TORCH_FN(QLinearPackWeightFp16::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::linear_prepack_fp16_legacy"), TORCH_FN(QLinearPackWeightFp16Legacy::run)); } TORCH_LIBRARY_IMPL(_quantized, QuantizedCPU, m) { - m.impl("linear_prepack", TORCH_FN(QLinearPackWeightInt8::run)); + m.impl(TORCH_SELECTIVE_NAME("_quantized::linear_prepack"), TORCH_FN(QLinearPackWeightInt8::run)); } TORCH_LIBRARY_IMPL(_quantized, CPU, m) { - m.impl("linear_prepack_fp16", TORCH_FN(QLinearPackWeightFp16::run)); - m.impl("linear_prepack_fp16_legacy", TORCH_FN(QLinearPackWeightFp16Legacy::run)); + m.impl(TORCH_SELECTIVE_NAME("_quantized::linear_prepack_fp16"), TORCH_FN(QLinearPackWeightFp16::run)); + m.impl(TORCH_SELECTIVE_NAME("_quantized::linear_prepack_fp16_legacy"), TORCH_FN(QLinearPackWeightFp16Legacy::run)); } } // namespace diff --git a/aten/src/ATen/native/quantized/cpu/qlinear_unpack.cpp b/aten/src/ATen/native/quantized/cpu/qlinear_unpack.cpp index 1bc8711a22f4..ecbae04dd957 100644 --- a/aten/src/ATen/native/quantized/cpu/qlinear_unpack.cpp +++ b/aten/src/ATen/native/quantized/cpu/qlinear_unpack.cpp @@ -137,13 +137,13 @@ class QLinearUnpackWeightFp16Legacy final { }; TORCH_LIBRARY_IMPL(quantized, CPU, m) { - m.impl("linear_unpack.legacy", TORCH_FN(QLinearUnpackWeightInt8Legacy::run)); - m.impl("linear_unpack_fp16.legacy", TORCH_FN(QLinearUnpackWeightFp16Legacy::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::linear_unpack.legacy"), TORCH_FN(QLinearUnpackWeightInt8Legacy::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::linear_unpack_fp16.legacy"), TORCH_FN(QLinearUnpackWeightFp16Legacy::run)); } TORCH_LIBRARY_IMPL(quantized, CatchAll, m) { - m.impl("linear_unpack", TORCH_FN(QLinearUnpackWeightInt8::run)); - m.impl("linear_unpack_fp16", TORCH_FN(QLinearUnpackWeightFp16::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::linear_unpack"), 
TORCH_FN(QLinearUnpackWeightInt8::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::linear_unpack_fp16"), TORCH_FN(QLinearUnpackWeightFp16::run)); } } // namespace diff --git a/aten/src/ATen/native/quantized/cpu/qmul.cpp b/aten/src/ATen/native/quantized/cpu/qmul.cpp index 13aa8acc669a..deeae36dc502 100644 --- a/aten/src/ATen/native/quantized/cpu/qmul.cpp +++ b/aten/src/ATen/native/quantized/cpu/qmul.cpp @@ -173,26 +173,26 @@ class QMulScalarTensorOut final { }; TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) { - m.impl("mul", TORCH_FN(QMul::run)); - m.impl("mul.out", TORCH_FN(QMulOut::run)); - m.impl("mul.Scalar", TORCH_FN(QMulScalar::run)); - m.impl("mul.Scalar_out", TORCH_FN(QMulScalarOut::run)); - m.impl("mul_relu", TORCH_FN(QMul::run)); - m.impl("mul_relu.out", TORCH_FN(QMulOut::run)); - m.impl("mul_relu.Scalar", TORCH_FN(QMulScalar::run)); - m.impl("mul_relu.Scalar_out", TORCH_FN(QMulScalarOut::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::mul"), TORCH_FN(QMul::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::mul.out"), TORCH_FN(QMulOut::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::mul.Scalar"), TORCH_FN(QMulScalar::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::mul.Scalar_out"), TORCH_FN(QMulScalarOut::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::mul_relu"), TORCH_FN(QMul::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::mul_relu.out"), TORCH_FN(QMulOut::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::mul_relu.Scalar"), TORCH_FN(QMulScalar::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::mul_relu.Scalar_out"), TORCH_FN(QMulScalarOut::run)); // deprecated functions, kept for backward compatibility - m.impl("mul_out", TORCH_FN(QMulOut::run)); - m.impl("mul_relu_out", TORCH_FN(QMulOut::run)); - m.impl("mul_scalar", TORCH_FN(QMulScalar::run)); - m.impl("mul_scalar_relu", TORCH_FN(QMulScalar::run)); - m.impl("mul_scalar_out", TORCH_FN(QMulScalarOut::run)); - m.impl("mul_scalar_relu_out", TORCH_FN(QMulScalarOut::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::mul_out"), TORCH_FN(QMulOut::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::mul_relu_out"), TORCH_FN(QMulOut::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::mul_scalar"), TORCH_FN(QMulScalar::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::mul_scalar_relu"), TORCH_FN(QMulScalar::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::mul_scalar_out"), TORCH_FN(QMulScalarOut::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::mul_scalar_relu_out"), TORCH_FN(QMulScalarOut::run)); // TODO: remove after broadcasting is supported - m.impl("mul_scalar.Tensor", TORCH_FN(QMulScalarTensor::run)); - m.impl("mul_scalar_relu.Tensor", TORCH_FN(QMulScalarTensor::run)); - m.impl("mul_scalar_out.Tensor", TORCH_FN(QMulScalarTensorOut::run)); - m.impl("mul_scalar_relu_out.Tensor", TORCH_FN(QMulScalarTensorOut::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::mul_scalar.Tensor"), TORCH_FN(QMulScalarTensor::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::mul_scalar_relu.Tensor"), TORCH_FN(QMulScalarTensor::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::mul_scalar_out.Tensor"), TORCH_FN(QMulScalarTensorOut::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::mul_scalar_relu_out.Tensor"), TORCH_FN(QMulScalarTensorOut::run)); } } // namespace diff --git a/aten/src/ATen/native/quantized/cpu/qnormalization.cpp b/aten/src/ATen/native/quantized/cpu/qnormalization.cpp index f5bef2b93a0a..6ed193cd82c9 100644 --- a/aten/src/ATen/native/quantized/cpu/qnormalization.cpp +++ b/aten/src/ATen/native/quantized/cpu/qnormalization.cpp @@ -120,7 +120,7 @@ Tensor 
quantized_instance_norm_impl( TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) { // TODO: this is kind of... blegh - m.impl("layer_norm", []( + m.impl(TORCH_SELECTIVE_NAME("quantized::layer_norm"), []( Tensor input, std::vector normalized_shape, // because IntArrayRef doesn't work c10::optional weight, @@ -134,7 +134,7 @@ TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) { bias.has_value() ? *bias : Tensor(), eps, output_scale, output_zero_point); }); - m.impl("group_norm", []( + m.impl(TORCH_SELECTIVE_NAME("quantized::group_norm"), []( Tensor qx, int64_t num_groups, c10::optional weight, @@ -148,7 +148,7 @@ TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) { bias.has_value() ? *bias : Tensor(), eps, output_scale, output_zero_point); }); - m.impl("instance_norm", []( + m.impl(TORCH_SELECTIVE_NAME("quantized::instance_norm"), []( Tensor qx, c10::optional weight, c10::optional bias, diff --git a/aten/src/ATen/native/quantized/cpu/qpool.cpp b/aten/src/ATen/native/quantized/cpu/qpool.cpp index f986ab4934b9..7fa56619609b 100644 --- a/aten/src/ATen/native/quantized/cpu/qpool.cpp +++ b/aten/src/ATen/native/quantized/cpu/qpool.cpp @@ -134,7 +134,12 @@ Tensor q_maxpool_2d( int64_t oC = iC; int64_t oH = pooling_output_shape(iH, kH, pH, sH, dH, ceil_mode); int64_t oW = pooling_output_shape(iW, kW, pW, sW, dW, ceil_mode); - TORCH_CHECK(oH > 0 && oW > 0, "the resulting Tensor is too small."); + TORCH_CHECK(oH > 0 && oW > 0, + "Given input size: (", + iC, "x", iH, "x", iW, + "). Calculated output size: (", + oC, "x", oH, "x", oW, + "). Output size is too small."); std::vector oSizes; if (ndim == 3) { @@ -232,7 +237,7 @@ void check_maxpool2d_params( } #ifdef USE_PYTORCH_QNNPACK - static Tensor qnnpack_maxpool( + static Tensor qnnpack_maxpool2d( Tensor input, IntArrayRef kernel_size, IntArrayRef stride, @@ -243,23 +248,23 @@ void check_maxpool2d_params( TORCH_CHECK( input.ndimension() == 4, - "qnnpack_maxpool(): Expected input to be 4-dimensional: got ", + "qnnpack_maxpool2d(): Expected input to be 4-dimensional: got ", input.ndimension()); TORCH_CHECK( kernel_size.size() == 2, - "qnnpack_maxpool(): Expected kernel_size to be 2-dimensional: got ", + "qnnpack_maxpool2d(): Expected kernel_size to be 2-dimensional: got ", kernel_size.size()); TORCH_CHECK( stride.size() == 2, - "qnnpack_maxpool(): Expected stride to be 2-dimensional: got ", + "qnnpack_maxpool2d(): Expected stride to be 2-dimensional: got ", stride.size()); TORCH_CHECK( dilation.size() == 2, - "qnnpack_maxpool(): Expected dilation to be 2-dimensional: got ", + "qnnpack_maxpool2d(): Expected dilation to be 2-dimensional: got ", dilation.size()); TORCH_CHECK( padding.size() == 2, - "qnnpack_maxpool(): Expected padding to be 2-dimensional: got ", + "qnnpack_maxpool2d(): Expected padding to be 2-dimensional: got ", padding.size()); int64_t batch_size = input.size(0); @@ -284,10 +289,10 @@ void check_maxpool2d_params( TORCH_CHECK( kH > 0 && kW > 0, - "qnnpack_maxpool(): kernel_size should be greater than zero."); + "qnnpack_maxpool2d(): kernel_size should be greater than zero."); TORCH_CHECK( strideH > 0 && strideW > 0, - "qnnpack_maxpool(): strides should be greater than zero."); + "qnnpack_maxpool2d(): strides should be greater than zero."); const pytorch_qnnp_status createStatus = pytorch_qnnp_create_max_pooling2d_nhwc_u8( @@ -318,7 +323,7 @@ void check_maxpool2d_params( TORCH_CHECK( outH > 0 && outW > 0, - "qnnpack_maxpool(): the resulting output Tensor size should be >= 0"); + "qnnpack_maxpool2d(): the resulting output Tensor size should be >= 0"); 
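The improved TORCH_CHECK message above reports the computed output size. For reference, the floor-mode output length follows the standard pooling formula, which pooling_output_shape is assumed to implement here (ceil_mode additionally rounds the division up). A minimal sketch:

#include <cstdint>
#include <cstdio>

// Floor-mode pooling output length for one spatial dimension.
int64_t pool_out_size(int64_t input, int64_t kernel, int64_t pad,
                      int64_t stride, int64_t dilation) {
  return (input + 2 * pad - dilation * (kernel - 1) - 1) / stride + 1;
}

int main() {
  // A 2-wide window over a 1-wide input yields a non-positive size, which is
  // exactly the case the new error message reports.
  std::printf("%lld\n", (long long)pool_out_size(1, 2, 0, 1, 1));  // 0
  std::printf("%lld\n", (long long)pool_out_size(7, 3, 1, 2, 1));  // 4
}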
std::unique_ptr qnnpack_uniq_ptr(qnnpack_operator); @@ -375,7 +380,7 @@ Tensor quantized_max_pool2d( } #ifdef USE_PYTORCH_QNNPACK if (at::globalContext().qEngine() == at::QEngine::QNNPACK && qx.scalar_type() == kQUInt8 && !ceil_mode) { - return qnnpack_maxpool(qx, kernel_size, stride, padding, dilation, ceil_mode); + return qnnpack_maxpool2d(qx, kernel_size, stride, padding, dilation, ceil_mode); } #endif Tensor qy; @@ -395,9 +400,37 @@ Tensor quantized_max_pool2d( return qy; } +// Quantized max_pool1d is a special case of the max_pool2d, with one of the +// dimensions and kernels removed. +Tensor quantized_max_pool1d( + const Tensor& qx, + IntArrayRef kernel_size, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + bool ceil_mode) { + // (C, L) -> (C, 1, L) => kSqueezeDim = 1 + // (N, C, L) -> (N, C, 1, L) => kSqueezeDim = 2 + const int32_t kSqueezeDim = qx.dim() - 1; + const auto qx_unsqueeze = qx.unsqueeze(kSqueezeDim); + if (stride.empty()) { + stride = kernel_size; + } + auto qy = at::quantized_max_pool2d( + qx.unsqueeze(kSqueezeDim), + {1, kernel_size[0]}, + {1, stride[0]}, + {0, padding[0]}, + {1, dilation[0]}, + ceil_mode); + qy = qy.squeeze(kSqueezeDim); + return qy; +} + // Keep the registry in the anonymous namespace. namespace { -class QMaxPool2D_arr_args final { +template +class QMaxPool_arr_args final { public: static Tensor run( Tensor qx, @@ -406,17 +439,20 @@ class QMaxPool2D_arr_args final { std::vector padding, std::vector dilation, bool ceil_mode) { - #ifdef USE_PYTORCH_QNNPACK - if (at::globalContext().qEngine() == at::QEngine::QNNPACK && qx.scalar_type() == kQUInt8 && !ceil_mode) { - return qnnpack_maxpool(qx, kernel_size, stride, padding, dilation, ceil_mode); + if (kSpatialDim == 1) { + return at::quantized_max_pool1d(qx, kernel_size, stride, padding, + dilation, ceil_mode); + } else if (kSpatialDim == 2) { + return at::quantized_max_pool2d(qx, kernel_size, stride, padding, + dilation, ceil_mode); } - #endif - return at::max_pool2d(qx, kernel_size, stride, padding, dilation, ceil_mode); + TORCH_CHECK(false, "MaxPool", kSpatialDim, "D is not supported."); } }; TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) { - m.impl("max_pool2d", TORCH_FN(QMaxPool2D_arr_args::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::max_pool1d"), TORCH_FN(QMaxPool_arr_args<1>::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::max_pool2d"), TORCH_FN(QMaxPool_arr_args<2>::run)); } } // namespace diff --git a/aten/src/ATen/native/quantized/cpu/qreduction.cpp b/aten/src/ATen/native/quantized/cpu/qreduction.cpp index 739638b7a67e..74b266114230 100644 --- a/aten/src/ATen/native/quantized/cpu/qreduction.cpp +++ b/aten/src/ATen/native/quantized/cpu/qreduction.cpp @@ -83,7 +83,14 @@ Tensor& mean_out_quantized_cpu( c10::optional opt_dtype) { #ifdef USE_PYTORCH_QNNPACK if (at::globalContext().qEngine() == at::QEngine::QNNPACK && - self.scalar_type() == kQUInt8) { + self.scalar_type() == kQUInt8 && + // QNNPACK currently is only supported for NCHW + dim=(2, 3) + // Remove these checks after generic version is implemented. 
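The quantized_max_pool1d added above reuses the 2d kernel by inserting a dummy spatial dimension. The sketch below shows the same bookkeeping with ordinary float tensors, purely to illustrate the unsqueeze/squeeze and parameter padding; it is not part of the patch:

#include <ATen/ATen.h>
#include <cassert>

int main() {
  at::Tensor x = at::randn({2, 3, 16});     // (N, C, L)
  const int64_t kSqueezeDim = x.dim() - 1;  // 2 for (N, C, L); 1 for a (C, L) input
  at::Tensor y = at::max_pool2d(
                     x.unsqueeze(kSqueezeDim),  // (N, C, 1, L)
                     /*kernel_size=*/{1, 2},
                     /*stride=*/{1, 2},
                     /*padding=*/{0, 0},
                     /*dilation=*/{1, 1},
                     /*ceil_mode=*/false)
                     .squeeze(kSqueezeDim);     // back to (N, C, L_out)
  assert(y.dim() == 3 && y.size(2) == 8);
  // The dedicated 1d op produces the same result.
  assert(y.equal(at::max_pool1d(x, {2}, {2}, {0}, {1}, false)));
}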
+ self.ndimension() == 4 && + dim.size() == 2 && + dim[0] == 2 && + dim[1] == 3 + ){ result = qnnpack_mean(self, dim); return result; } diff --git a/aten/src/ATen/native/quantized/cpu/qrelu.cpp b/aten/src/ATen/native/quantized/cpu/qrelu.cpp index 447e5cb23af5..ca03081a1a25 100644 --- a/aten/src/ATen/native/quantized/cpu/qrelu.cpp +++ b/aten/src/ATen/native/quantized/cpu/qrelu.cpp @@ -113,7 +113,7 @@ Tensor& leaky_relu_out_quantized_cpu(Tensor& result, const Tensor& self, return result; } -Tensor heaky_relu_quantized_cpu(const Tensor& self, Scalar negval) { +Tensor leaky_relu_quantized_cpu(const Tensor& self, Scalar negval) { const auto qx = self.contiguous(self.suggest_memory_format()); auto qy = at::_empty_affine_quantized(qx.sizes(), at::device(kCPU).dtype(self.scalar_type()), @@ -170,8 +170,27 @@ class QRelu6 final { } }; +class QLeakyRelu final { + public: + static Tensor run(Tensor self, Scalar negative_slope, bool inplace, double output_scale, int64_t output_zero_point) { + // inplace argument is ignored now, TODO:support inplace + if (inplace) { + TORCH_WARN("inplace=True is not supported for quantized::leaky_relu yet"); + } + const auto qx = self.contiguous(self.suggest_memory_format()); + auto qy = at::_empty_affine_quantized(qx.sizes(), + at::device(kCPU).dtype(self.scalar_type()), + output_scale, + output_zero_point, + self.suggest_memory_format()); + qrelu_leaky_stub(self.device().type(), qy, qx, negative_slope); + return qy; + } +}; + TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) { - m.impl("relu6", TORCH_FN(QRelu6::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::relu6"), TORCH_FN(QRelu6::run)); + m.impl(TORCH_SELECTIVE_NAME("quantized::leaky_relu"), TORCH_FN(QLeakyRelu::run)); } } // namespace diff --git a/aten/src/ATen/native/quantized/cpu/qthreshold.cpp b/aten/src/ATen/native/quantized/cpu/qthreshold.cpp index 281274d27be2..a42da4081c71 100644 --- a/aten/src/ATen/native/quantized/cpu/qthreshold.cpp +++ b/aten/src/ATen/native/quantized/cpu/qthreshold.cpp @@ -35,7 +35,7 @@ Tensor threshold_quantized_cpu( } TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) { - m.impl("threshold", TORCH_FN(threshold_quantized_cpu)); + m.impl(TORCH_SELECTIVE_NAME("quantized::threshold"), TORCH_FN(threshold_quantized_cpu)); } } // namespace native diff --git a/aten/src/ATen/native/quantized/library.cpp b/aten/src/ATen/native/quantized/library.cpp index c8e247b42365..dceb06b05d4a 100644 --- a/aten/src/ATen/native/quantized/library.cpp +++ b/aten/src/ATen/native/quantized/library.cpp @@ -20,174 +20,158 @@ TORCH_LIBRARY(quantized, m) { register_conv_params<3>(); register_embedding_params(); - m.def("add(Tensor qa, Tensor qb, float scale, int zero_point) -> Tensor qc"); - m.def("add.out(Tensor qa, Tensor qb, Tensor(a!) out) -> Tensor(a!) out"); - m.def("add.Scalar(Tensor qa, Scalar b) -> Tensor qc"); - m.def("add.Scalar_out(Tensor qa, Scalar b, Tensor(a!) out) -> Tensor(a!) out"); - m.def("add_relu(Tensor qa, Tensor qb, float scale, int zero_point) -> Tensor qc"); - m.def("add_relu.Scalar(Tensor qa, Scalar b) -> Tensor qc"); - m.def("add_relu.out(Tensor qa, Tensor qb, Tensor(a!) out) -> Tensor(a!) out"); - m.def("add_relu.Scalar_out(Tensor qa, Scalar b, Tensor(a!) out) -> Tensor(a!) out"); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::add(Tensor qa, Tensor qb, float scale, int zero_point) -> Tensor qc")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::add.out(Tensor qa, Tensor qb, Tensor(a!) out) -> Tensor(a!) 
out")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::add.Scalar(Tensor qa, Scalar b) -> Tensor qc")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::add.Scalar_out(Tensor qa, Scalar b, Tensor(a!) out) -> Tensor(a!) out")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::add_relu(Tensor qa, Tensor qb, float scale, int zero_point) -> Tensor qc")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::add_relu.Scalar(Tensor qa, Scalar b) -> Tensor qc")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::add_relu.out(Tensor qa, Tensor qb, Tensor(a!) out) -> Tensor(a!) out")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::add_relu.Scalar_out(Tensor qa, Scalar b, Tensor(a!) out) -> Tensor(a!) out")); // deprecated functions, kept for backward compatibility - m.def("add_out(Tensor qa, Tensor qb, Tensor(a!) out) -> Tensor(a!) out"); - m.def("add_relu_out(Tensor qa, Tensor qb, Tensor(a!) out) -> Tensor(a!) out"); - m.def("add_scalar(Tensor qa, Scalar b) -> Tensor qc"); - m.def("add_scalar_relu(Tensor qa, Scalar b) -> Tensor qc"); - m.def("add_scalar_out(Tensor qa, Scalar b, Tensor(a!) out) -> Tensor(a!) out"); - m.def("add_scalar_relu_out(Tensor qa, Scalar b, Tensor(a!) out) -> Tensor(a!) out"); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::add_out(Tensor qa, Tensor qb, Tensor(a!) out) -> Tensor(a!) out")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::add_relu_out(Tensor qa, Tensor qb, Tensor(a!) out) -> Tensor(a!) out")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::add_scalar(Tensor qa, Scalar b) -> Tensor qc")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::add_scalar_relu(Tensor qa, Scalar b) -> Tensor qc")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::add_scalar_out(Tensor qa, Scalar b, Tensor(a!) out) -> Tensor(a!) out")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::add_scalar_relu_out(Tensor qa, Scalar b, Tensor(a!) out) -> Tensor(a!) out")); // TODO: remove after broadcasting is supported - m.def("add_scalar_out.Tensor(Tensor qa, Tensor b, Tensor(a!) out) -> Tensor(a!) out"); - m.def("add_scalar.Tensor(Tensor qa, Tensor b) -> Tensor qc"); - m.def("add_scalar_relu.Tensor(Tensor qa, Tensor b) -> Tensor qc"); - m.def("add_scalar_relu_out.Tensor(Tensor qa, Tensor b, Tensor(a!) out) -> Tensor(a!) out"); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::add_scalar_out.Tensor(Tensor qa, Tensor b, Tensor(a!) out) -> Tensor(a!) out")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::add_scalar.Tensor(Tensor qa, Tensor b) -> Tensor qc")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::add_scalar_relu.Tensor(Tensor qa, Tensor b) -> Tensor qc")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::add_scalar_relu_out.Tensor(Tensor qa, Tensor b, Tensor(a!) out) -> Tensor(a!) out")); // This is needed for graph mode quantization, when we fuse // dequant - aten::batch_norm - quant into quantized::batch_norm // and dimension is unknown given only the aten op call // quantized::batch_norm supports both 2d and 3d batch norm right now // it should also support 1d batch_norm after quantized::batch_norm1d is // implemented - m.def("batch_norm(Tensor qx, Tensor? weight, Tensor? bias, Tensor mean, Tensor var, float eps, float output_scale, int output_zero_point) -> Tensor"); - m.def("batch_norm_relu(Tensor qx, Tensor? weight, Tensor? bias, Tensor mean, Tensor var, float eps, float output_scale, int output_zero_point) -> Tensor"); - m.def("batch_norm1d(Tensor qx, Tensor? weight, Tensor? bias, Tensor mean, Tensor var, float eps, float output_scale, int output_zero_point) -> Tensor"); - m.def("batch_norm1d_relu(Tensor qx, Tensor? weight, Tensor? 
bias, Tensor mean, Tensor var, float eps, float output_scale, int output_zero_point) -> Tensor"); - m.def("batch_norm2d(Tensor qx, Tensor? weight, Tensor? bias, Tensor mean, Tensor var, float eps, float output_scale, int output_zero_point) -> Tensor"); - m.def("batch_norm2d_relu(Tensor qx, Tensor? weight, Tensor? bias, Tensor mean, Tensor var, float eps, float output_scale, int output_zero_point) -> Tensor"); - m.def("batch_norm3d(Tensor qx, Tensor? weight, Tensor? bias, Tensor mean, Tensor var, float eps, float output_scale, int output_zero_point) -> Tensor"); - m.def("batch_norm3d_relu(Tensor qx, Tensor? weight, Tensor? bias, Tensor mean, Tensor var, float eps, float output_scale, int output_zero_point) -> Tensor"); - m.def("clamp(Tensor qx, Scalar? min, Scalar? max) -> Tensor qy"); - m.def("threshold(Tensor qx, Scalar threshold, Scalar value) -> Tensor qy"); - m.def("cat(Tensor[] qx, int dim, float? scale, int? zero_point) -> Tensor"); - m.def("cat_relu(Tensor[] qx, int dim, float? scale, int? zero_point) -> Tensor"); - m.def("cat_out(Tensor[] qx, int dim, Tensor(a!) out) -> Tensor(a!)"); - m.def("cat_relu_out(Tensor[] qx, int dim, Tensor(a!) out) -> Tensor(a!)"); - m.def("conv1d(Tensor qx, __torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weight, float output_scale, int output_zero_point) -> Tensor"); - m.def("conv1d_relu(Tensor qx, __torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weight, float output_scale, int output_zero_point) -> Tensor"); - m.def("conv2d.new(Tensor qx, __torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weight, float output_scale, int output_zero_point) -> Tensor"); - m.def("conv2d_relu.new(Tensor qx, __torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weight, float output_scale, int output_zero_point) -> Tensor"); - m.def("conv3d.new(Tensor qx, __torch__.torch.classes.quantized.Conv3dPackedParamsBase packed_weight, float output_scale, int output_zero_point) -> Tensor"); - m.def("conv3d_relu.new(Tensor qx, __torch__.torch.classes.quantized.Conv3dPackedParamsBase packed_weight, float output_scale, int output_zero_point) -> Tensor"); - m.def("conv2d(Tensor qx, __torch__.torch.classes.quantized.Conv2dPackedParamsBase weight, int[] stride, int[] padding, int[] dilation, int groups, float output_scale, int output_zero_point) -> Tensor"); - m.def("conv2d_relu(Tensor qx, __torch__.torch.classes.quantized.Conv2dPackedParamsBase weight, int[] stride, int[] padding, int[] dilation, int groups, float output_scale, int output_zero_point) -> Tensor"); - m.def("conv3d(Tensor qx, __torch__.torch.classes.quantized.Conv3dPackedParamsBase weight, int[] stride, int[] padding, int[] dilation, int groups, float output_scale, int output_zero_point) -> Tensor"); - m.def("conv3d_relu(Tensor qx, __torch__.torch.classes.quantized.Conv3dPackedParamsBase weight, int[] stride, int[] padding, int[] dilation, int groups, float output_scale, int output_zero_point) -> Tensor"); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::batch_norm(Tensor qx, Tensor? weight, Tensor? bias, Tensor mean, Tensor var, float eps, float output_scale, int output_zero_point) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::batch_norm_relu(Tensor qx, Tensor? weight, Tensor? bias, Tensor mean, Tensor var, float eps, float output_scale, int output_zero_point) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::batch_norm1d(Tensor qx, Tensor? weight, Tensor? 
bias, Tensor mean, Tensor var, float eps, float output_scale, int output_zero_point) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::batch_norm1d_relu(Tensor qx, Tensor? weight, Tensor? bias, Tensor mean, Tensor var, float eps, float output_scale, int output_zero_point) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::batch_norm2d(Tensor qx, Tensor? weight, Tensor? bias, Tensor mean, Tensor var, float eps, float output_scale, int output_zero_point) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::batch_norm2d_relu(Tensor qx, Tensor? weight, Tensor? bias, Tensor mean, Tensor var, float eps, float output_scale, int output_zero_point) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::batch_norm3d(Tensor qx, Tensor? weight, Tensor? bias, Tensor mean, Tensor var, float eps, float output_scale, int output_zero_point) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::batch_norm3d_relu(Tensor qx, Tensor? weight, Tensor? bias, Tensor mean, Tensor var, float eps, float output_scale, int output_zero_point) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::clamp(Tensor qx, Scalar? min, Scalar? max) -> Tensor qy")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::threshold(Tensor qx, Scalar threshold, Scalar value) -> Tensor qy")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::cat(Tensor[] qx, int dim, float? scale, int? zero_point) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::cat_relu(Tensor[] qx, int dim, float? scale, int? zero_point) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::cat_out(Tensor[] qx, int dim, Tensor(a!) out) -> Tensor(a!)")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::cat_relu_out(Tensor[] qx, int dim, Tensor(a!) out) -> Tensor(a!)")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv1d(Tensor qx, __torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weight, float output_scale, int output_zero_point) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv1d_relu(Tensor qx, __torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weight, float output_scale, int output_zero_point) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv2d.new(Tensor qx, __torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weight, float output_scale, int output_zero_point) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv2d_relu.new(Tensor qx, __torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weight, float output_scale, int output_zero_point) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv3d.new(Tensor qx, __torch__.torch.classes.quantized.Conv3dPackedParamsBase packed_weight, float output_scale, int output_zero_point) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv3d_relu.new(Tensor qx, __torch__.torch.classes.quantized.Conv3dPackedParamsBase packed_weight, float output_scale, int output_zero_point) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv2d(Tensor qx, __torch__.torch.classes.quantized.Conv2dPackedParamsBase weight, int[] stride, int[] padding, int[] dilation, int groups, float output_scale, int output_zero_point) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv2d_relu(Tensor qx, __torch__.torch.classes.quantized.Conv2dPackedParamsBase weight, int[] stride, int[] padding, int[] dilation, int groups, float output_scale, int output_zero_point) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv3d(Tensor qx, __torch__.torch.classes.quantized.Conv3dPackedParamsBase weight, int[] stride, int[] padding, int[] 
dilation, int groups, float output_scale, int output_zero_point) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv3d_relu(Tensor qx, __torch__.torch.classes.quantized.Conv3dPackedParamsBase weight, int[] stride, int[] padding, int[] dilation, int groups, float output_scale, int output_zero_point) -> Tensor")); // conv_prepack is deprecated, please use conv2d_prepack for 2D conv. - m.def("conv_prepack(Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups) -> __torch__.torch.classes.quantized.Conv2dPackedParamsBase"); - m.def("conv1d_prepack(Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups) -> __torch__.torch.classes.quantized.Conv2dPackedParamsBase"); - m.def("conv2d_prepack(Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups) -> __torch__.torch.classes.quantized.Conv2dPackedParamsBase"); - m.def("conv3d_prepack(Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups) -> __torch__.torch.classes.quantized.Conv3dPackedParamsBase"); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv_prepack(Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups) -> __torch__.torch.classes.quantized.Conv2dPackedParamsBase")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv1d_prepack(Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups) -> __torch__.torch.classes.quantized.Conv2dPackedParamsBase")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv2d_prepack(Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups) -> __torch__.torch.classes.quantized.Conv2dPackedParamsBase")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv3d_prepack(Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups) -> __torch__.torch.classes.quantized.Conv3dPackedParamsBase")); // conv_unpack is deprecated, please use conv2d_unpack for 2D conv. - m.def("conv_unpack(__torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weights) -> (Tensor unpacked_weights, Tensor? B_origin)"); - m.def("conv1d_unpack(__torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weights) -> (Tensor unpacked_weights, Tensor? B_origin)"); - m.def("conv2d_unpack(__torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weights) -> (Tensor unpacked_weights, Tensor? B_origin)"); - m.def("conv3d_unpack(__torch__.torch.classes.quantized.Conv3dPackedParamsBase packed_weights) -> (Tensor unpacked_weights, Tensor? 
B_origin)"); - m.def("conv2d_stride(__torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weights) -> int[]"); - m.def("conv2d_padding(__torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weights) -> int[]"); - m.def("conv2d_dilation(__torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weights) -> int[]"); - m.def("conv2d_groups(__torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weights) -> int"); - m.def("conv2d_transpose(__torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weights) -> int"); - m.def("conv3d_stride(__torch__.torch.classes.quantized.Conv3dPackedParamsBase packed_weights) -> int[]"); - m.def("conv3d_padding(__torch__.torch.classes.quantized.Conv3dPackedParamsBase packed_weights) -> int[]"); - m.def("conv3d_dilation(__torch__.torch.classes.quantized.Conv3dPackedParamsBase packed_weights) -> int[]"); - m.def("conv3d_groups(__torch__.torch.classes.quantized.Conv3dPackedParamsBase packed_weights) -> int"); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv_unpack(__torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weights) -> (Tensor unpacked_weights, Tensor? B_origin)")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv1d_unpack(__torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weights) -> (Tensor unpacked_weights, Tensor? B_origin)")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv2d_unpack(__torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weights) -> (Tensor unpacked_weights, Tensor? B_origin)")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv3d_unpack(__torch__.torch.classes.quantized.Conv3dPackedParamsBase packed_weights) -> (Tensor unpacked_weights, Tensor? B_origin)")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv2d_stride(__torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weights) -> int[]")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv2d_padding(__torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weights) -> int[]")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv2d_dilation(__torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weights) -> int[]")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv2d_groups(__torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weights) -> int")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv2d_transpose(__torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weights) -> int")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv3d_stride(__torch__.torch.classes.quantized.Conv3dPackedParamsBase packed_weights) -> int[]")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv3d_padding(__torch__.torch.classes.quantized.Conv3dPackedParamsBase packed_weights) -> int[]")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv3d_dilation(__torch__.torch.classes.quantized.Conv3dPackedParamsBase packed_weights) -> int[]")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv3d_groups(__torch__.torch.classes.quantized.Conv3dPackedParamsBase packed_weights) -> int")); // conv_tranpsose - m.def("conv_transpose1d(Tensor qx, __torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weight, float output_scale, int output_zero_point) -> Tensor"); - m.def("conv_transpose2d(Tensor qx, __torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weight, float output_scale, int output_zero_point) -> Tensor"); - m.def("conv_transpose1d_prepack(Tensor weight, Tensor? 
bias, int[] stride, int[] padding, int[] output_padding, int[] dilation, int groups) -> __torch__.torch.classes.quantized.Conv2dPackedParamsBase"); - m.def("conv_transpose2d_prepack(Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] output_padding, int[] dilation, int groups) -> __torch__.torch.classes.quantized.Conv2dPackedParamsBase"); - m.def("conv_transpose1d_unpack(__torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weights) -> (Tensor unpacked_weights, Tensor? B_origin)"); - m.def("conv_transpose2d_unpack(__torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weights) -> (Tensor unpacked_weights, Tensor? B_origin)"); - m.def("conv_transpose2d_stride(__torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weights) -> int[]"); - m.def("conv_transpose2d_padding(__torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weights) -> int[]"); - m.def("conv_transpose2d_dilation(__torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weights) -> int[]"); - m.def("conv_transpose2d_groups(__torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weights) -> int"); - m.def("conv_transpose2d_transpose(__torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weights) -> int"); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv_transpose1d(Tensor qx, __torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weight, float output_scale, int output_zero_point) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv_transpose2d(Tensor qx, __torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weight, float output_scale, int output_zero_point) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv_transpose1d_prepack(Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] output_padding, int[] dilation, int groups) -> __torch__.torch.classes.quantized.Conv2dPackedParamsBase")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv_transpose1d_unpack(__torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weights) -> (Tensor unpacked_weights, Tensor? B_origin)")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv_transpose2d_prepack(Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] output_padding, int[] dilation, int groups) -> __torch__.torch.classes.quantized.Conv2dPackedParamsBase")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv_transpose2d_unpack(__torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weights) -> (Tensor unpacked_weights, Tensor? 
B_origin)")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv_transpose2d_stride(__torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weights) -> int[]")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv_transpose2d_padding(__torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weights) -> int[]")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv_transpose2d_dilation(__torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weights) -> int[]")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv_transpose2d_groups(__torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weights) -> int")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv_transpose2d_transpose(__torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weights) -> int")); - m.def("elu(Tensor self, float output_scale, int output_zero_point, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor"); - m.def("embedding_bag_prepack(Tensor weight) -> __torch__.torch.classes.quantized.EmbeddingPackedParamsBase W_prepack"); - m.def("embedding_bag_unpack(__torch__.torch.classes.quantized.EmbeddingPackedParamsBase W_prepack) -> Tensor W_origin"); - m.def("embedding_bag_byte_prepack(Tensor weight) -> Tensor"); - m.def("embedding_bag_byte_unpack(Tensor weight) -> Tensor"); - m.def("embedding_bag_4bit_prepack(Tensor weight) -> Tensor"); - m.def("embedding_bag_4bit_unpack(Tensor weight) -> Tensor"); - m.def("embedding_bag_2bit_prepack(Tensor weight) -> Tensor"); - m.def("embedding_bag_2bit_unpack(Tensor weight) -> Tensor"); - m.def("embedding_bag_byte_rowwise_offsets(Tensor weight, Tensor indices, Tensor? offsets=None, bool scale_grad_by_freq=False, int mode=0, bool sparse=False, Tensor? per_sample_weights=None, bool include_last_offset=False) -> Tensor"); - m.def("embedding_bag_4bit_rowwise_offsets(Tensor weight, Tensor indices, Tensor? offsets=None, bool scale_grad_by_freq=False, int mode=0, bool sparse=False, Tensor? per_sample_weights=None, Tensor? compressed_indices_mapping=None, bool include_last_offset=False) -> Tensor"); - m.def("embedding_bag_byte(__torch__.torch.classes.quantized.EmbeddingPackedParamsBase weight, Tensor indices, Tensor? offsets=None, bool scale_grad_by_freq=False, int mode=0, bool sparse=False, Tensor? per_sample_weights=None, Tensor? compressed_indices_mapping=None, bool include_last_offset=False) -> Tensor"); - m.def("embedding_byte(__torch__.torch.classes.quantized.EmbeddingPackedParamsBase weight, Tensor indices, bool sparse=False) -> Tensor"); - m.def("celu(Tensor self, float output_scale, int output_zero_point, Scalar alpha=1) -> Tensor"); - m.def("hardswish(Tensor input, float output_scale, int output_zero_point) -> Tensor"); - m.def("group_norm(Tensor input, int num_groups, Tensor? weight, Tensor? bias, float eps, float output_scale, int output_zero_point) -> Tensor"); - m.def("hardswish(Tensor input, float output_scale, int output_zero_point) -> Tensor"); - m.def("instance_norm(Tensor input, Tensor? weight, Tensor? bias, float eps, float output_scale, int output_zero_point) -> Tensor"); - m.def("layer_norm(Tensor input, int[] normalized_shape, Tensor? weight, Tensor? 
bias, float eps, float output_scale, int output_zero_point) -> Tensor"); - m.def( - "linear(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack, float Y_scale_i, int Y_zero_point_i) -> Tensor Y"); - m.def( - "linear_relu(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack, float Y_scale_i, int Y_zero_point_i) -> Tensor Y"); - m.def( - "linear_dynamic(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack, bool reduce_range=False) -> Tensor Y"); - m.def( - "linear_relu_dynamic(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack, bool reduce_range=False) -> Tensor Y"); - m.def( - "linear_dynamic_fp16(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack) -> Tensor Y"); - m.def( - "linear_prepack(Tensor W, Tensor? B=None) -> __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack"); - m.def( - "linear_prepack_fp16(Tensor W, Tensor? B=None) -> __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack"); - m.def("linear_prepack_legacy(Tensor W, Tensor? B=None) -> Tensor W_prepack"); - m.def( - "linear_prepack_fp16_legacy(Tensor W, Tensor? B=None) -> Tensor W_prepack"); - m.def( - "linear_unpack(__torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack) -> (Tensor W_origin, Tensor? B_origin)"); - m.def( - "linear_unpack_fp16(__torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack) -> (Tensor W_origin, Tensor? B_origin)"); - m.def( - "linear_unpack.legacy(Tensor W_prepack) -> (Tensor W_origin, Tensor? B_origin)"); - m.def( - "linear_unpack_fp16.legacy(Tensor W_prepack) -> (Tensor W_origin, Tensor? B_origin)"); - m.def("mul(Tensor qa, Tensor qb, float scale, int zero_point)-> Tensor qc"); - m.def("mul.out(Tensor qa, Tensor qb, Tensor(a!) out)-> Tensor(a!) out"); - m.def("mul.Scalar(Tensor qa, Scalar b)-> Tensor qc"); - m.def("mul.Scalar_out(Tensor qa, Scalar b, Tensor(a!) out)-> Tensor(a!) out"); - m.def("mul_relu(Tensor qa, Tensor qb, float scale, int zero_point)-> Tensor qc"); - m.def("mul_relu.out(Tensor qa, Tensor qb, Tensor(a!) out)-> Tensor(a!) out"); - m.def("mul_relu.Scalar(Tensor qa, Scalar b)-> Tensor qc"); - m.def("mul_relu.Scalar_out(Tensor qa, Scalar b, Tensor(a!) out)-> Tensor(a!) out"); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::elu(Tensor self, float output_scale, int output_zero_point, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_prepack(Tensor weight) -> __torch__.torch.classes.quantized.EmbeddingPackedParamsBase W_prepack")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_unpack(__torch__.torch.classes.quantized.EmbeddingPackedParamsBase W_prepack) -> Tensor W_origin")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_byte_prepack(Tensor weight) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_byte_unpack(Tensor weight) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_4bit_prepack(Tensor weight, bool optimized_qparams=False) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_4bit_unpack(Tensor weight) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_2bit_prepack(Tensor weight, bool optimized_qparams=False) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_2bit_unpack(Tensor weight) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_byte_rowwise_offsets(Tensor weight, Tensor indices, Tensor? 
offsets=None, bool scale_grad_by_freq=False, int mode=0, bool sparse=False, Tensor? per_sample_weights=None, bool include_last_offset=False) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_4bit_rowwise_offsets(Tensor weight, Tensor indices, Tensor? offsets=None, bool scale_grad_by_freq=False, int mode=0, bool sparse=False, Tensor? per_sample_weights=None, Tensor? compressed_indices_mapping=None, bool include_last_offset=False) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_byte(__torch__.torch.classes.quantized.EmbeddingPackedParamsBase weight, Tensor indices, Tensor? offsets=None, bool scale_grad_by_freq=False, int mode=0, bool sparse=False, Tensor? per_sample_weights=None, Tensor? compressed_indices_mapping=None, bool include_last_offset=False) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_byte(__torch__.torch.classes.quantized.EmbeddingPackedParamsBase weight, Tensor indices, bool sparse=False) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::celu(Tensor self, float output_scale, int output_zero_point, Scalar alpha=1) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::hardswish(Tensor input, float output_scale, int output_zero_point) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::group_norm(Tensor input, int num_groups, Tensor? weight, Tensor? bias, float eps, float output_scale, int output_zero_point) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::hardswish(Tensor input, float output_scale, int output_zero_point) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::instance_norm(Tensor input, Tensor? weight, Tensor? bias, float eps, float output_scale, int output_zero_point) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::layer_norm(Tensor input, int[] normalized_shape, Tensor? weight, Tensor? bias, float eps, float output_scale, int output_zero_point) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack, float Y_scale_i, int Y_zero_point_i) -> Tensor Y")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_relu(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack, float Y_scale_i, int Y_zero_point_i) -> Tensor Y")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_dynamic(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack, bool reduce_range=False) -> Tensor Y")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_relu_dynamic(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack, bool reduce_range=False) -> Tensor Y")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_dynamic_fp16(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack) -> Tensor Y")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_prepack(Tensor W, Tensor? B=None) -> __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_prepack_fp16(Tensor W, Tensor? B=None) -> __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_prepack_legacy(Tensor W, Tensor? B=None) -> Tensor W_prepack")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_prepack_fp16_legacy(Tensor W, Tensor? B=None) -> Tensor W_prepack")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_unpack(__torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack) -> (Tensor W_origin, Tensor? 
B_origin)")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_unpack_fp16(__torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack) -> (Tensor W_origin, Tensor? B_origin)")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_unpack.legacy(Tensor W_prepack) -> (Tensor W_origin, Tensor? B_origin)")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_unpack_fp16.legacy(Tensor W_prepack) -> (Tensor W_origin, Tensor? B_origin)")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::mul(Tensor qa, Tensor qb, float scale, int zero_point)-> Tensor qc")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::mul.out(Tensor qa, Tensor qb, Tensor(a!) out)-> Tensor(a!) out")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::mul.Scalar(Tensor qa, Scalar b)-> Tensor qc")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::mul.Scalar_out(Tensor qa, Scalar b, Tensor(a!) out)-> Tensor(a!) out")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::mul_relu(Tensor qa, Tensor qb, float scale, int zero_point)-> Tensor qc")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::mul_relu.out(Tensor qa, Tensor qb, Tensor(a!) out)-> Tensor(a!) out")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::mul_relu.Scalar(Tensor qa, Scalar b)-> Tensor qc")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::mul_relu.Scalar_out(Tensor qa, Scalar b, Tensor(a!) out)-> Tensor(a!) out")); // deprecated functions, kept for backward compatibility - m.def("mul_out(Tensor qa, Tensor qb, Tensor(a!) out)-> Tensor(a!) out"); - m.def("mul_relu_out(Tensor qa, Tensor qb, Tensor(a!) out)-> Tensor(a!) out"); - m.def("mul_scalar(Tensor qa, Scalar b)-> Tensor qc"); - m.def("mul_scalar_relu(Tensor qa, Scalar b)-> Tensor qc"); - m.def("mul_scalar_out(Tensor qa, Scalar b, Tensor(a!) out)-> Tensor(a!) out"); - m.def("mul_scalar_relu_out(Tensor qa, Scalar b, Tensor(a!) out)-> Tensor(a!) out"); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::mul_out(Tensor qa, Tensor qb, Tensor(a!) out)-> Tensor(a!) out")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::mul_relu_out(Tensor qa, Tensor qb, Tensor(a!) out)-> Tensor(a!) out")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::mul_scalar(Tensor qa, Scalar b)-> Tensor qc")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::mul_scalar_relu(Tensor qa, Scalar b)-> Tensor qc")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::mul_scalar_out(Tensor qa, Scalar b, Tensor(a!) out)-> Tensor(a!) out")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::mul_scalar_relu_out(Tensor qa, Scalar b, Tensor(a!) out)-> Tensor(a!) out")); // TODO: remove after broadcasting is supported - m.def("mul_scalar.Tensor(Tensor qa, Tensor b)-> Tensor qc"); - m.def("mul_scalar_relu.Tensor(Tensor qa, Tensor b)-> Tensor qc"); - m.def("mul_scalar_out.Tensor(Tensor qa, Tensor b, Tensor(a!) out)-> Tensor(a!) out"); - m.def("mul_scalar_relu_out.Tensor(Tensor qa, Tensor b, Tensor(a!) out)-> Tensor(a!) out"); - // NB: missing a space after comma here... - m.def("max_pool2d(Tensor qx, int[] kernel_size, int[] stride, int[] padding, int[] dilation,bool ceil_mode) -> Tensor"); - m.def("relu6(Tensor qx, bool inplace=False) -> Tensor"); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::mul_scalar.Tensor(Tensor qa, Tensor b)-> Tensor qc")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::mul_scalar_relu.Tensor(Tensor qa, Tensor b)-> Tensor qc")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::mul_scalar_out.Tensor(Tensor qa, Tensor b, Tensor(a!) out)-> Tensor(a!) out")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::mul_scalar_relu_out.Tensor(Tensor qa, Tensor b, Tensor(a!) out)-> Tensor(a!) 
out")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::max_pool1d(Tensor qx, int[] kernel_size, int[] stride, int[] padding, int[] dilation, bool ceil_mode) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::max_pool2d(Tensor qx, int[] kernel_size, int[] stride, int[] padding, int[] dilation, bool ceil_mode) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::relu6(Tensor qx, bool inplace=False) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::leaky_relu(Tensor qx, Scalar negative_slope, bool inplace, float output_scale, int output_zero_point) -> Tensor")); } // According to #33294: The "_" prefix registration will be // removed when the operators are all migrated to mobile. // https://github.com/pytorch/pytorch/issues/36510 TORCH_LIBRARY(_quantized, m) { - m.def("add(Tensor qa, Tensor qb, float scale, int zero_point) -> Tensor qc"); - m.def("conv2d(Tensor qx, __torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weight, float output_scale, int output_zero_point) -> Tensor"); - m.def("conv2d_relu(Tensor qx, __torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weight, float output_scale, int output_zero_point) -> Tensor"); - m.def("conv2d_prepack(Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups) -> __torch__.torch.classes.quantized.Conv2dPackedParamsBase"); - m.def( - "linear(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack, float Y_scale_i, int Y_zero_point_i) -> Tensor Y"); - m.def( - "linear_dynamic(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack, bool reduce_range=False) -> Tensor Y"); - m.def( - "linear_prepack(Tensor W, Tensor? B=None) -> __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack"); - m.def( - "linear_prepack_fp16(Tensor W, Tensor? B=None) -> __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack"); - m.def("linear_prepack_legacy(Tensor W, Tensor? B=None) -> Tensor W_prepack"); - m.def( - "linear_prepack_fp16_legacy(Tensor W, Tensor? B=None) -> Tensor W_prepack"); + m.def(TORCH_SELECTIVE_SCHEMA("_quantized::add(Tensor qa, Tensor qb, float scale, int zero_point) -> Tensor qc")); + m.def(TORCH_SELECTIVE_SCHEMA("_quantized::conv2d(Tensor qx, __torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weight, float output_scale, int output_zero_point) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("_quantized::conv2d_relu(Tensor qx, __torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weight, float output_scale, int output_zero_point) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("_quantized::conv2d_prepack(Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups) -> __torch__.torch.classes.quantized.Conv2dPackedParamsBase")); + m.def(TORCH_SELECTIVE_SCHEMA("_quantized::linear(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack, float Y_scale_i, int Y_zero_point_i) -> Tensor Y")); + m.def(TORCH_SELECTIVE_SCHEMA("_quantized::linear_dynamic(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack, bool reduce_range=False) -> Tensor Y")); + m.def(TORCH_SELECTIVE_SCHEMA("_quantized::linear_prepack(Tensor W, Tensor? B=None) -> __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack")); + m.def(TORCH_SELECTIVE_SCHEMA("_quantized::linear_prepack_fp16(Tensor W, Tensor? B=None) -> __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack")); + m.def(TORCH_SELECTIVE_SCHEMA("_quantized::linear_prepack_legacy(Tensor W, Tensor? 
B=None) -> Tensor W_prepack")); + m.def(TORCH_SELECTIVE_SCHEMA("_quantized::linear_prepack_fp16_legacy(Tensor W, Tensor? B=None) -> Tensor W_prepack")); } diff --git a/aten/src/ATen/native/sparse/ParamUtils.cpp b/aten/src/ATen/native/sparse/ParamUtils.cpp new file mode 100644 index 000000000000..f2a4c97571b9 --- /dev/null +++ b/aten/src/ATen/native/sparse/ParamUtils.cpp @@ -0,0 +1,53 @@ +#include +#include +#include +#include + +namespace at { +namespace native { + +std::pair softmax_sparse_input_preprocessing( + const Tensor& input_, + const int64_t dim_, + const bool half_to_float, + CheckedFrom function_name) { + TORCH_INTERNAL_ASSERT(input_.is_sparse()); + TORCH_CHECK( + !half_to_float, + std::string(function_name) + + ": with half to float conversion is not supported on " + + input_.device().str()); + auto input = input_.coalesce(); + Tensor output = at::native::empty_like(input); + TORCH_CHECK( + dim_ >= 0 && dim_ < input.dim(), + ": dim must be non-negative and less than input dimensions"); + return std::make_pair(input, output); +} + +std::tuple softmax_backward_sparse_input_preprocessing( + const Tensor& grad_, + const Tensor& output_, + int64_t dim_, + const Tensor& input_, + CheckedFrom function_name) { + TensorArg grad_arg{grad_, "grad", 1}, output_arg{output_, "output", 2}; + checkSameSize(function_name, grad_arg, output_arg); + + int64_t dim = maybe_wrap_dim(dim_, grad_.dim()); + + auto grad = grad_.coalesce(); + auto output = output_.coalesce(); + + Tensor grad_input = at::native::empty_like(output); + TORCH_CHECK( + dim >= 0 && dim < grad.dim(), + ": dim must be non-negative and less than input dimensions"); + TORCH_CHECK( + grad.sparse_dim() == output.sparse_dim(), + ": grad and output sparse dimensions must be equal"); + return std::make_tuple(grad_input, grad, output); +} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/sparse/ParamUtils.h b/aten/src/ATen/native/sparse/ParamUtils.h new file mode 100644 index 000000000000..c9b2e3d999ad --- /dev/null +++ b/aten/src/ATen/native/sparse/ParamUtils.h @@ -0,0 +1,24 @@ +#pragma once + +#include +#include +#include + +namespace at { +namespace native { + +TORCH_API std::pair softmax_sparse_input_preprocessing( + const Tensor& input_, + const int64_t dim_, + const bool half_to_float, + CheckedFrom function_name); + +TORCH_API std::tuple softmax_backward_sparse_input_preprocessing( + const Tensor& grad_, + const Tensor& output_, + int64_t dim_, + const Tensor& input_, + CheckedFrom function_name); + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/sparse/SoftMax.cpp b/aten/src/ATen/native/sparse/SoftMax.cpp index 1544c6e499e7..6070faf635c5 100644 --- a/aten/src/ATen/native/sparse/SoftMax.cpp +++ b/aten/src/ATen/native/sparse/SoftMax.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include namespace at { @@ -291,10 +292,10 @@ void cpu_sparse_coo_softmax(Tensor output, const Tensor& input, const int64_t di if (dim >= sparse_dim) { if (LogSoftMax) { auto new_values = log_softmax_cpu(values, dim - sparse_dim + 1, false); - out_values.copy_(new_values); + out_values.set_(new_values); } else { auto new_values = softmax_cpu(values, dim - sparse_dim + 1, false); - out_values.copy_(new_values); + out_values.set_(new_values); } return; } @@ -411,17 +412,27 @@ void cpu_sparse_coo_softmax_backward(Tensor& grad_input, const Tensor& grad, con auto grad_offsets = get_offsets(grad_indices, sizes, -1); if (dim >= sparse_dim) { - for(int64_t i=0; i= 0 && dim_ < input.dim(), - "dim must 
be non-negative and less than input dimensions"); AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "softmax", [&] { - cpu_sparse_coo_softmax(output, input, dim_); + cpu_sparse_coo_softmax(output, input, dim); }); return output; } -Tensor log_softmax_sparse_cpu(const Tensor& input_, const int64_t dim_, const bool half_to_float) { - TORCH_INTERNAL_ASSERT(input_.is_sparse()); - TORCH_CHECK(!half_to_float, "log_softmax with half to float conversion is not supported on CPU"); - auto input = input_.coalesce(); - Tensor output = at::native::empty_like(input); +Tensor log_softmax_sparse_cpu( + const Tensor& input_, + const int64_t dim, + const bool half_to_float) { + Tensor input, output; + std::tie(input, output) = softmax_sparse_input_preprocessing( + input_, dim, half_to_float, "log_softmax"); if (input.numel() == 0) { return output; } - TORCH_CHECK(dim_ >= 0 && dim_ < input.dim(), - "dim must be non-negative and less than input dimensions"); AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "log_softmax", [&] { - cpu_sparse_coo_softmax(output, input, dim_); + cpu_sparse_coo_softmax(output, input, dim); }); return output; } @@ -542,26 +553,16 @@ Tensor softmax_backward_sparse_cpu( const Tensor& output_, int64_t dim_, const Tensor& input_) { - TensorArg grad_arg{grad_, "grad", 1}, output_arg{output_, "output", 2}; - checkSameSize("softmax_backward", grad_arg, output_arg); - - int64_t dim = maybe_wrap_dim(dim_, grad_.dim()); - - auto grad = grad_.coalesce(); - auto output = output_.coalesce(); - - Tensor grad_input = at::native::empty_like(output); + Tensor grad_input, grad, output; + std::tie(grad_input, grad, output) = + softmax_backward_sparse_input_preprocessing( + grad_, output_, dim_, input_, "softmax_backward"); if (output.numel() == 0) { return grad_input; } - TORCH_CHECK( - dim >= 0 && dim < grad.dim(), - "dim must be non-negative and less than input dimensions"); - TORCH_CHECK( - grad.sparse_dim() == output.sparse_dim(), - "grad and output sparse dimensions must be equal"); AT_DISPATCH_FLOATING_TYPES(grad.scalar_type(), "softmax_backward", [&] { - cpu_sparse_coo_softmax_backward(grad_input, grad, output, dim); + cpu_sparse_coo_softmax_backward( + grad_input, grad, output, dim_); }); return grad_input; } @@ -571,26 +572,16 @@ Tensor log_softmax_backward_sparse_cpu( const Tensor& output_, int64_t dim_, const Tensor& input_) { - TensorArg grad_arg{grad_, "grad", 1}, output_arg{output_, "output", 2}; - checkSameSize("log_softmax_backward", grad_arg, output_arg); - - int64_t dim = maybe_wrap_dim(dim_, grad_.dim()); - - auto grad = grad_.coalesce(); - auto output = output_.coalesce(); - - Tensor grad_input = at::native::empty_like(output); + Tensor grad_input, grad, output; + std::tie(grad_input, grad, output) = + softmax_backward_sparse_input_preprocessing( + grad_, output_, dim_, input_, "log_softmax_backward"); if (output.numel() == 0) { return grad_input; } - TORCH_CHECK( - dim >= 0 && dim < grad.dim(), - "dim must be non-negative and less than input dimensions"); - TORCH_CHECK( - grad.sparse_dim() == output.sparse_dim(), - "grad and output sparse dimensions must be equal"); - AT_DISPATCH_FLOATING_TYPES(grad.scalar_type(), "softmax_backward", [&] { - cpu_sparse_coo_softmax_backward(grad_input, grad, output, dim); + AT_DISPATCH_FLOATING_TYPES(grad.scalar_type(), "log_softmax_backward", [&] { + cpu_sparse_coo_softmax_backward( + grad_input, grad, output, dim_); }); return grad_input; } diff --git a/aten/src/ATen/native/sparse/SparseTensorMath.cpp 
b/aten/src/ATen/native/sparse/SparseTensorMath.cpp index 9eee5e056dff..2bb5842b4726 100644 --- a/aten/src/ATen/native/sparse/SparseTensorMath.cpp +++ b/aten/src/ATen/native/sparse/SparseTensorMath.cpp @@ -95,16 +95,17 @@ SparseTensor& mul_out_sparse_scalar(SparseTensor& r, const SparseTensor& t, Scal // log1p(SparseTensor) // -------------------------------------------------------------------- -// TODO: add in-place variant +// In-place log1p on uncoalesced tensors is not supported since the operation is not a linear map. +// Values of uncoalesced tensor corresponding to the same indices are summed +// and log1p(summed_value) != log1p(v1) + log1p(v2) SparseTensor& log1p_out_sparse(SparseTensor& r, const SparseTensor& t) { - AT_ASSERT(r.is_sparse()); - AT_ASSERT(t.is_sparse()); + TORCH_CHECK(r.is_sparse(), "Tensor should be sparse"); + TORCH_CHECK(t.is_sparse(), "Tensor should be sparse"); if (is_same_tensor(r, t)) { // don't have in-place log1p for uncoalesced input because coalesce() is not in-place - TORCH_CHECK( - r.is_coalesced(), "log1p: in-place on uncoalesced tensors is not supported yet!"); + TORCH_CHECK(r.is_coalesced(), "log1p: in-place on uncoalesced tensors is not supported"); } else { copy_sparse_to_sparse_(r, t.coalesce()); @@ -114,10 +115,53 @@ SparseTensor& log1p_out_sparse(SparseTensor& r, const SparseTensor& t) { } SparseTensor& log1p_sparse_(SparseTensor& t) { - TORCH_CHECK(t.is_coalesced(), "log1p: in-place on uncoalesced tensors is not supported yet!"); return log1p_out_sparse(t, t); } +// -------------------------------------------------------------------- +// neg(SparseTensor) +// -------------------------------------------------------------------- + +SparseTensor& neg_out_sparse(SparseTensor& r, const SparseTensor& t) { + TORCH_CHECK(r.is_sparse(), "Tensor should be sparse"); + TORCH_CHECK(t.is_sparse(), "Tensor should be sparse"); + + // copy_sparse_ does not perform the copy if it is the same tensor + copy_sparse_to_sparse_(r, t); + r._values().neg_(); + return r; +} + +SparseTensor& neg_sparse_(SparseTensor& t) { + return neg_out_sparse(t, t); +} + +// -------------------------------------------------------------------- +// asin(SparseTensor) +// -------------------------------------------------------------------- + +// In-place asin on uncoalesced tensors is not supported since the operation is not a linear map. 
+// Values of uncoalesced tensor corresponding to the same indices are summed +// and asin(summed_value) != asin(v1) + asin(v2) + +SparseTensor& asin_out_sparse(SparseTensor& r, const SparseTensor& t) { + TORCH_CHECK(r.is_sparse(), "Tensor should be sparse"); + TORCH_CHECK(t.is_sparse(), "Tensor should be sparse"); + + if (is_same_tensor(r, t)) { + // don't have in-place asin for uncoalesced input because coalesce() is not in-place, see above comment + TORCH_CHECK(r.is_coalesced(), "asin: in-place on uncoalesced tensors is not supported"); + } else { + copy_sparse_to_sparse_(r, t.coalesce()); + } + r._values().asin_(); + return r; +} + +SparseTensor& asin_sparse_(SparseTensor& t) { + return asin_out_sparse(t, t); +} + // -------------------------------------------------------------------- // pow(SparseTensor, Scalar) // -------------------------------------------------------------------- diff --git a/aten/src/ATen/native/sparse/cuda/SoftMax.cu b/aten/src/ATen/native/sparse/cuda/SoftMax.cu new file mode 100644 index 000000000000..26cb6aba04e0 --- /dev/null +++ b/aten/src/ATen/native/sparse/cuda/SoftMax.cu @@ -0,0 +1,641 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace at { +namespace native { +namespace { + +// Number of threads in a block given an input size up to MAX_BLOCK_SIZE +static int getNumThreads(int nElem) { +#if defined(__HIP_PLATFORM_HCC__) + int threadSizes[5] = {16, 32, 64, 128, 256}; +#else + int threadSizes[5] = {32, 64, 128, 256, 512}; +#endif + for (int i = 0; i != 5; ++i) { + if (nElem <= threadSizes[i]) { + return threadSizes[i]; + } + } + return threadSizes[4]; +} + +template +__global__ void cuda_sparse_coo_softmax_kernel( + int64_t* sorted_pool_indices, + int64_t size, + int64_t* pool_sizes, + int64_t* pool_offsets, + int64_t nvalues, + scalar_t* mx_rows, + PackedTensorAccessor input_values_acc, + PackedTensorAccessor output_values_acc) { + /* + See ATen/native/sparse/Softmax.cpp:cpu_sparse_coo_softmax for the CPU + implementation of the sparse softmax algorithm that this implementation is + based on. 
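+    Parallelization: each CUDA thread processes one pool of indices through a
+    grid-stride loop. For every dense column j it first accumulates the
+    max-shifted exponentials of the pool, then either normalizes by their sum
+    (softmax) or subtracts log of the sum (log-softmax).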
+ */ + int tid = threadIdx.x; + int blkid = blockIdx.x; + int blksz = blockDim.x; + int gridsz = gridDim.x; + + int index = tid + blkid * blksz; + int step = blksz * gridsz; + + while (index < size) { + int64_t offset = pool_offsets[index]; + int64_t* pool_indices = sorted_pool_indices + offset; + int64_t pool_indices_size = pool_sizes[index]; + scalar_t* mx_row = mx_rows + index * nvalues; + + for (int64_t j = 0; j < nvalues; j++) { + scalar_t exp_sums = 0; + for (int64_t p = 0; p < pool_indices_size; p++) { + auto i = pool_indices[p]; + auto values_row = input_values_acc[i]; + auto out_values_row = output_values_acc[i]; + + auto v = c10::cuda::compat::exp(values_row[j] - mx_row[j]); + if (!LogSoftMax) { + out_values_row[j] = v; + } + exp_sums += v; + } + for (int64_t p = 0; p < pool_indices_size; p++) { + auto i = pool_indices[p]; + auto values_row = input_values_acc[i]; + auto out_values_row = output_values_acc[i]; + + if (LogSoftMax) { + out_values_row[j] = values_row[j] - mx_row[j] - c10::cuda::compat::log(exp_sums); + } else { + out_values_row[j] *= 1.0 / exp_sums; + } + } + } + index += step; + } +} + +template +__global__ void cuda_sparse_coo_softmax_backward_kernel( + int64_t* sorted_pool_indices, + int64_t size, + int64_t* pool_sizes, + int64_t* pool_offsets, + int64_t nvalues, + int64_t grad_nnz, + int64_t* grad_offsets, + int64_t* out_offsets, + int64_t* lower_bound_values, + PackedTensorAccessor values_accessor, + PackedTensorAccessor out_values_accessor, + PackedTensorAccessor grad_values_accessor) { + /* + See ATen/native/sparse/Softmax.cpp:cpu_sparse_coo_softmax_backward for + the CPU implementation of the sparse softmax backward algorithm that this + implementation is based on. + */ + int tid = threadIdx.x; + int blkid = blockIdx.x; + int blksz = blockDim.x; + int gridsz = gridDim.x; + + int index = tid + blkid * blksz; + int step = blksz * gridsz; + + while (index < size) { + int64_t offset = pool_offsets[index]; + int64_t* pool_indices = sorted_pool_indices + offset; + int64_t pool_indices_size = pool_sizes[index]; + + for (int64_t k = 0; k < nvalues; k++) { + scalar_t tmp_row{0}; + + /* Compute tmp = - sum_j output_j * grad_j */ + for (int64_t p = 0; p < pool_indices_size; p++) { + auto i = pool_indices[p]; + auto out_values_row = out_values_accessor[i]; + auto j = lower_bound_values[i]; + + /* Update `tmp_row` accumulator only when limits and pools are valid */ + if (j < grad_nnz && (out_offsets[i] == grad_offsets[j])) { + auto grad_values_row = grad_values_accessor[j]; + if (LogSoftMax) { + tmp_row -= grad_values_row[k]; + } else { + tmp_row -= out_values_row[k] * grad_values_row[k]; + } + } + } + + /* Compute grad_input = output * (grad + tmp)*/ + for (int64_t p = 0; p < pool_indices_size; p++) { + auto i = pool_indices[p]; + auto out_values_row = out_values_accessor[i]; + auto values_row = values_accessor[i]; + auto j = lower_bound_values[i]; + if (j < grad_nnz && (out_offsets[i] == grad_offsets[j])) { + auto grad_values_row = grad_values_accessor[j]; + if (LogSoftMax) { + values_row[k] = grad_values_row[k] + + c10::cuda::compat::exp(out_values_row[k]) * tmp_row; + } else { + values_row[k] = + out_values_row[k] * (grad_values_row[k] + tmp_row); + } + } else { + if (LogSoftMax) { + values_row[k] = + c10::cuda::compat::exp(out_values_row[k]) * tmp_row; + } else { + values_row[k] = out_values_row[k] * tmp_row; + } + } + } + } + index += step; + } +} + +using thrust_ptr = thrust::device_ptr; + +Tensor get_offsets( + const Tensor& indices, + const IntArrayRef& sizes, + 
const int64_t dim) { + /* + See ATen/native/sparse/Softmax.cpp:get_offsets for the CPU + implementation of get_offsets function that this implementation is based on. + */ + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA()); + auto policy = thrust::cuda::par(allocator).on(stream); + + auto ndim = indices.size(0); + auto nnz = indices.size(1); + std::vector host_strides(ndim, 1); + if (ndim > 1) { + for (int64_t i = ndim - 2; i >= 0; i--) { + host_strides[i] = + host_strides[i + 1] * (i + 1 == dim ? 1 : sizes[i + 1]); + } + } + auto strides = at::empty({ndim}, indices.options()); + auto strides_ptr = strides.data_ptr(); + + AT_CUDA_CHECK(cudaMemcpyAsync( + strides_ptr, host_strides.data(), host_strides.size() * sizeof(int64_t), + cudaMemcpyHostToDevice, + stream)); + + auto indices_accessor = indices.packed_accessor(); + + Tensor offsets = at::empty({nnz}, indices.options()); + + thrust::transform( + policy, + thrust::make_counting_iterator(int64_t(0)), + thrust::make_counting_iterator(int64_t(nnz)), + thrust::device_ptr(offsets.data_ptr()), + [indices_accessor, strides_ptr, dim, ndim] __device__(int64_t x) { + int64_t pool_index = 0; + for (int64_t j = 0; j < ndim; j++) { + if (j != dim) { + auto indices_row = indices_accessor[j]; + auto stride = strides_ptr[j]; + pool_index += stride * indices_row[x]; + } + } + return pool_index; + }); + return offsets; +} + +template +std::tuple compute_pool_max( + const Tensor& indices, + const Tensor& values, + const IntArrayRef& sizes, + int64_t nvalues, + const int64_t dim) { + /* + Return pools of indices that align with the given dimension and the + corresponding max values for each pool. + + See ATen/native/sparse/Softmax.cpp:get_offsets and + ATen/native/sparse/Softmax.cpp:cpu_sparse_coo_softmax for the CPU + implementation that this implementation is based on. 
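+    Implementation sketch: indices are flattened into pool keys by get_offsets,
+    sorted with thrust::sort, grouped with thrust::reduce_by_key to obtain the
+    pool sizes, and converted into pool offsets with an exclusive scan. When
+    requireMxRows is true, a per-pool row-wise maximum (mx_buffer) is also
+    computed for use in the numerically stable softmax.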
+ */ + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA()); + auto policy = thrust::cuda::par(allocator).on(stream); + + auto nnz = indices.size(1); + auto offsets = get_offsets(indices, sizes, dim); + int64_t* offsets_ptr = offsets.data_ptr(); + + auto sorted_indices = at::empty({nnz}, indices.options()); + thrust_ptr sorted_indices_thrust_ptr(sorted_indices.data_ptr()); + thrust::sequence( + policy, sorted_indices_thrust_ptr, sorted_indices_thrust_ptr + nnz, 0); + + thrust::sort( + policy, + sorted_indices_thrust_ptr, + sorted_indices_thrust_ptr + nnz, + [offsets_ptr] __device__(int64_t x, int64_t y) { + return offsets_ptr[x] < offsets_ptr[y]; + }); + auto pool_sizes = at::empty({nnz}, indices.options()); + + auto new_end = thrust::reduce_by_key( + policy, + sorted_indices_thrust_ptr, + sorted_indices_thrust_ptr + nnz, + thrust::make_constant_iterator(int64_t(1)), + thrust::make_discard_iterator(), + thrust_ptr(pool_sizes.data_ptr()), + [offsets_ptr] __device__(int64_t x, int64_t y) { + return offsets_ptr[x] == offsets_ptr[y]; + }); + auto new_sz = thrust::distance( + thrust_ptr(pool_sizes.data_ptr()), new_end.second); + pool_sizes.resize_({new_sz}); + + auto pool_offsets = pool_sizes.clone(); + thrust_ptr pool_offsets_thrust_ptr( + pool_offsets.data_ptr()); + thrust::exclusive_scan( + policy, + pool_offsets_thrust_ptr, + pool_offsets_thrust_ptr + new_sz, + pool_offsets_thrust_ptr); + + Tensor mx_buffer; + if (requireMxRows) { + + auto values_accessor = + values.packed_accessor(); // {nnz, nvalues} + + mx_buffer = at::full({new_sz * nvalues}, Scalar(-std::numeric_limits::infinity()), values.options()); + + auto mx_buffer_ptr = mx_buffer.data_ptr(); + + auto pool_sizes_ptr = pool_sizes.data_ptr(); + auto sorted_indices_ptr = sorted_indices.data_ptr(); + auto pool_offsets_ptr = pool_offsets.data_ptr(); + + thrust::for_each( + policy, + thrust::make_counting_iterator(int64_t(0)), + thrust::make_counting_iterator(int64_t(new_sz)), + [values_accessor, + sorted_indices_ptr, + pool_sizes_ptr, + pool_offsets_ptr, + mx_buffer_ptr, + nvalues] __device__(int64_t index) { + int64_t curr_pool_size = pool_sizes_ptr[index]; + auto mx_row = mx_buffer_ptr + index * nvalues; + int64_t offset = pool_offsets_ptr[index]; + for (int64_t p = 0; p < curr_pool_size; p++) { + int64_t i = *(sorted_indices_ptr + offset + p); + auto values_row = values_accessor[i].data(); + for (int64_t j = 0; j < nvalues; j++) { + mx_row[j] = c10::cuda::compat::max(mx_row[j], values_row[j]); + } + } + }); + } + return std::make_tuple( + sorted_indices, pool_offsets, pool_sizes, mx_buffer); +} + +template +void cuda_sparse_coo_softmax( + Tensor& output, + const Tensor& input, + const int64_t dim) { + /* + See ATen/native/sparse/Softmax.cpp:cpu_sparse_coo_softmax for the CPU + implementation of the sparse softmax algorithm that this implementation is + based on. 
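+    High-level flow: when dim addresses a dense dimension (dim >= sparse_dim),
+    the regular dense (log_)softmax is applied directly to the values tensor;
+    otherwise compute_pool_max builds the pools and per-pool maxima, and the
+    cuda_sparse_coo_softmax_kernel above normalizes each pool independently.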
+ */ + auto sparse_dim = input.sparse_dim(); + auto indices = input._indices().contiguous(); + auto values = input._values().contiguous(); + auto out_values = output._values(); + auto out_indices = output._indices(); + out_values.resize_as_(values); + out_indices.resize_as_(indices); + out_indices.copy_(indices); + + if (dim >= sparse_dim) { + if (LogSoftMax) { + auto new_values = log_softmax_cuda(values, dim - sparse_dim + 1, false); + out_values.set_(new_values); + } else { + auto new_values = softmax_cuda(values, dim - sparse_dim + 1, false); + out_values.set_(new_values); + } + return; + } + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA()); + auto policy = thrust::cuda::par(allocator).on(stream); + + auto nnz = values.size(0); + auto sizes = input.sizes(); + auto nvalues = values.numel() / nnz; + + /* Prepare accessors */ + auto values_2 = values.view({nnz, nvalues}); + auto values_accessor = values_2.packed_accessor(); + + auto out_values_2 = out_values.view({nnz, nvalues}); + auto out_values_accessor = out_values_2.packed_accessor(); + + Tensor sorted_indices; + Tensor pool_offsets; + Tensor pool_sizes; + Tensor mx_buffer; + + std::tie(sorted_indices, pool_offsets, pool_sizes, mx_buffer) = + compute_pool_max(indices, values_2, sizes, nvalues, dim); + + auto pool_size = pool_offsets.size(0); + int block_size = getNumThreads(pool_size); + const int grid_size = (pool_size + block_size - 1) / block_size; + + cuda_sparse_coo_softmax_kernel + <<>>( + sorted_indices.data_ptr(), + pool_size, + pool_sizes.data_ptr(), + pool_offsets.data_ptr(), + nvalues, + mx_buffer.data_ptr(), + values_accessor, + out_values_accessor); + THCudaCheck(cudaGetLastError()); +} + +template +void cuda_sparse_coo_softmax_backward( + Tensor& grad_input, + const Tensor& grad, + const Tensor& output, + const int64_t dim) { + /* + See ATen/native/sparse/Softmax.cpp:cpu_sparse_coo_softmax_backward for + the CPU implementation of the sparse softmax backward algorithm that this + implementation is based on. 
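+    High-level flow: for dim >= sparse_dim the dense (log_)softmax backward is
+    reused on the values, with grad and output nonzeros matched through their
+    offsets; otherwise grad offsets are located with thrust::lower_bound and
+    the backward kernel above accumulates the per-pool correction term.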
+ */ + auto sparse_dim = output.sparse_dim(); + auto sizes = output.sizes().vec(); + auto grad_indices = grad._indices().contiguous(); + auto grad_values = grad._values().contiguous(); + auto out_indices = output._indices().contiguous(); + auto out_values = output._values().contiguous(); + auto values = grad_input._values(); + auto indices = grad_input._indices(); + auto out_nnz = out_values.size(0); + auto grad_nnz = grad_values.size(0); + + values.resize_as_(out_values); + values.zero_(); + indices.resize_as_(out_indices); + indices.copy_(out_indices); + + auto out_offsets = get_offsets(out_indices, sizes, -1); + auto grad_offsets = get_offsets(grad_indices, sizes, -1); + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + auto allocator = THCThrustAllocator(globalContext().lazyInitCUDA()); + auto policy = thrust::cuda::par(allocator).on(stream); + + /* when dim >= sparse_dim the dense backward is used */ + if (dim >= sparse_dim) { + if (at::native::cuda_equal(out_offsets, grad_offsets) == true) { + Tensor unused = at::native::empty_like(grad_values); + if (LogSoftMax) { + auto r = log_softmax_backward_cuda(grad_values, out_values, dim - sparse_dim + 1, unused); + values.set_(r); + } else { + auto r = softmax_backward_cuda(grad_values, out_values, dim - sparse_dim + 1, unused); + values.set_(r); + } + } else { + auto host_out_offsets = + out_offsets.to(at::Device(kCPU), indices.dtype(), false, true); + auto host_grad_offsets = + grad_offsets.to(at::Device(kCPU), indices.dtype(), false, true); + auto out_offsets_accessor = host_out_offsets.data_ptr(); + auto grad_offsets_accessor = host_grad_offsets.data_ptr(); + for (int64_t i = 0; i < out_nnz; i++) { + Tensor unused = at::native::empty_like(grad_values); + auto low = thrust::lower_bound( + grad_offsets_accessor, + grad_offsets_accessor + grad_offsets.size(0), + out_offsets_accessor[i]); + auto j = low - grad_offsets_accessor; + /* + Compute output using dense backward only when limits and pools are valid + If this check is false then a sparse tensor with full of zeros is returned + */ + if (j < grad_nnz && out_offsets_accessor[i] == grad_offsets_accessor[j]) { + if (LogSoftMax) { + auto r = log_softmax_backward_cuda( + grad_values[j], out_values[i], dim - sparse_dim, unused); + values[i].copy_(r); + } else { + auto r = softmax_backward_cuda( + grad_values[j], out_values[i], dim - sparse_dim, unused); + values[i].copy_(r); + } + } + } + } + return; + } + + auto nnz = values.size(0); + auto nvalues = values.numel() / nnz; + + auto values_2 = values.view({nnz, nvalues}); + auto values_accessor = values_2.packed_accessor(); + + auto out_values_2 = out_values.view({out_nnz, nvalues}); + auto out_values_accessor = out_values_2.packed_accessor(); + + auto grad_values_2 = grad_values.view({grad_nnz, nvalues}); + auto grad_values_accessor = grad_values_2.packed_accessor(); + + Tensor lower_bound_values = + at::empty({out_offsets.size(0)}, indices.options()); + + thrust::lower_bound( + policy, + thrust_ptr(grad_offsets.data_ptr()), + thrust_ptr(grad_offsets.data_ptr() + grad_offsets.size(0)), + thrust_ptr(out_offsets.data_ptr()), + thrust_ptr(out_offsets.data_ptr()) + out_offsets.size(0), + thrust_ptr(lower_bound_values.data_ptr())); + + Tensor sorted_indices; + Tensor pool_offsets; + Tensor pool_sizes; + + /* Compute independent pools of indices */ + std::tie( + sorted_indices, pool_offsets, pool_sizes, std::ignore) = + compute_pool_max( + out_indices, values_2, sizes, nvalues, dim); + + auto pool_size = pool_offsets.size(0); + + int 
block_size = getNumThreads(pool_size); + const int grid_size = (pool_size + block_size - 1) / block_size; + + cuda_sparse_coo_softmax_backward_kernel + <<>>( + sorted_indices.data_ptr(), + pool_size, + pool_sizes.data_ptr(), + pool_offsets.data_ptr(), + nvalues, + grad_nnz, + grad_offsets.data_ptr(), + out_offsets.data_ptr(), + lower_bound_values.data_ptr(), + values_accessor, + out_values_accessor, + grad_values_accessor); + THCudaCheck(cudaGetLastError()); +} + +} // end anonymous namespace + +Tensor softmax_sparse_cuda( + const Tensor& input_, + const int64_t dim, + const bool half_to_float) { + Tensor input, output; + std::tie(input, output) = softmax_sparse_input_preprocessing( + input_, dim, half_to_float, "softmax"); + if (input.numel() == 0) { + return output; + } + AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "softmax", [&] { + cuda_sparse_coo_softmax(output, input, dim); + }); + return output; +} + +Tensor log_softmax_sparse_cuda( + const Tensor& input_, + const int64_t dim, + const bool half_to_float) { + Tensor input, output; + std::tie(input, output) = softmax_sparse_input_preprocessing( + input_, dim, half_to_float, "log_softmax"); + if (input.numel() == 0) { + return output; + } + AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "log_softmax", [&] { + cuda_sparse_coo_softmax(output, input, dim); + }); + return output; +} + +Tensor softmax_backward_sparse_cuda( + const Tensor& grad_, + const Tensor& output_, + int64_t dim_, + const Tensor& input_) { + Tensor grad_input, grad, output; + std::tie(grad_input, grad, output) = + softmax_backward_sparse_input_preprocessing( + grad_, output_, dim_, input_, "softmax_backward"); + if (output.numel() == 0) { + return grad_input; + } + AT_DISPATCH_FLOATING_TYPES(grad.scalar_type(), "softmax_backward", [&] { + cuda_sparse_coo_softmax_backward( + grad_input, grad, output, dim_); + }); + return grad_input; +} + +Tensor log_softmax_backward_sparse_cuda( + const Tensor& grad_, + const Tensor& output_, + int64_t dim_, + const Tensor& input_) { + Tensor grad_input, grad, output; + std::tie(grad_input, grad, output) = + softmax_backward_sparse_input_preprocessing( + grad_, output_, dim_, input_, "log_softmax_backward"); + if (output.numel() == 0) { + return grad_input; + } + + AT_DISPATCH_FLOATING_TYPES(grad.scalar_type(), "log_softmax_backward", [&] { + cuda_sparse_coo_softmax_backward( + grad_input, grad, output, dim_); + }); + return grad_input; +} + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/vulkan/Vulkan.h b/aten/src/ATen/native/vulkan/Vulkan.h index df9a53f7076d..c2b1775e8f0a 100644 --- a/aten/src/ATen/native/vulkan/Vulkan.h +++ b/aten/src/ATen/native/vulkan/Vulkan.h @@ -456,7 +456,7 @@ class ComputeUnit final { void createComputePipelineCompile( const std::string& glslSrc, const VkPipelineCache pipelineCache, - const VkDescriptorSetLayout& descrSetLayout, + const VkDescriptorSetLayout descrSetLayout, const WorkGroupSize workGroupSize); #endif diff --git a/aten/src/ATen/native/vulkan/api/Adapter.h b/aten/src/ATen/native/vulkan/api/Adapter.h new file mode 100644 index 000000000000..239edfb74518 --- /dev/null +++ b/aten/src/ATen/native/vulkan/api/Adapter.h @@ -0,0 +1,36 @@ +#pragma once + +#include +#include + +namespace at { +namespace native { +namespace vulkan { +namespace api { + +// +// A Vulkan Adapter represents a physical device and its properties. Adapters +// are enumerated through the Runtime and are used in creation of Contexts. 
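// Usage sketch for the Adapter/Runtime pairing introduced here (hypothetical
// call site, not part of this patch): enumerate adapters through the Runtime,
// pick the first one that advertises unified memory via the helper defined
// below, and build a Context from it. Runtime::select() throws if no adapter
// satisfies the predicate.

at::native::vulkan::api::Context make_unified_memory_context() {
  using namespace at::native::vulkan::api;
  Runtime* const rt = runtime(); // throws if Vulkan is unavailable
  const Adapter adapter = rt->select([](const Adapter& adapter) {
    return adapter.has_unified_memory();
  });
  return Context(adapter); // Context is move-constructible
}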
+// Each tensor in PyTorch is associated with a Context to make the +// device <-> tensor affinity explicit. +// + +struct Adapter final { + Runtime* runtime; + VkPhysicalDevice handle; + VkPhysicalDeviceProperties properties; + VkPhysicalDeviceMemoryProperties memory_properties; + uint32_t compute_queue_family_index; + + inline bool has_unified_memory() const { + // Ideally iterate over all memory types to see if there is a pool that + // is both host-visible, and device-local. This should be a good proxy + // for now. + return VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU == properties.deviceType; + } +}; + +} // namespace api +} // namespace vulkan +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/vulkan/api/Allocator.h b/aten/src/ATen/native/vulkan/api/Allocator.h index afa720a515e6..f0f0c9baa59c 100644 --- a/aten/src/ATen/native/vulkan/api/Allocator.h +++ b/aten/src/ATen/native/vulkan/api/Allocator.h @@ -2,11 +2,19 @@ #include +#ifdef DEBUG + #define VMA_DEBUG_LOG(format, ...) \ + do { \ + printf(format, ##__VA_ARGS__); \ + printf("\n"); \ + } while(false) +#endif /* DEBUG */ + #ifdef __clang__ #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wnullability-completeness" #pragma clang diagnostic ignored "-Wunused-variable" -#endif +#endif /* __clang__ */ // Do NOT include vk_mem_alloc.h directly. // Always include this file (Allocator.h) instead. @@ -15,4 +23,4 @@ #ifdef __clang__ #pragma clang diagnostic pop -#endif +#endif /* __clang__ */ diff --git a/aten/src/ATen/native/vulkan/api/Command.cpp b/aten/src/ATen/native/vulkan/api/Command.cpp index 21279b408233..a7793aea16dc 100644 --- a/aten/src/ATen/native/vulkan/api/Command.cpp +++ b/aten/src/ATen/native/vulkan/api/Command.cpp @@ -5,12 +5,15 @@ namespace native { namespace vulkan { namespace api { -Command::Pool::Factory::Factory(const VkDevice device) - : device_(device) { +Command::Pool::Factory::Factory(const GPU& gpu) + : device_(gpu.device) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + device_, + "Invalid Vulkan device!"); } typename Command::Pool::Factory::Handle Command::Pool::Factory::operator()( - const Descriptor& descriptor) const { + const Descriptor& descriptor) const { const VkCommandPoolCreateInfo command_pool_create_info{ VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO, nullptr, @@ -20,7 +23,14 @@ typename Command::Pool::Factory::Handle Command::Pool::Factory::operator()( VkCommandPool command_pool{}; VK_CHECK(vkCreateCommandPool( - device_, &command_pool_create_info, nullptr, &command_pool)); + device_, + &command_pool_create_info, + nullptr, + &command_pool)); + + TORCH_CHECK( + command_pool, + "Invalid Vulkan command pool!"); return Handle{ command_pool, @@ -31,8 +41,13 @@ typename Command::Pool::Factory::Handle Command::Pool::Factory::operator()( void Command::Pool::purge( const VkDevice device, const VkCommandPool command_pool) { - TORCH_INTERNAL_ASSERT(device, "Invalid Vulkan device!"); - TORCH_INTERNAL_ASSERT(command_pool, "Invalid Vulkan command pool!"); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + device, + "Invalid Vulkan device!"); + + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + command_pool, + "Invalid Vulkan command pool!"); VK_CHECK(vkResetCommandPool(device, command_pool, 0u)); } @@ -42,6 +57,14 @@ namespace { VkCommandBuffer allocate_command_buffer( const VkDevice device, const VkCommandPool command_pool) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + device, + "Invalid Vulkan device!"); + + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + command_pool, + "Invalid Vulkan command pool!"); + const 
VkCommandBufferAllocateInfo command_buffer_allocate_info{ VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO, nullptr, @@ -52,7 +75,13 @@ VkCommandBuffer allocate_command_buffer( VkCommandBuffer command_buffer{}; VK_CHECK(vkAllocateCommandBuffers( - device, &command_buffer_allocate_info, &command_buffer)); + device, + &command_buffer_allocate_info, + &command_buffer)); + + TORCH_CHECK( + command_buffer, + "Invalid Vulkan command buffer!"); return command_buffer; } @@ -61,6 +90,9 @@ VkCommandBuffer allocate_command_buffer( Command::Buffer::Buffer(const VkDevice device, const VkCommandPool command_pool) : command_buffer_(allocate_command_buffer(device, command_pool)) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + command_buffer_, + "Invalid Vulkan command buffer!"); } void Command::Buffer::Buffer::begin() { @@ -71,7 +103,9 @@ void Command::Buffer::Buffer::begin() { nullptr, }; - VK_CHECK(vkBeginCommandBuffer(command_buffer_, &command_buffer_begin_info)); + VK_CHECK(vkBeginCommandBuffer( + command_buffer_, + &command_buffer_begin_info)); } void Command::Buffer::Buffer::end() { @@ -79,16 +113,26 @@ void Command::Buffer::Buffer::end() { } void Command::Buffer::bind(const VkPipeline pipeline) { - TORCH_INTERNAL_ASSERT(pipeline, "Invalid Vulkan pipeline!"); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + pipeline, + "Invalid Vulkan pipeline!"); - vkCmdBindPipeline(command_buffer_, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); + vkCmdBindPipeline( + command_buffer_, + VK_PIPELINE_BIND_POINT_COMPUTE, + pipeline); } void Command::Buffer::bind( const VkPipelineLayout pipeline_layout, const VkDescriptorSet descriptor_set) { - TORCH_INTERNAL_ASSERT(pipeline_layout, "Invalid Vulkan pipeline layout!"); - TORCH_INTERNAL_ASSERT(descriptor_set, "Invalid Vulkan descriptor set!"); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + pipeline_layout, + "Invalid Vulkan pipeline layout!"); + + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + descriptor_set, + "Invalid Vulkan descriptor set!"); vkCmdBindDescriptorSets( command_buffer_, diff --git a/aten/src/ATen/native/vulkan/api/Command.h b/aten/src/ATen/native/vulkan/api/Command.h index 462a50fef7fd..b0c171faa490 100644 --- a/aten/src/ATen/native/vulkan/api/Command.h +++ b/aten/src/ATen/native/vulkan/api/Command.h @@ -9,7 +9,7 @@ namespace native { namespace vulkan { namespace api { -struct C10_EXPORT Command final { +struct Command final { // // Pool // @@ -29,7 +29,7 @@ struct C10_EXPORT Command final { class Factory final { public: - explicit Factory(VkDevice device); + explicit Factory(const GPU& gpu); typedef Pool::Descriptor Descriptor; typedef VK_DELETER(CommandPool) Deleter; @@ -52,8 +52,8 @@ struct C10_EXPORT Command final { typedef api::Cache Cache; Cache cache; - explicit Pool(const VkDevice device) - : cache(Factory(device)) { + explicit Pool(const GPU& gpu) + : cache(Factory(gpu)) { } static void purge(VkDevice device, VkCommandPool command_pool); @@ -78,8 +78,8 @@ struct C10_EXPORT Command final { VkCommandBuffer command_buffer_; }; - explicit Command(const VkDevice device) - : pool(device) { + explicit Command(const GPU& gpu) + : pool(gpu) { } }; diff --git a/aten/src/ATen/native/vulkan/api/Common.h b/aten/src/ATen/native/vulkan/api/Common.h index 0c1e7cc4720b..cbd53e8045ef 100644 --- a/aten/src/ATen/native/vulkan/api/Common.h +++ b/aten/src/ATen/native/vulkan/api/Common.h @@ -24,10 +24,10 @@ at::native::vulkan::api::destroy_##Handle #define VK_DELETER_DISPATCHABLE_DECLARE(Handle) \ - C10_EXPORT void destroy_##Handle(const Vk##Handle handle) + void destroy_##Handle(const Vk##Handle handle) #define 
VK_DELETER_NON_DISPATCHABLE_DECLARE(Handle) \ - class C10_EXPORT destroy_##Handle final { \ + class destroy_##Handle final { \ public: \ explicit destroy_##Handle(const VkDevice device); \ void operator()(const Vk##Handle handle) const; \ @@ -40,6 +40,21 @@ namespace native { namespace vulkan { namespace api { +struct Adapter; +struct Command; +class Context; +struct Descriptor; +struct Pipeline; +struct Resource; +class Runtime; +struct Shader; + +struct GPU final { + const Adapter* adapter; + VkDevice device; + VkQueue queue; +}; + VK_DELETER_DISPATCHABLE_DECLARE(Instance); VK_DELETER_DISPATCHABLE_DECLARE(Device); VK_DELETER_NON_DISPATCHABLE_DECLARE(Semaphore); @@ -78,11 +93,13 @@ class Handle final { Handle(const Handle&) = delete; Handle& operator=(const Handle&) = delete; Handle(Handle&&); - Handle& operator=(Handle&&); + Handle& operator=(Handle&&) &; + Handle& operator=(Handle&&) && = delete; ~Handle(); operator bool() const; - Type get() const; + Type get() const &; + Type get() const && = delete; Type release(); void reset(Type payload = kNull); @@ -112,7 +129,7 @@ inline Handle::Handle(Handle&& handle) template inline Handle& -Handle::operator=(Handle&& handle) +Handle::operator=(Handle&& handle) & { reset(handle.release()); deleter_ = std::move(handle.deleter_); @@ -130,7 +147,7 @@ inline Handle::operator bool() const { } template -inline Type Handle::get() const { +inline Type Handle::get() const & { return payload_; } diff --git a/aten/src/ATen/native/vulkan/api/Context.cpp b/aten/src/ATen/native/vulkan/api/Context.cpp index 76a245e16d38..d0fa08dbde1d 100644 --- a/aten/src/ATen/native/vulkan/api/Context.cpp +++ b/aten/src/ATen/native/vulkan/api/Context.cpp @@ -8,208 +8,31 @@ namespace vulkan { namespace api { namespace { -struct Configuration final { -#ifndef DEBUG - static constexpr bool kEnableValidationLayers = false; -#else - static constexpr bool kEnableValidationLayers = true; -#endif -}; - -VKAPI_ATTR VkBool32 VKAPI_CALL debug_report_callback_fn( - const VkDebugReportFlagsEXT flags, - const VkDebugReportObjectTypeEXT /* object_type */, - const uint64_t /* object */, - const size_t /* location */, - const int32_t message_code, - const char* const layer_prefix, - const char* const message, - void* const /* user_data */) { - std::stringstream stream; - stream << layer_prefix << " " << message_code << " " << message << std::endl; - const std::string log = stream.str(); - - if (flags & VK_DEBUG_REPORT_ERROR_BIT_EXT) { - LOG(ERROR) << log; - } else if (flags & VK_DEBUG_REPORT_WARNING_BIT_EXT) { - LOG(WARNING) << log; - } else if (flags & VK_DEBUG_REPORT_PERFORMANCE_WARNING_BIT_EXT) { - LOG(WARNING) << "Performance:" << log; - } else if (flags & VK_DEBUG_REPORT_INFORMATION_BIT_EXT) { - LOG(INFO) << log; - } else if (flags & VK_DEBUG_REPORT_DEBUG_BIT_EXT) { - LOG(INFO) << "Debug: " << log; - } - - return VK_FALSE; -} - -VkInstance create_instance(const bool enable_validation_layers) { - std::vector enabled_instance_layers; - std::vector enabled_instance_extensions; - - if (enable_validation_layers) { - uint32_t instance_layers_count = 0; - VK_CHECK(vkEnumerateInstanceLayerProperties( - &instance_layers_count, nullptr)); - - std::vector instance_layer_properties( - instance_layers_count); - - VK_CHECK(vkEnumerateInstanceLayerProperties( - &instance_layers_count, - instance_layer_properties.data())); - - constexpr const char* const requested_instance_layers[]{ - // "VK_LAYER_LUNARG_api_dump", - "VK_LAYER_KHRONOS_validation", - }; - - for (const auto& requested_instance_layer : 
requested_instance_layers) { - for (const auto& layer : instance_layer_properties) { - if (strcmp(requested_instance_layer, layer.layerName) == 0) { - enabled_instance_layers.push_back(requested_instance_layer); - break; - } - } - } - - uint32_t instance_extension_count = 0; - VK_CHECK(vkEnumerateInstanceExtensionProperties( - nullptr, &instance_extension_count, nullptr)); - - std::vector instance_extension_properties( - instance_extension_count); - - VK_CHECK(vkEnumerateInstanceExtensionProperties( - nullptr, &instance_extension_count, instance_extension_properties.data())); - - constexpr const char* const requested_instance_extensions[]{ - VK_EXT_DEBUG_REPORT_EXTENSION_NAME, - }; +Context* initialize() { + static const std::unique_ptr context([]() -> Context* { + try { + const Adapter adapter = runtime()->select([](const Adapter& adapter) { + // Select the first adapter. + return true; + }); - for (const auto& requested_instance_extension : requested_instance_extensions) { - for (const auto& extension : instance_extension_properties) { - if (strcmp(requested_instance_extension, extension.extensionName) == 0) { - enabled_instance_extensions.push_back(requested_instance_extension); - break; - } - } + return new Context(adapter); } - } - - constexpr VkApplicationInfo application_info{ - VK_STRUCTURE_TYPE_APPLICATION_INFO, - nullptr, - "PyTorch", - 0, - "PyTorch", - 0, - VK_API_VERSION_1_0, - }; - - const VkInstanceCreateInfo instance_create_info{ - VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO, - nullptr, - 0u, - &application_info, - static_cast(enabled_instance_layers.size()), - enabled_instance_layers.data(), - static_cast(enabled_instance_extensions.size()), - enabled_instance_extensions.data(), - }; - - VkInstance instance{}; - VK_CHECK(vkCreateInstance(&instance_create_info, nullptr, &instance)); - - return instance; -} - -VkDebugReportCallbackEXT create_debug_report_callback( - const VkInstance instance, - const bool enable_validation_layers) { - if (!enable_validation_layers) { - return VkDebugReportCallbackEXT{}; - } - - const VkDebugReportCallbackCreateInfoEXT debugReportCallbackCreateInfo{ - VK_STRUCTURE_TYPE_DEBUG_REPORT_CALLBACK_CREATE_INFO_EXT, - nullptr, - VK_DEBUG_REPORT_INFORMATION_BIT_EXT | - VK_DEBUG_REPORT_WARNING_BIT_EXT | - VK_DEBUG_REPORT_PERFORMANCE_WARNING_BIT_EXT | - VK_DEBUG_REPORT_ERROR_BIT_EXT | - VK_DEBUG_REPORT_DEBUG_BIT_EXT, - debug_report_callback_fn, - nullptr, - }; - - const auto vkCreateDebugReportCallbackEXT = - (PFN_vkCreateDebugReportCallbackEXT)vkGetInstanceProcAddr( - instance, "vkCreateDebugReportCallbackEXT"); - - TORCH_CHECK( - vkCreateDebugReportCallbackEXT, - "Could not load vkCreateDebugReportCallbackEXT"); - - VkDebugReportCallbackEXT debug_report_callback{}; - VK_CHECK(vkCreateDebugReportCallbackEXT( - instance, - &debugReportCallbackCreateInfo, - nullptr, - &debug_report_callback)); - - return debug_report_callback; -} - -VkPhysicalDevice acquire_physical_device(const VkInstance instance) { - uint32_t device_count = 0; - VK_CHECK(vkEnumeratePhysicalDevices(instance, &device_count, nullptr)); - TORCH_CHECK(device_count > 0, "Vulkan: Could not find a device with Vulkan support!"); - - std::vector devices(device_count); - VK_CHECK(vkEnumeratePhysicalDevices(instance, &device_count, devices.data())); - - return devices[0]; -} - -VkPhysicalDeviceLimits query_physical_device_physical_device_limits( - const VkPhysicalDevice physical_device) { - VkPhysicalDeviceProperties physical_device_properties{}; - vkGetPhysicalDeviceProperties(physical_device, 
&physical_device_properties); - return physical_device_properties.limits; -} - -uint32_t query_compute_queue_family_index(const VkPhysicalDevice physical_device) { - uint32_t queue_family_count = 0; - - vkGetPhysicalDeviceQueueFamilyProperties( - physical_device, &queue_family_count, nullptr); - - TORCH_CHECK( - queue_family_count > 0, "Vulkan: Invalid number of queue families!"); - - std::vector queue_families_properties( - queue_family_count); - - vkGetPhysicalDeviceQueueFamilyProperties( - physical_device, &queue_family_count, queue_families_properties.data()); - - for (uint32_t i = 0; i < queue_families_properties.size(); ++i) { - const VkQueueFamilyProperties& properties = queue_families_properties[i]; - if (properties.queueCount > 0 && (properties.queueFlags & VK_QUEUE_COMPUTE_BIT)) { - return i; + catch (...) { + return nullptr; } - } + }()); - TORCH_CHECK( - false, - "Vulkan: Could not find a queue family that supports compute operations!"); + return context.get(); } VkDevice create_device( const VkPhysicalDevice physical_device, const uint32_t compute_queue_family_index) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + physical_device, + "Invalid Vulkan physical device!"); + const float queue_priorities = 1.0f; const VkDeviceQueueCreateInfo device_queue_create_info{ VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO, @@ -234,6 +57,7 @@ VkDevice create_device( VkDevice device{}; VK_CHECK(vkCreateDevice(physical_device, &device_create_info, nullptr, &device)); + TORCH_CHECK(device, "Invalid Vulkan device!"); return device; } @@ -241,79 +65,45 @@ VkDevice create_device( VkQueue acquire_queue( const VkDevice device, const uint32_t compute_queue_family_index) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + device, + "Invalid Vulkan device!"); + VkQueue queue{}; vkGetDeviceQueue(device, compute_queue_family_index, 0, &queue); + TORCH_CHECK(queue, "Invalid Vulkan queue!"); + return queue; } } // namespace -Context::Context(const bool enable_validation_layers) - : instance_(create_instance(enable_validation_layers), &VK_DELETER(Instance)), - debug_report_callback_( - create_debug_report_callback(instance(), enable_validation_layers), - Debug(instance())), - physical_device_(acquire_physical_device(instance())), - physical_device_limits_(query_physical_device_physical_device_limits(physical_device())), - compute_queue_family_index_(query_compute_queue_family_index(physical_device())), - device_(create_device(physical_device(), compute_queue_family_index_), &VK_DELETER(Device)), - queue_(acquire_queue(device(), compute_queue_family_index_)), - command_(device()), - shader_(device()), - pipeline_(device()), - descriptor_(device()), - resource_(instance(), physical_device(), device()) { -} - -Context::Debug::Debug(const VkInstance instance) - : instance_(instance) { -} - -void Context::Debug::operator()( - const VkDebugReportCallbackEXT debug_report_callback) const { - if (debug_report_callback) { - const auto vkDestroyDebugReportCallbackEXT = - (PFN_vkDestroyDebugReportCallbackEXT)vkGetInstanceProcAddr( - instance_, "vkDestroyDebugReportCallbackEXT"); - - TORCH_CHECK( - vkDestroyDebugReportCallbackEXT, - "Could not load vkDestroyDebugReportCallbackEXT"); - - vkDestroyDebugReportCallbackEXT( - instance_, debug_report_callback, nullptr); - } -} - -Context* initialize() { - static const std::unique_ptr context([]() -> Context* { -#ifdef USE_VULKAN_WRAPPER - if (!InitVulkan()) { - TORCH_WARN("Vulkan: Wrapper Failed to InitVulkan"); - return nullptr; - } -#endif - - try { - return new 
Context(Configuration::kEnableValidationLayers); - } - catch (...) { - return nullptr; - } - }()); - - return context.get(); +void Context::Deleter::operator()(const VkDevice device) const { + // No VK_CHECK. Don't want an exception thrown in the destructor. + vkDeviceWaitIdle(device); + vkDestroyDevice(device, nullptr); } -bool available() { - return initialize(); +Context::Context(const Adapter& adapter) + : adapter_(adapter), + device_( + create_device( + adapter.handle, + adapter.compute_queue_family_index), + Deleter{}), + queue_(acquire_queue(device(), adapter.compute_queue_family_index)), + command_(gpu()), + shader_(gpu()), + pipeline_(gpu()), + descriptor_(gpu()), + resource_(gpu()) { } -Context& context() { +Context* context() { Context* const context = initialize(); TORCH_CHECK(context, "Vulkan: Backend not available on this platform!"); - return *context; + return context; } } // namespace api diff --git a/aten/src/ATen/native/vulkan/api/Context.h b/aten/src/ATen/native/vulkan/api/Context.h index d57eab66108e..5d593bdd9bc1 100644 --- a/aten/src/ATen/native/vulkan/api/Context.h +++ b/aten/src/ATen/native/vulkan/api/Context.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include @@ -14,34 +15,29 @@ namespace api { // // Vulkan Context holds onto all relevant Vulkan state as it pertains to our -// use of Vulkan in PyTorch. The context is currently a global object, but -// technically it does not need to be if we were to make it explicit to the -// user. +// use of Vulkan in PyTorch. A Context is associated with one, and only one, +// Adapter as a precursor to multi-GPU support. All Vulkan tensors in PyTorch +// are associated with a Context to make tensor <-> device affinity explicit. +// The context is currently a global object, but technically it does not need +// to be if we were to make it explicit to the user. // -class C10_EXPORT Context final { +class Context final { public: - explicit Context(bool enable_validation_layers); + explicit Context(const Adapter& adapter); + Context(const Context&) = delete; + Context(Context&&) = default; + Context& operator=(const Context&) = delete; + Context& operator=(Context&&) = default; ~Context() = default; - inline VkInstance instance() const { - return instance_.get(); - } - - inline VkPhysicalDevice physical_device() const { - return physical_device_; - } - - inline const VkPhysicalDeviceLimits& physical_device_limits() const { - return physical_device_limits_; - } - - inline VkDevice device() const { - return device_.get(); - } - - inline VkQueue queue() const { - return queue_; + inline GPU gpu() { + // A GPU is simply a (physical device, logical device, device queue) trio. + return { + &adapter_, + device(), + queue(), + }; } inline Command& command() { @@ -65,23 +61,26 @@ class C10_EXPORT Context final { } private: - class Debug final { - public: - explicit Debug(VkInstance instance); - void operator()(VkDebugReportCallbackEXT debug_report_callback) const; + inline VkDevice device() { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(device_); + return device_.get(); + } - private: - VkInstance instance_; + inline VkQueue queue() { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(queue_); + return queue_; + } + + private: + class Deleter final { + public: + void operator()(VkDevice device) const; }; private: // Construction and destruction order matters. Do not move members around. 
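// Why the ordering comment above matters: C++ destroys non-static data
// members in reverse declaration order, so the subsystems that depend on
// device_ (command_, shader_, pipeline_, descriptor_, resource_) are torn
// down before the VkDevice they were created from. A minimal illustration of
// that language rule (standalone example, unrelated to Vulkan):

#include <iostream>

struct Logged {
  const char* name;
  ~Logged() { std::cout << "destroying " << name << '\n'; }
};

struct Ordered {
  Logged device{"device"};       // declared first, destroyed last
  Logged resources{"resources"}; // declared last, destroyed first
};

int main() {
  Ordered ordered; // prints "destroying resources", then "destroying device"
}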
- Handle instance_; - Handle debug_report_callback_; - VkPhysicalDevice physical_device_; - VkPhysicalDeviceLimits physical_device_limits_; - uint32_t compute_queue_family_index_; - Handle device_; + Adapter adapter_; + Handle device_; VkQueue queue_; Command command_; Shader shader_; @@ -90,8 +89,7 @@ class C10_EXPORT Context final { Resource resource_; }; -C10_EXPORT bool available(); -C10_EXPORT Context& context(); +Context* context(); } // namespace api } // namespace vulkan diff --git a/aten/src/ATen/native/vulkan/api/Descriptor.cpp b/aten/src/ATen/native/vulkan/api/Descriptor.cpp index 1b5ea94341a3..ff0505ccebca 100644 --- a/aten/src/ATen/native/vulkan/api/Descriptor.cpp +++ b/aten/src/ATen/native/vulkan/api/Descriptor.cpp @@ -44,12 +44,15 @@ const Descriptor::Pool::Descriptor Descriptor::Pool::kDefault{ }, }; -Descriptor::Pool::Factory::Factory(const VkDevice device) - : device_(device) { +Descriptor::Pool::Factory::Factory(const GPU& gpu) + : device_(gpu.device) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + device, + "Invalid Vulkan device!"); } typename Descriptor::Pool::Factory::Handle Descriptor::Pool::Factory::operator()( - const Descriptor& descriptor) const { + const Descriptor& descriptor) const { const VkDescriptorPoolCreateInfo descriptor_pool_create_info{ VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO, nullptr, @@ -61,7 +64,14 @@ typename Descriptor::Pool::Factory::Handle Descriptor::Pool::Factory::operator() VkDescriptorPool descriptor_pool{}; VK_CHECK(vkCreateDescriptorPool( - device_, &descriptor_pool_create_info, nullptr, &descriptor_pool)); + device_, + &descriptor_pool_create_info, + nullptr, + &descriptor_pool)); + + TORCH_CHECK( + descriptor_pool, + "Invalid Vulkan descriptor pool!"); return Handle{ descriptor_pool, @@ -72,12 +82,29 @@ typename Descriptor::Pool::Factory::Handle Descriptor::Pool::Factory::operator() void Descriptor::Pool::purge( const VkDevice device, const VkDescriptorPool descriptor_pool) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + device, + "Invalid Vulkan device!"); + + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + descriptor_pool, + "Invalid Vulkan descriptor pool!"); + VK_CHECK(vkResetDescriptorPool(device, descriptor_pool, 0u)); } -Descriptor::Factory::Factory(const VkDevice device, const VkDescriptorPool descriptor_pool) +Descriptor::Factory::Factory( + const VkDevice device, + const VkDescriptorPool descriptor_pool) : device_(device), descriptor_pool_(descriptor_pool) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + device, + "Invalid Vulkan device!"); + + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + descriptor_pool, + "Invalid Vulkan descriptor pool!"); } VkDescriptorSet Descriptor::Factory::allocate( @@ -92,7 +119,13 @@ VkDescriptorSet Descriptor::Factory::allocate( VkDescriptorSet descriptor_set{}; VK_CHECK(vkAllocateDescriptorSets( - device_, &descriptor_set_allocate_info, &descriptor_set)); + device_, + &descriptor_set_allocate_info, + &descriptor_set)); + + TORCH_CHECK( + descriptor_set, + "Invalid Vulkan descriptor set!"); return descriptor_set; } diff --git a/aten/src/ATen/native/vulkan/api/Descriptor.h b/aten/src/ATen/native/vulkan/api/Descriptor.h index 3e339ae4641f..bc6c14723990 100644 --- a/aten/src/ATen/native/vulkan/api/Descriptor.h +++ b/aten/src/ATen/native/vulkan/api/Descriptor.h @@ -49,7 +49,7 @@ namespace api { // as well. This behavior is by design. 
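// With available() now living in Runtime.h and context() returning a checked
// pointer, a call site would look roughly like this (hypothetical sketch, not
// part of this patch):

void run_if_vulkan_available() {
  namespace api = at::native::vulkan::api;
  if (!api::available()) {
    return; // Vulkan backend not present on this platform
  }
  api::Context* const context = api::context(); // TORCH_CHECK'd to be non-null
  api::Command& command = context->command();   // per-context command state
  (void)command;
}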
// -struct C10_EXPORT Descriptor final { +struct Descriptor final { // // Pool // @@ -72,7 +72,7 @@ struct C10_EXPORT Descriptor final { class Factory final { public: - explicit Factory(VkDevice device); + explicit Factory(const GPU& gpu); typedef Pool::Descriptor Descriptor; typedef VK_DELETER(DescriptorPool) Deleter; @@ -95,8 +95,8 @@ struct C10_EXPORT Descriptor final { typedef api::Cache Cache; Cache cache; - explicit Pool(const VkDevice device) - : cache(Factory(device)) { + explicit Pool(const GPU& gpu) + : cache(Factory(gpu)) { } static void purge(VkDevice device, VkDescriptorPool descriptor_pool); @@ -118,9 +118,9 @@ struct C10_EXPORT Descriptor final { VkDescriptorPool descriptor_pool_; } factory; - explicit Descriptor(const VkDevice device) - : pool(device), - factory(device, pool.cache.retrieve(Pool::kDefault)) { + explicit Descriptor(const GPU& gpu) + : pool(gpu), + factory(gpu.device, pool.cache.retrieve(Pool::kDefault)) { } }; @@ -156,8 +156,8 @@ inline size_t Descriptor::Pool::Factory::Hasher::operator()( } // namespace at inline bool operator==( - const VkDescriptorPoolSize& descriptor_pool_size_1, - const VkDescriptorPoolSize& descriptor_pool_size_2) { - return (descriptor_pool_size_1.type == descriptor_pool_size_2.type) && - (descriptor_pool_size_1.descriptorCount == descriptor_pool_size_2.descriptorCount); + const VkDescriptorPoolSize& _1, + const VkDescriptorPoolSize& _2) { + return (_1.type == _2.type) && + (_1.descriptorCount == _2.descriptorCount); } diff --git a/aten/src/ATen/native/vulkan/api/Pipeline.cpp b/aten/src/ATen/native/vulkan/api/Pipeline.cpp index 303eea7cb401..bd9881c05443 100644 --- a/aten/src/ATen/native/vulkan/api/Pipeline.cpp +++ b/aten/src/ATen/native/vulkan/api/Pipeline.cpp @@ -5,12 +5,19 @@ namespace native { namespace vulkan { namespace api { -Pipeline::Layout::Factory::Factory(const VkDevice device) - : device_(device) { +Pipeline::Layout::Factory::Factory(const GPU& gpu) + : device_(gpu.device) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + device_, + "Invalid Vulkan device!"); } typename Pipeline::Layout::Factory::Handle Pipeline::Layout::Factory::operator()( const Descriptor& descriptor) const { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + descriptor.descriptor_set_layout, + "Invalid Vulkan descriptor set layout!"); + const VkPipelineLayoutCreateInfo pipeline_layout_create_info{ VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, nullptr, @@ -23,7 +30,14 @@ typename Pipeline::Layout::Factory::Handle Pipeline::Layout::Factory::operator() VkPipelineLayout pipeline_layout{}; VK_CHECK(vkCreatePipelineLayout( - device_, &pipeline_layout_create_info, nullptr, &pipeline_layout)); + device_, + &pipeline_layout_create_info, + nullptr, + &pipeline_layout)); + + TORCH_CHECK( + pipeline_layout, + "Invalid Vulkan pipeline layout!"); return Handle{ pipeline_layout, @@ -34,6 +48,10 @@ typename Pipeline::Layout::Factory::Handle Pipeline::Layout::Factory::operator() namespace { VkPipelineCache create_pipeline_cache(const VkDevice device) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + device, + "Invalid Vulkan device!"); + const VkPipelineCacheCreateInfo pipeline_cache_create_info{ VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO, nullptr, @@ -44,20 +62,44 @@ VkPipelineCache create_pipeline_cache(const VkDevice device) { VkPipelineCache pipeline_cache{}; VK_CHECK(vkCreatePipelineCache( - device, &pipeline_cache_create_info, nullptr, &pipeline_cache)); + device, + &pipeline_cache_create_info, + nullptr, + &pipeline_cache)); + + TORCH_CHECK( + pipeline_cache, + "Invalid Vulkan pipeline 
cache!"); return pipeline_cache; } } // namespace -Pipeline::Factory::Factory(const VkDevice device) - : device_(device), - pipeline_cache_(create_pipeline_cache(device), VK_DELETER(PipelineCache)(device)) { +Pipeline::Factory::Factory(const GPU& gpu) + : device_(gpu.device), + pipeline_cache_( + create_pipeline_cache(device_), + VK_DELETER(PipelineCache)(device_)) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + device_, + "Invalid Vulkan device!"); + + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + pipeline_cache_, + "Invalid Vulkan pipeline cache!"); } typename Pipeline::Factory::Handle Pipeline::Factory::operator()( const Descriptor& descriptor) const { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + descriptor.pipeline_layout, + "Invalid Vulkan pipeline layout!"); + + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + descriptor.shader_module, + "Invalid Vulkan shader module!"); + constexpr uint32_t x_offset = 0u; constexpr uint32_t x_size = sizeof(Shader::WorkGroup::x); constexpr uint32_t y_offset = x_offset + x_size; @@ -113,7 +155,16 @@ typename Pipeline::Factory::Handle Pipeline::Factory::operator()( VkPipeline pipeline{}; VK_CHECK(vkCreateComputePipelines( - device_, pipeline_cache_.get(), 1u, &compute_pipeline_create_info, nullptr, &pipeline)); + device_, + pipeline_cache_.get(), + 1u, + &compute_pipeline_create_info, + nullptr, + &pipeline)); + + TORCH_CHECK( + pipeline, + "Invalid Vulkan pipeline!"); return Handle{ pipeline, diff --git a/aten/src/ATen/native/vulkan/api/Pipeline.h b/aten/src/ATen/native/vulkan/api/Pipeline.h index a5d72324c36e..c327a140eded 100644 --- a/aten/src/ATen/native/vulkan/api/Pipeline.h +++ b/aten/src/ATen/native/vulkan/api/Pipeline.h @@ -29,7 +29,7 @@ namespace api { // these Vulkan objects. // -struct C10_EXPORT Pipeline final { +struct Pipeline final { // // Layout // @@ -49,7 +49,7 @@ struct C10_EXPORT Pipeline final { class Factory final { public: - explicit Factory(VkDevice device); + explicit Factory(const GPU& gpu); typedef Layout::Descriptor Descriptor; typedef VK_DELETER(PipelineLayout) Deleter; @@ -72,8 +72,8 @@ struct C10_EXPORT Pipeline final { typedef api::Cache Cache; Cache cache; - explicit Layout(const VkDevice device) - : cache(Factory(device)) { + explicit Layout(const GPU& gpu) + : cache(Factory(gpu)) { } } layout; @@ -93,7 +93,7 @@ struct C10_EXPORT Pipeline final { class Factory final { public: - explicit Factory(VkDevice device); + explicit Factory(const GPU& gpu); typedef Pipeline::Descriptor Descriptor; typedef VK_DELETER(Pipeline) Deleter; @@ -117,9 +117,9 @@ struct C10_EXPORT Pipeline final { typedef api::Cache Cache; Cache cache; - explicit Pipeline(const VkDevice device) - : layout(device), - cache(Factory(device)) { + explicit Pipeline(const GPU& gpu) + : layout(gpu), + cache(Factory(gpu)) { } }; diff --git a/aten/src/ATen/native/vulkan/api/Resource.cpp b/aten/src/ATen/native/vulkan/api/Resource.cpp index c538a1b6e2d0..6969883cb183 100644 --- a/aten/src/ATen/native/vulkan/api/Resource.cpp +++ b/aten/src/ATen/native/vulkan/api/Resource.cpp @@ -1,4 +1,5 @@ #include +#include namespace at { namespace native { @@ -10,6 +11,18 @@ VmaAllocator create_allocator( const VkInstance instance, const VkPhysicalDevice physical_device, const VkDevice device) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + instance, + "Invalid Vulkan instance!"); + + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + physical_device, + "Invalid Vulkan physical device!"); + + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + device, + "Invalid Vulkan device!"); + const VmaAllocatorCreateInfo allocator_create_info{ 0u, physical_device, 
@@ -27,6 +40,7 @@ VmaAllocator create_allocator( VmaAllocator allocator{}; VK_CHECK(vmaCreateAllocator(&allocator_create_info, &allocator)); + TORCH_CHECK(allocator, "Invalid VMA allocator!"); return allocator; } @@ -46,6 +60,7 @@ VmaAllocationCreateInfo create_allocation_create_info( } void release_buffer(const Resource::Buffer& buffer) { + // Safe to pass null as buffer or allocation. vmaDestroyBuffer( buffer.memory.allocator, buffer.handle, @@ -59,6 +74,7 @@ void release_image(const Resource::Image& image) { vkDestroyImageView(allocator_info.device, image.view, nullptr); } + // Safe to pass null as image or allocation. vmaDestroyImage( image.memory.allocator, image.handle, @@ -87,6 +103,13 @@ Resource::Memory::Scope::Scope( : allocator_(allocator), allocation_(allocation), access_(access) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + allocator, + "Invalid VMA allocator!"); + + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + allocation, + "Invalid VMA allocation!"); } void Resource::Memory::Scope::operator()(const void* const data) const { @@ -104,17 +127,20 @@ void Resource::Memory::Scope::operator()(const void* const data) const { } } -Resource::Pool::Pool( - const VkInstance instance, - const VkPhysicalDevice physical_device, - const VkDevice device) - : device_(device), - allocator_(create_allocator(instance, physical_device, device), vmaDestroyAllocator) { +Resource::Pool::Pool(const GPU& gpu) + : device_(gpu.device), + allocator_( + create_allocator( + gpu.adapter->runtime->instance(), + gpu.adapter->handle, + device_), + vmaDestroyAllocator) { buffers_.reserve(Configuration::kReserve); images_.reserve(Configuration::kReserve); } -Resource::Buffer Resource::Pool::allocate(const Buffer::Descriptor& descriptor) { +Resource::Buffer Resource::Pool::allocate( + const Buffer::Descriptor& descriptor) { const VkBufferCreateInfo buffer_create_info{ VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, nullptr, @@ -141,6 +167,9 @@ Resource::Buffer Resource::Pool::allocate(const Buffer::Descriptor& descriptor) &allocation, &allocation_info)); + TORCH_CHECK(buffer, "Invalid Vulkan buffer!"); + TORCH_CHECK(allocation, "Invalid VMA allocation!"); + buffers_.emplace_back( Buffer{ buffer, @@ -155,7 +184,8 @@ Resource::Buffer Resource::Pool::allocate(const Buffer::Descriptor& descriptor) return buffers_.back().get(); } -Resource::Image Resource::Pool::allocate(const Image::Descriptor& descriptor) { +Resource::Image Resource::Pool::allocate( + const Image::Descriptor& descriptor) { const VkImageCreateInfo image_create_info{ VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, nullptr, @@ -189,6 +219,9 @@ Resource::Image Resource::Pool::allocate(const Image::Descriptor& descriptor) { &allocation, &allocation_info)); + TORCH_CHECK(image, "Invalid Vulkan image!"); + TORCH_CHECK(allocation, "Invalid VMA allocation!"); + const VkImageViewCreateInfo image_view_create_info{ VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, nullptr, @@ -213,7 +246,14 @@ Resource::Image Resource::Pool::allocate(const Image::Descriptor& descriptor) { VkImageView view{}; VK_CHECK(vkCreateImageView( - device_, &image_view_create_info, nullptr, &view)) + device_, + &image_view_create_info, + nullptr, + &view)); + + TORCH_CHECK( + view, + "Invalid Vulkan image view!"); images_.emplace_back( Image{ diff --git a/aten/src/ATen/native/vulkan/api/Resource.h b/aten/src/ATen/native/vulkan/api/Resource.h index 04cd9a067663..00145ebe071f 100644 --- a/aten/src/ATen/native/vulkan/api/Resource.h +++ b/aten/src/ATen/native/vulkan/api/Resource.h @@ -8,7 +8,7 @@ namespace native { namespace vulkan { 
namespace api { -struct C10_EXPORT Resource final { +struct Resource final { /* Memory */ @@ -25,12 +25,25 @@ struct C10_EXPORT Resource final { template< typename Type, typename Pointer = std::add_pointer_t>> - Data map() const; + Data map() const &; template< typename Type, typename Pointer = std::add_pointer_t> - Data map(); + Data map() &; + + private: + // Intentionally disabed to ensure memory access is always properly + // encapsualted in a scoped map-unmap region. Allowing below overloads + // to be invoked on a temporary would open the door to the possibility + // of accessing the underlying memory out of the expected scope making + // for seemingly ineffective memory writes and hard to hunt down bugs. + + template + Data map() const && = delete; + + template + Data map() && = delete; }; /* @@ -95,10 +108,7 @@ struct C10_EXPORT Resource final { class Pool final { public: - Pool( - VkInstance instance, - VkPhysicalDevice physical_device, - VkDevice device); + explicit Pool(const GPU& gpu); Buffer allocate(const Buffer::Descriptor& descriptor); Image allocate(const Image::Descriptor& descriptor); @@ -115,11 +125,8 @@ struct C10_EXPORT Resource final { std::vector> images_; } pool; - Resource( - const VkInstance instance, - const VkPhysicalDevice physical_device, - const VkDevice device) - : pool(instance, physical_device, device) { + explicit Resource(const GPU& gpu) + : pool(gpu) { } }; @@ -144,7 +151,7 @@ class Resource::Memory::Scope final { }; template -inline Resource::Memory::Data Resource::Memory::map() const { +inline Resource::Memory::Data Resource::Memory::map() const & { void* map(const Memory& memory); return Data{ @@ -154,7 +161,7 @@ inline Resource::Memory::Data Resource::Memory::map() const { } template -inline Resource::Memory::Data Resource::Memory::map() { +inline Resource::Memory::Data Resource::Memory::map() & { void* map(const Memory& memory); return Data{ diff --git a/aten/src/ATen/native/vulkan/api/Runtime.cpp b/aten/src/ATen/native/vulkan/api/Runtime.cpp new file mode 100644 index 000000000000..ce6e3b4231e4 --- /dev/null +++ b/aten/src/ATen/native/vulkan/api/Runtime.cpp @@ -0,0 +1,343 @@ +#include +#include + +#include + +namespace at { +namespace native { +namespace vulkan { +namespace api { +namespace { + +struct Configuration final { +#ifndef DEBUG + static constexpr Runtime::Type kRuntime = Runtime::Type::Debug; +#else + static constexpr Runtime::Type kRuntime = Runtime::Type::Release; +#endif +}; + +VKAPI_ATTR VkBool32 VKAPI_CALL debug_report_callback_fn( + const VkDebugReportFlagsEXT flags, + const VkDebugReportObjectTypeEXT /* object_type */, + const uint64_t /* object */, + const size_t /* location */, + const int32_t message_code, + const char* const layer_prefix, + const char* const message, + void* const /* user_data */) { + std::stringstream stream; + stream << layer_prefix << " " << message_code << " " << message << std::endl; + const std::string log = stream.str(); + + if (flags & VK_DEBUG_REPORT_ERROR_BIT_EXT) { + LOG(ERROR) << log; + } else if (flags & VK_DEBUG_REPORT_WARNING_BIT_EXT) { + LOG(WARNING) << log; + } else if (flags & VK_DEBUG_REPORT_PERFORMANCE_WARNING_BIT_EXT) { + LOG(WARNING) << "Performance:" << log; + } else if (flags & VK_DEBUG_REPORT_INFORMATION_BIT_EXT) { + LOG(INFO) << log; + } else if (flags & VK_DEBUG_REPORT_DEBUG_BIT_EXT) { + LOG(INFO) << "Debug: " << log; + } + + return VK_FALSE; +} + +VkInstance create_instance(const Runtime::Type type) { + std::vector enabled_instance_layers; + std::vector 
enabled_instance_extensions; + + if (Runtime::Type::Debug == type) { + uint32_t instance_layers_count = 0; + VK_CHECK(vkEnumerateInstanceLayerProperties( + &instance_layers_count, nullptr)); + + std::vector instance_layer_properties( + instance_layers_count); + + VK_CHECK(vkEnumerateInstanceLayerProperties( + &instance_layers_count, + instance_layer_properties.data())); + + constexpr const char* const requested_instance_layers[]{ + // "VK_LAYER_LUNARG_api_dump", + "VK_LAYER_KHRONOS_validation", + }; + + for (const auto& requested_instance_layer : requested_instance_layers) { + for (const auto& layer : instance_layer_properties) { + if (strcmp(requested_instance_layer, layer.layerName) == 0) { + enabled_instance_layers.push_back(requested_instance_layer); + break; + } + } + } + + uint32_t instance_extension_count = 0; + VK_CHECK(vkEnumerateInstanceExtensionProperties( + nullptr, &instance_extension_count, nullptr)); + + std::vector instance_extension_properties( + instance_extension_count); + + VK_CHECK(vkEnumerateInstanceExtensionProperties( + nullptr, &instance_extension_count, instance_extension_properties.data())); + + constexpr const char* const requested_instance_extensions[]{ + VK_EXT_DEBUG_REPORT_EXTENSION_NAME, + }; + + for (const auto& requested_instance_extension : requested_instance_extensions) { + for (const auto& extension : instance_extension_properties) { + if (strcmp(requested_instance_extension, extension.extensionName) == 0) { + enabled_instance_extensions.push_back(requested_instance_extension); + break; + } + } + } + } + + constexpr VkApplicationInfo application_info{ + VK_STRUCTURE_TYPE_APPLICATION_INFO, + nullptr, + "PyTorch", + 0, + "PyTorch", + 0, + VK_API_VERSION_1_0, + }; + + const VkInstanceCreateInfo instance_create_info{ + VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO, + nullptr, + 0u, + &application_info, + static_cast(enabled_instance_layers.size()), + enabled_instance_layers.data(), + static_cast(enabled_instance_extensions.size()), + enabled_instance_extensions.data(), + }; + + VkInstance instance{}; + VK_CHECK(vkCreateInstance(&instance_create_info, nullptr, &instance)); + TORCH_CHECK(instance, "Invalid Vulkan instance!"); + + return instance; +} + +VkDebugReportCallbackEXT create_debug_report_callback( + const VkInstance instance, + const Runtime::Type type) { + if (Runtime::Type::Debug != type) { + return VkDebugReportCallbackEXT{}; + } + + const VkDebugReportCallbackCreateInfoEXT debugReportCallbackCreateInfo{ + VK_STRUCTURE_TYPE_DEBUG_REPORT_CALLBACK_CREATE_INFO_EXT, + nullptr, + VK_DEBUG_REPORT_INFORMATION_BIT_EXT | + VK_DEBUG_REPORT_WARNING_BIT_EXT | + VK_DEBUG_REPORT_PERFORMANCE_WARNING_BIT_EXT | + VK_DEBUG_REPORT_ERROR_BIT_EXT | + VK_DEBUG_REPORT_DEBUG_BIT_EXT, + debug_report_callback_fn, + nullptr, + }; + + const auto vkCreateDebugReportCallbackEXT = + (PFN_vkCreateDebugReportCallbackEXT)vkGetInstanceProcAddr( + instance, "vkCreateDebugReportCallbackEXT"); + + TORCH_CHECK( + vkCreateDebugReportCallbackEXT, + "Could not load vkCreateDebugReportCallbackEXT"); + + VkDebugReportCallbackEXT debug_report_callback{}; + VK_CHECK(vkCreateDebugReportCallbackEXT( + instance, + &debugReportCallbackCreateInfo, + nullptr, + &debug_report_callback)); + + TORCH_CHECK( + debug_report_callback, + "Invalid Vulkan debug report callback!"); + + return debug_report_callback; +} + +std::vector acquire_physical_devices( + const VkInstance instance) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + instance, + "Invalid Vulkan instance!"); + + uint32_t device_count = 0; + 
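// acquire_physical_devices() below, like the layer/extension queries above,
// uses the standard Vulkan enumerate-twice idiom: one call to obtain the
// count, a second to fill a correctly sized buffer. Generic sketch (VK_CHECK
// error handling omitted for brevity):

#include <vector>
#include <vulkan/vulkan.h>

std::vector<VkLayerProperties> enumerate_instance_layers() {
  uint32_t count = 0u;
  vkEnumerateInstanceLayerProperties(&count, nullptr);       // query count
  std::vector<VkLayerProperties> layers(count);
  vkEnumerateInstanceLayerProperties(&count, layers.data()); // fill data
  return layers;
}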
VK_CHECK(vkEnumeratePhysicalDevices(instance, &device_count, nullptr)); + + TORCH_CHECK( + device_count > 0, + "Vulkan: Could not find a device with Vulkan support!"); + + std::vector devices(device_count); + VK_CHECK(vkEnumeratePhysicalDevices(instance, &device_count, devices.data())); + + return devices; +} + +VkPhysicalDeviceProperties query_physical_device_properties( + const VkPhysicalDevice physical_device) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + physical_device, + "Invalid Vulkan physical device!"); + + VkPhysicalDeviceProperties physical_device_properties{}; + vkGetPhysicalDeviceProperties( + physical_device, + &physical_device_properties); + + return physical_device_properties; +} + +VkPhysicalDeviceMemoryProperties query_physical_device_memory_properties( + const VkPhysicalDevice physical_device) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + physical_device, + "Invalid Vulkan physical device!"); + + VkPhysicalDeviceMemoryProperties physical_device_memory_properties{}; + vkGetPhysicalDeviceMemoryProperties( + physical_device, + &physical_device_memory_properties); + + return physical_device_memory_properties; +} + +uint32_t query_compute_queue_family_index(const VkPhysicalDevice physical_device) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + physical_device, + "Invalid Vulkan physical device!"); + + uint32_t queue_family_count = 0; + vkGetPhysicalDeviceQueueFamilyProperties( + physical_device, &queue_family_count, nullptr); + + TORCH_CHECK( + queue_family_count > 0, + "Vulkan: Invalid number of queue families!"); + + std::vector + queue_families_properties(queue_family_count); + + vkGetPhysicalDeviceQueueFamilyProperties( + physical_device, + &queue_family_count, + queue_families_properties.data()); + + for (uint32_t i = 0; i < queue_families_properties.size(); ++i) { + const VkQueueFamilyProperties& properties = queue_families_properties[i]; + if (properties.queueCount > 0 && (properties.queueFlags & VK_QUEUE_COMPUTE_BIT)) { + return i; + } + } + + TORCH_CHECK( + false, + "Vulkan: Could not find a queue family that supports compute operations!"); +} + +} // namespace + +Runtime::Debug::Debug(const VkInstance instance) + : instance_(instance) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + instance, + "Invalid Vulkan instance!"); +} + +void Runtime::Debug::operator()( + const VkDebugReportCallbackEXT debug_report_callback) const { + if (debug_report_callback) { + const auto vkDestroyDebugReportCallbackEXT = + (PFN_vkDestroyDebugReportCallbackEXT)vkGetInstanceProcAddr( + instance_, "vkDestroyDebugReportCallbackEXT"); + + TORCH_CHECK( + vkDestroyDebugReportCallbackEXT, + "Could not load vkDestroyDebugReportCallbackEXT"); + + vkDestroyDebugReportCallbackEXT( + instance_, debug_report_callback, nullptr); + } +} + +Runtime::Runtime(const Type type) + : instance_(create_instance(type), &VK_DELETER(Instance)), + debug_report_callback_( + create_debug_report_callback(instance(), type), + Debug(instance())) { +} + +Adapter Runtime::select(const Selector& selector) { + const std::vector physical_devices = + acquire_physical_devices(instance()); + + for (const VkPhysicalDevice physical_device : physical_devices) { + const Adapter adapter{ + this, + physical_device, + query_physical_device_properties(physical_device), + query_physical_device_memory_properties(physical_device), + query_compute_queue_family_index(physical_device), + }; + + if (selector(adapter)) { + return adapter; + } + } + + TORCH_CHECK( + false, + "Vulkan: no adapter was selected as part of device enumeration!"); +} + +Runtime* initialize() { + 
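// initialize(), continued below, follows the same lazy-singleton pattern as
// the old Context initializer: construct once, convert any construction
// failure into nullptr, and let available()/runtime() interpret the result.
// The pattern in isolation (generic sketch, not part of this patch):

#include <memory>

template <typename T>
T* lazy_singleton() {
  static const std::unique_ptr<T> instance([]() -> T* {
    try {
      return new T();
    } catch (...) {
      return nullptr; // construction failure means "backend not available"
    }
  }());
  return instance.get();
}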
static const std::unique_ptr runtime([]() -> Runtime* { +#ifdef USE_VULKAN_WRAPPER + if (!InitVulkan()) { + TORCH_WARN("Vulkan: Wrapper Failed to InitVulkan!"); + return nullptr; + } +#endif + + try { + return new Runtime(Configuration::kRuntime); + } + catch (...) { + return nullptr; + } + }()); + + return runtime.get(); +} + +bool available() { + return initialize(); +} + +Runtime* runtime() { + Runtime* const runtime = initialize(); + TORCH_CHECK( + runtime, + "Vulkan: Backend not available on this platform!" + "Calls to api::runtime() must have been guarded by api::available()."); + + return runtime; +} + +} // namespace api +} // namespace vulkan +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/vulkan/api/Runtime.h b/aten/src/ATen/native/vulkan/api/Runtime.h new file mode 100644 index 000000000000..766aeb50cabc --- /dev/null +++ b/aten/src/ATen/native/vulkan/api/Runtime.h @@ -0,0 +1,64 @@ +#pragma once + +#include + +namespace at { +namespace native { +namespace vulkan { +namespace api { + +// +// A Vulkan Runtime initializes a Vulkan instance and decouples the concept of +// Vulkan instance initialization from intialization of, and subsequent +// interactions with, Vulkan [physical and logical] devices as a precursor to +// multi-GPU support. The Vulkan Runtime can be queried for available Adapters +// (i.e. physical devices) in the system which in turn can be used for creation +// of a Vulkan Context (i.e. logical devices). All Vulkan tensors in PyTorch +// are associated with a Context to make tensor <-> device affinity explicit. +// + +class Runtime final { + public: + enum class Type { + Debug, + Release, + }; + + explicit Runtime(Type type); + Runtime(const Runtime&) = delete; + Runtime(Runtime&&) = default; + Runtime& operator=(const Runtime&) = delete; + Runtime& operator=(Runtime&&) = default; + ~Runtime() = default; + + inline VkInstance instance() const { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(instance_); + return instance_.get(); + } + + typedef std::function Selector; + Adapter select(const Selector& selector); + + private: + class Debug final { + public: + explicit Debug(VkInstance instance); + void operator()(VkDebugReportCallbackEXT debug_report_callback) const; + + private: + VkInstance instance_; + }; + + private: + // Construction and destruction order matters. Do not move members around. 
+ Handle instance_; + Handle debug_report_callback_; +}; + +bool available(); +Runtime* runtime(); + +} // namespace api +} // namespace vulkan +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/vulkan/api/Shader.cpp b/aten/src/ATen/native/vulkan/api/Shader.cpp index bbd3e3464d78..977f915a61d1 100644 --- a/aten/src/ATen/native/vulkan/api/Shader.cpp +++ b/aten/src/ATen/native/vulkan/api/Shader.cpp @@ -9,8 +9,12 @@ namespace native { namespace vulkan { namespace api { -Shader::Layout::Factory::Factory(const VkDevice device) - : device_(device) { + +Shader::Layout::Factory::Factory(const GPU& gpu) + : device_(gpu.device) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + device_, + "Invalid Vulkan device!"); } Shader::Layout::Factory::Handle Shader::Layout::Factory::operator()( @@ -25,7 +29,14 @@ Shader::Layout::Factory::Handle Shader::Layout::Factory::operator()( VkDescriptorSetLayout descriptor_set_layout{}; VK_CHECK(vkCreateDescriptorSetLayout( - device_, &descriptor_set_layout_create_info, nullptr, &descriptor_set_layout)); + device_, + &descriptor_set_layout_create_info, + nullptr, + &descriptor_set_layout)); + + TORCH_CHECK( + descriptor_set_layout, + "Invalid Vulkan descriptor set layout!"); return Handle{ descriptor_set_layout, @@ -35,6 +46,8 @@ Shader::Layout::Factory::Handle Shader::Layout::Factory::operator()( Shader::Descriptor::Descriptor(const char* const glsl) : type(Type::Source) { + TORCH_CHECK(glsl, "Invalid shader source code!"); + shader.source = { glsl, 0u, @@ -43,6 +56,8 @@ Shader::Descriptor::Descriptor(const char* const glsl) Shader::Descriptor::Descriptor(const uint32_t* const code, const uint32_t size) : type(Type::Binary) { + TORCH_CHECK(code && (0u != size), "Invalid shader binary!"); + shader.binary = { code, size, @@ -68,6 +83,10 @@ struct Shader::Factory::Compiler final { } std::vector compile(const char* const source) const { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + source, + "Invalid shader source code!"); + const shaderc::SpvCompilationResult result = context.CompileGlslToSpv( source, ::strlen(source), @@ -95,8 +114,8 @@ struct Shader::Factory::Compiler final { #endif /* USE_VULKAN_SHADERC_RUNTIME */ -Shader::Factory::Factory(const VkDevice device) - : device_(device), +Shader::Factory::Factory(const GPU& gpu) + : device_(gpu.device), compiler_(new Compiler) { } @@ -139,7 +158,14 @@ typename Shader::Factory::Handle Shader::Factory::operator()( VkShaderModule shader_module{}; VK_CHECK(vkCreateShaderModule( - device_, &shader_module_create_info, nullptr, &shader_module)); + device_, + &shader_module_create_info, + nullptr, + &shader_module)); + + TORCH_CHECK( + shader_module, + "Invalid Vulkan shader module!"); return Handle{ shader_module, diff --git a/aten/src/ATen/native/vulkan/api/Shader.h b/aten/src/ATen/native/vulkan/api/Shader.h index 0fd2fa01614b..ff02b2ba9064 100644 --- a/aten/src/ATen/native/vulkan/api/Shader.h +++ b/aten/src/ATen/native/vulkan/api/Shader.h @@ -32,7 +32,7 @@ namespace api { // and destruct the aforementioned Vulkan objects. 
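// The two Shader::Descriptor constructors guarded above accept either GLSL
// source (compiled at runtime when the shaderc path, USE_VULKAN_SHADERC_RUNTIME,
// is enabled) or a precompiled SPIR-V binary. A minimal source-based sketch
// (placeholder GLSL, hypothetical call site):

using at::native::vulkan::api::Shader;

const char* const kGlsl = "#version 450\nvoid main() {}\n"; // placeholder
const Shader::Descriptor descriptor(kGlsl); // Type::Source; checked non-null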
// -struct C10_EXPORT Shader final { +struct Shader final { // // Layout // @@ -52,7 +52,7 @@ struct C10_EXPORT Shader final { class Factory final { public: - explicit Factory(VkDevice device); + explicit Factory(const GPU& gpu); typedef Layout::Descriptor Descriptor; typedef VK_DELETER(DescriptorSetLayout) Deleter; @@ -75,8 +75,8 @@ struct C10_EXPORT Shader final { typedef api::Cache Cache; Cache cache; - explicit Layout(const VkDevice device) - : cache(Factory(device)) { + explicit Layout(const GPU& gpu) + : cache(Factory(gpu)) { } } layout; @@ -122,7 +122,7 @@ struct C10_EXPORT Shader final { class Factory final { public: - explicit Factory(VkDevice device); + explicit Factory(const GPU& gpu); Factory(const Factory&) = delete; Factory& operator=(const Factory&) = delete; Factory(Factory&&); @@ -152,9 +152,9 @@ struct C10_EXPORT Shader final { typedef api::Cache Cache; Cache cache; - explicit Shader(const VkDevice device) - : layout(device), - cache(Factory(device)) { + explicit Shader(const GPU& gpu) + : layout(gpu), + cache(Factory(gpu)) { } }; @@ -187,11 +187,11 @@ inline size_t Shader::Layout::Factory::Hasher::operator()( } inline bool operator==( - const Shader::WorkGroup& work_group_1, - const Shader::WorkGroup& work_group_2) { - return (work_group_1.x == work_group_2.x) && - (work_group_1.y == work_group_2.y) && - (work_group_1.z == work_group_2.z); + const Shader::WorkGroup& _1, + const Shader::WorkGroup& _2) { + return (_1.x == _2.x) && + (_1.y == _2.y) && + (_1.z == _2.z); } inline bool operator==( diff --git a/aten/src/ATen/native/vulkan/api/api.h b/aten/src/ATen/native/vulkan/api/api.h index 394f55d7d525..658824e3bf2b 100644 --- a/aten/src/ATen/native/vulkan/api/api.h +++ b/aten/src/ATen/native/vulkan/api/api.h @@ -2,9 +2,11 @@ #include +#include #include #include #include #include #include +#include #include diff --git a/aten/src/ATen/quantized/Quantizer.cpp b/aten/src/ATen/quantized/Quantizer.cpp index 1f9225b52770..5ab64d2cb803 100644 --- a/aten/src/ATen/quantized/Quantizer.cpp +++ b/aten/src/ATen/quantized/Quantizer.cpp @@ -77,6 +77,18 @@ QTensorImpl* get_qtensorimpl(const Tensor& self) { return static_cast(self.unsafeGetTensorImpl()); } +int64_t get_sub_byte_tensor_size(int64_t size_bytes, at::ScalarType t) { + int64_t new_size_bytes; + switch(t) { + case at::ScalarType::QUInt4x2: + new_size_bytes = std::ceil(size_bytes * 0.5); + break; + default: + new_size_bytes = size_bytes; + } + return new_size_bytes; +} + inline Tensor new_qtensor( IntArrayRef sizes, const TensorOptions& options, @@ -99,7 +111,9 @@ inline Tensor new_qtensor( TORCH_CHECK( isQIntType(typeMetaToScalarType(dtype)), "ScalarType is not supported in new_qtensor."); - int64_t size_bytes = nelements * dtype.itemsize(); + auto scalar_type = typeMetaToScalarType(dtype); + int64_t size_bytes = get_sub_byte_tensor_size(nelements * dtype.itemsize(), scalar_type); + auto storage = c10::make_intrusive( StorageImpl::use_byte_size_t(), size_bytes, diff --git a/aten/src/ATen/record_function.cpp b/aten/src/ATen/record_function.cpp index 8bf25c3cac2f..26e9fd9f21fa 100644 --- a/aten/src/ATen/record_function.cpp +++ b/aten/src/ATen/record_function.cpp @@ -18,9 +18,7 @@ RecordFunctionHandle next_unique_record_function_handle() { return RecordFunctionHandle(++unique_rf_id); } -// Thread local vector of callbacks, holds pairs (callbacks, unique_id); -// must be sorted in increasing handles order -thread_local RecordFunctionCallbacks sorted_tls_callbacks_; +thread_local RecordFunctionTLS rf_tls_; std::atomic 
defaultNodeId(-1); @@ -52,13 +50,21 @@ double sample_zero_one() { } // namespace +const RecordFunctionTLS& get_record_function_tls_() { + return rf_tls_; +} + +void set_record_function_tls_(const RecordFunctionTLS& tls) { + rf_tls_ = tls; +} + class CallbackManager { public: CallbackHandle addThreadLocalCallback(RecordFunctionCallback cb) { // note: monotonically increasing callbacks_unique_id keeps // sorted_tls_callbacks_ sorted auto handle = next_unique_callback_handle(); - sorted_tls_callbacks_.emplace_back(std::move(cb), handle); + rf_tls_.sorted_tls_callbacks_.emplace_back(std::move(cb), handle); return handle; } @@ -85,7 +91,7 @@ class CallbackManager { } return false; }; - auto found = find_and_remove(sorted_tls_callbacks_); + auto found = find_and_remove(rf_tls_.sorted_tls_callbacks_); if (!found) { found = find_and_remove(sorted_global_callbacks_); } @@ -99,7 +105,7 @@ class CallbackManager { } void clearThreadLocalCallbacks() { - sorted_tls_callbacks_.clear(); + rf_tls_.sorted_tls_callbacks_.clear(); } inline bool hasGlobalCallbacks() const { @@ -107,7 +113,7 @@ class CallbackManager { } inline bool hasThreadLocalCallbacks() const { - return !sorted_tls_callbacks_.empty(); + return !rf_tls_.sorted_tls_callbacks_.empty(); } // init is called by RecordFunction in constructor to @@ -141,7 +147,7 @@ class CallbackManager { ctx_list.resize(num_callbacks); }; - init_handles(rec_fn.sorted_active_tls_handles_, sorted_tls_callbacks_, rec_fn.tls_ctx_); + init_handles(rec_fn.sorted_active_tls_handles_, rf_tls_.sorted_tls_callbacks_, rec_fn.tls_ctx_); init_handles(rec_fn.sorted_active_global_handles_, sorted_global_callbacks_, rec_fn.global_ctx_); rec_fn.active = found_active_cb; rec_fn.needs_inputs = found_needs_inputs; @@ -158,7 +164,7 @@ class CallbackManager { /* is_start */ true, rf); mergeRunCallbacks( - sorted_tls_callbacks_, + rf_tls_.sorted_tls_callbacks_, rf.sorted_active_tls_handles_, rf.tls_ctx_, /* is_start */ true, @@ -174,13 +180,16 @@ class CallbackManager { /* is_start */ false, rf); mergeRunCallbacks( - sorted_tls_callbacks_, + rf_tls_.sorted_tls_callbacks_, rf.sorted_active_tls_handles_, rf.tls_ctx_, /* is_start */ false, rf); } + // Global callbacks; must be sorted in increasing handle order + RecordFunctionCallbacks sorted_global_callbacks_; + private: bool tryRunCallback( const RecordFunctionCallback& rfcb, @@ -235,9 +244,6 @@ class CallbackManager { << "the code after profiler is finished"; } } - - // Global callbacks; must be sorted in increasing handle order - RecordFunctionCallbacks sorted_global_callbacks_; }; namespace { @@ -281,15 +287,15 @@ bool RecordFunctionCallback::shouldRun(RecordScope scope) const { } RecordFunctionCallbacks _getTLSCallbacks() { - return sorted_tls_callbacks_; + return rf_tls_.sorted_tls_callbacks_; } void _setTLSCallbacks(const RecordFunctionCallbacks& callbacks) { // keep the original handles - sorted_tls_callbacks_ = callbacks; + rf_tls_.sorted_tls_callbacks_ = callbacks; std::sort( - sorted_tls_callbacks_.begin(), - sorted_tls_callbacks_.end(), + rf_tls_.sorted_tls_callbacks_.begin(), + rf_tls_.sorted_tls_callbacks_.end(), [](const std::pair& l, const std::pair& r) { return l.second < r.second; @@ -338,16 +344,19 @@ void clearCallbacks() { } bool isRecordFunctionEnabled() { - return tls_record_function_enabled_; + return rf_tls_.tls_record_function_enabled_; } void enableRecordFunction(bool enable) { - tls_record_function_enabled_ = enable; + rf_tls_.tls_record_function_enabled_ = enable; } RecordFunction::RecordFunction(RecordScope 
scope) : scope_(scope) { - if (hasCallbacks() && isRecordFunctionEnabled()) { - manager().init(*this); + auto* rf_tls_ptr = &rf_tls_; + auto& m = manager(); + if (rf_tls_ptr->tls_record_function_enabled_ && + (!m.sorted_global_callbacks_.empty() || !rf_tls_ptr->sorted_tls_callbacks_.empty())) { + m.init(*this); } } diff --git a/aten/src/ATen/record_function.h b/aten/src/ATen/record_function.h index 9b4d11ef1d5f..cf839ad4a188 100644 --- a/aten/src/ATen/record_function.h +++ b/aten/src/ATen/record_function.h @@ -13,6 +13,8 @@ namespace at { enum class C10_API_ENUM RecordScope : uint8_t { // c10/ATen ops, autograd nodes FUNCTION = 0, + // Functions/nodes called from the autograd + BACKWARD_FUNCTION, // TorchScript functions, methods TORCHSCRIPT_FUNCTION, // User defined scope (e.g. with record_function()) @@ -115,10 +117,22 @@ struct TORCH_API RecordFunction { // Retrieves the thread_id that this RecordFunction ran start callbacks with. // Useful for writing thread safe end callbacks that may be potentially // executed in a different thread (async ops) - inline uint64_t getStartCallbacksThreadId() const { + inline uint64_t threadId() const { return thread_id_; } + // For backward functions - thread id of the corresponding forward function, + // or zero otherwise; + // used along with sequence number to correlate backward functions with + // the forward ones + inline uint64_t forwardThreadId() const { + return fwd_thread_id_; + } + + inline void setForwardThreadId(uint64_t thread_id) { + fwd_thread_id_ = thread_id; + } + inline RecordScope scope() const { return scope_; } @@ -205,6 +219,9 @@ struct TORCH_API RecordFunction { // The logical thread_id that this RecordFunction was created with uint64_t thread_id_ = 0; + // For backward functions - thread id of the forward function + uint64_t fwd_thread_id_ = 0; + // Unique id for this RecordFunction, used in callbacks to track start // and end of ranges RecordFunctionHandle handle_ {0}; @@ -471,4 +488,16 @@ class TORCH_API DisableRecordFunctionGuard : public RecordFunctionGuard { TORCH_API RecordFunctionCallbacks _getTLSCallbacks(); TORCH_API void _setTLSCallbacks(const RecordFunctionCallbacks& callbacks); +struct TORCH_API RecordFunctionTLS { + // Thread local vector of callbacks, holds pairs (callbacks, unique_id); + // must be sorted in increasing handles order + RecordFunctionCallbacks sorted_tls_callbacks_; + + bool tls_record_function_enabled_ = true; +}; + +TORCH_API const RecordFunctionTLS& get_record_function_tls_(); + +TORCH_API void set_record_function_tls_(const RecordFunctionTLS& tls); + } // namespace at diff --git a/aten/src/ATen/templates/BackendSelectRegister.cpp b/aten/src/ATen/templates/BackendSelectRegister.cpp index db7276913201..bcbf25f3117f 100644 --- a/aten/src/ATen/templates/BackendSelectRegister.cpp +++ b/aten/src/ATen/templates/BackendSelectRegister.cpp @@ -7,7 +7,6 @@ #include #include #include -#include #include namespace at { diff --git a/aten/src/ATen/templates/PerOpRegistration.cpp b/aten/src/ATen/templates/PerOpRegistration.cpp deleted file mode 100644 index 72ac3d784dad..000000000000 --- a/aten/src/ATen/templates/PerOpRegistration.cpp +++ /dev/null @@ -1,15 +0,0 @@ -// ${generated_comment} - -#include -#include -#include -#include -$extra_headers - -namespace at { - -TORCH_LIBRARY_FRAGMENT_THIS_API_IS_FOR_PER_OP_REGISTRATION_ONLY(aten, m) { - ${function_registrations} -} - -} // namespace at diff --git a/aten/src/ATen/templates/RegistrationDeclarations.h
b/aten/src/ATen/templates/RegistrationDeclarations.h new file mode 100644 index 000000000000..5a0f0d0c7b44 --- /dev/null +++ b/aten/src/ATen/templates/RegistrationDeclarations.h @@ -0,0 +1,4 @@ +// This file contains all native_functions that can be registered to +// and the schema string that they should be registered with + +${registration_declarations} diff --git a/aten/src/ATen/templates/TypeDefault.cpp b/aten/src/ATen/templates/TypeDefault.cpp index aa5bb4f0c838..58c80381d340 100644 --- a/aten/src/ATen/templates/TypeDefault.cpp +++ b/aten/src/ATen/templates/TypeDefault.cpp @@ -27,50 +27,45 @@ TORCH_LIBRARY(aten, m) { ${function_registrations}; // String Ops - // Implementations located in torch/csrc/jit/runtime/register_string_ops.cpp - m.def("splitlines(str self, bool keepends=False) -> str[]"); - m.def( - "slice.str(str string, int start, int end=9223372036854775807, int step=1) -> str"); - m.def("isupper(str self) -> bool"); - m.def("islower(str self) -> bool"); - m.def("capitalize(str self) -> str"); - m.def("title(str self) -> str"); - m.def("center(str self, int width, str fillchar=' ') -> str"); - m.def("count(str self, str substr, int start=0, int end=-1) -> int"); - m.def("endswith(str self, str substr, int start=0, int end=-1) -> bool"); - m.def("startswith(str self, str substr, int start=0, int end=-1) -> bool"); - m.def("expandtabs(str self, int tabsize=8) -> str"); - m.def("find(str self, str substr, int start=0, int end=-1) -> int"); - m.def("rfind(str self, str substr, int start=0, int end=-1) -> int"); - m.def("index.str(str self, str substr, int start=0, int end=-1) -> int"); - m.def("rindex(str self, str substr, int start=0, int end=-1) -> int"); - m.def("isidentifier(str self) -> bool"); - m.def("istitle(str self) -> bool"); - m.def("isprintable(str self) -> bool"); - m.def("ljust(str self, int width, str fillchar=' ') -> str"); - m.def("rjust(str self, int width, str fillchar=' ') -> str"); - m.def("zfill(str self, int width) -> str"); - m.def("lstrip(str self, str chars=' \\n\\t\\f\\v') -> str"); - m.def("rstrip(str self, str chars=' \\n\\t\\f\\v') -> str"); - m.def("strip(str self, str chars=' \\n\\t\\f\\v') -> str"); - m.def("replace(str self, str old, str new, int max=-1) -> str"); - m.def("partition(str self, str separator) -> (str, str, str)"); - m.def("rpartition(str self, str separator) -> (str, str, str)"); - m.def("split.str(str self, str? 
separator=None, int max=-1) -> str[]"); - m.def("rsplit(str self, str separator=' ', int max=-1) -> str[]"); - m.def("join(str self, str[] values) -> str"); - - // Integer Ops - // Implementations located in torch/csrc/jit/runtime/register_prim_ops_c10.cp - m.def("Int.Tensor(Tensor a) -> int"); - m.def("Int.bool(bool a) -> int"); - m.def("Int.float(float a) -> int"); - m.def("Int.Scalar(Scalar a) -> int"); - m.def("Int.str(str a) -> int"); + // Implementations located in torch/csrc/jit/runtime/register_prim_ops.cpp + m.def(TORCH_SELECTIVE_SCHEMA("aten::splitlines(str self, bool keepends=False) -> str[]")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::slice.str(str string, int start, int end=9223372036854775807, int step=1) -> str")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::isupper(str self) -> bool")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::islower(str self) -> bool")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::capitalize(str self) -> str")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::title(str self) -> str")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::center(str self, int width, str fillchar=' ') -> str")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::count(str self, str substr, int start=0, int end=-1) -> int")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::endswith(str self, str substr, int start=0, int end=-1) -> bool")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::startswith(str self, str substr, int start=0, int end=-1) -> bool")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::expandtabs(str self, int tabsize=8) -> str")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::find(str self, str substr, int start=0, int end=-1) -> int")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::rfind(str self, str substr, int start=0, int end=-1) -> int")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::index.str(str self, str substr, int start=0, int end=-1) -> int")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::rindex(str self, str substr, int start=0, int end=-1) -> int")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::isidentifier(str self) -> bool")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::istitle(str self) -> bool")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::isprintable(str self) -> bool")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::ljust(str self, int width, str fillchar=' ') -> str")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::rjust(str self, int width, str fillchar=' ') -> str")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::zfill(str self, int width) -> str")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::lstrip(str self, str chars=' \\n\\t\\f\\v') -> str")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::rstrip(str self, str chars=' \\n\\t\\f\\v') -> str")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::strip(str self, str chars=' \\n\\t\\f\\v') -> str")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::replace(str self, str old, str new, int max=-1) -> str")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::partition(str self, str separator) -> (str, str, str)")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::rpartition(str self, str separator) -> (str, str, str)")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::split.str(str self, str? 
separator=None, int max=-1) -> str[]")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::rsplit(str self, str separator=' ', int max=-1) -> str[]")); + m.def(TORCH_SELECTIVE_SCHEMA("aten::join(str self, str[] values) -> str")); // Distributed Ops // Implementations located in torch/csrc/jit/runtime/register_distributed_ops.cpp m.def("get_gradients(int context_id) -> Dict(Tensor, Tensor)"); } +TORCH_LIBRARY_IMPL(aten, Math, m) { + ${math_function_registrations}; +} + } // namespace at diff --git a/aten/src/ATen/test/CMakeLists.txt b/aten/src/ATen/test/CMakeLists.txt index a0b992302084..9f69c9d6ad6f 100644 --- a/aten/src/ATen/test/CMakeLists.txt +++ b/aten/src/ATen/test/CMakeLists.txt @@ -27,6 +27,7 @@ list(APPEND ATen_CPU_TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/extension_backend_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/xla_tensor_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/tensor_iterator_test.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/math_kernel_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/memory_overlapping_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpu_generator_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/pow_test.cpp @@ -78,11 +79,13 @@ list(APPEND ATen_VULKAN_TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/vulkan_test.cpp) list(APPEND ATen_MOBILE_TEST_SRCS + ${CMAKE_CURRENT_SOURCE_DIR}/vec256_test.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/cpu_profiling_allocator_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpu_caching_allocator_test.cpp) list(APPEND ATen_VEC256_TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/vec256_test.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/vec256_test_all_types.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/vec256_test_all_types.cpp ) # Caffe2 specific tests diff --git a/aten/src/ATen/test/cpu_caching_allocator_test.cpp b/aten/src/ATen/test/cpu_caching_allocator_test.cpp index 28a9b0476524..cead52f5a7cc 100644 --- a/aten/src/ATen/test/cpu_caching_allocator_test.cpp +++ b/aten/src/ATen/test/cpu_caching_allocator_test.cpp @@ -3,7 +3,7 @@ #include #include -#include +#include TEST(CPUCachingAllocatorTest, check_alloc_free) { c10::CPUCachingAllocator caching_allocator; diff --git a/aten/src/ATen/test/cpu_profiling_allocator_test.cpp b/aten/src/ATen/test/cpu_profiling_allocator_test.cpp new file mode 100644 index 000000000000..d3391425e14b --- /dev/null +++ b/aten/src/ATen/test/cpu_profiling_allocator_test.cpp @@ -0,0 +1,167 @@ +#include + +#include +#include + +at::Tensor run_with_control_flow( + at::Tensor input, + at::Tensor conv_weight, + at::Tensor linear_weight, + bool cond, + std::vector& pointers, + bool record = false, + bool validate = false) { + if (cond) { + input = input * 2; + } + void* input_ptr = input.data_ptr(); + auto conv_out = at::conv2d(input, conv_weight); + void* conv_out_ptr = input.data_ptr(); + auto conv_out_flat = conv_out.view({conv_out.size(0), -1}); + auto output = at::linear(conv_out_flat, linear_weight); + if (record) { + pointers.push_back(input_ptr); + pointers.push_back(conv_out_ptr); + } + if (validate) { + TORCH_CHECK(input_ptr == pointers[0]); + TORCH_CHECK(conv_out_ptr == pointers[1]); + } + return output; +} + +TEST(CPUAllocationPlanTest, with_control_flow) { + at::Tensor a = at::rand({23, 16, 16, 16}); + at::Tensor conv_weight = at::rand({16, 16, 3, 3}); + // output shape + // 23, 16, 14, 14 + // Flattened shape = 23, 3136 + at::Tensor linear_weight = at::rand({32, 3136}); + at::Tensor output; + std::vector pointers; + + auto valid_allocation_plan = [&]() { + c10::AllocationPlan plan; + { + c10::WithProfileAllocationsGuard profile_guard(&plan); + output = run_with_control_flow( + a, conv_weight, linear_weight, true, pointers); + } + 
}; + ASSERT_NO_THROW(valid_allocation_plan()); + + auto validate_allocation_plan = + [&](bool record_mode, bool validation_mode) -> bool { + c10::AllocationPlan plan; + { + c10::WithProfileAllocationsGuard profile_guard(&plan); + output = + run_with_control_flow(a, conv_weight, linear_weight, record_mode, pointers); + } + bool success{true}; + for (uint64_t i = 0; i < 10; ++i) { + bool validation_success; + { + c10::WithValidateAllocationPlanGuard + validation_guard(&plan, &validation_success); + output = run_with_control_flow( + a, conv_weight, linear_weight, validation_mode, pointers); + } + success = success && validation_success; + } + return success; + }; + ASSERT_FALSE(validate_allocation_plan(false, true)); + ASSERT_FALSE(validate_allocation_plan(true, false)); + ASSERT_TRUE(validate_allocation_plan(true, true)); + ASSERT_TRUE(validate_allocation_plan(false, false)); +} + +TEST(CPUAllocationPlanTest, with_profiling_alloc) { + at::Tensor a = at::rand({23, 16, 16, 16}); + at::Tensor conv_weight = at::rand({16, 16, 3, 3}); + // output shape + // 23, 16, 14, 14 + // Flattened shape = 23, 3136 + at::Tensor linear_weight = at::rand({32, 3136}); + at::Tensor output; + std::vector pointers; + + auto valid_allocation_plan = [&]() { + c10::AllocationPlan plan; + { + c10::WithProfileAllocationsGuard profile_guard(&plan); + output = run_with_control_flow( + a, conv_weight, linear_weight, false, pointers); + } + }; + ASSERT_NO_THROW(valid_allocation_plan()); + + auto validate_allocation_plan = + [&](bool record_mode, + bool validation_mode, + bool validate_pointers) { + pointers.clear(); + c10::AllocationPlan plan; + { + c10::WithProfileAllocationsGuard profile_guard(&plan); + output = run_with_control_flow( + a, + conv_weight, + linear_weight, + record_mode, + pointers, + false, + false); + } + c10::CPUProfilingAllocator profiling_allocator; + { + c10::WithProfilingAllocatorGuard + profiling_allocator_guard(&profiling_allocator, &plan); + output = run_with_control_flow( + a, + conv_weight, + linear_weight, + validation_mode, + pointers, + validate_pointers, + false); + } + for (uint64_t i = 0; i < 10; ++i) { + { + c10::WithProfilingAllocatorGuard + profiling_allocator_guard(&profiling_allocator, &plan); + output = run_with_control_flow( + a, + conv_weight, + linear_weight, + validation_mode, + pointers, + false, + validate_pointers); + } + } + }; + // When control flow conditions are same between profiling and evaluation + // profiling allocator should not throw. + ASSERT_NO_THROW(validate_allocation_plan(true, true, false)); + ASSERT_NO_THROW(validate_allocation_plan(false, false, false)); + // Furthermore profiling allocator should return the same pointers + // back for the intermediate tensors + ASSERT_NO_THROW(validate_allocation_plan(true, true, true)); + ASSERT_NO_THROW(validate_allocation_plan(false, false, true)); + + // When control flow conditions are different between profiling and evaluation + // profiling allocator should throw. + ASSERT_THROW(validate_allocation_plan(true, false, false), c10::Error); + ASSERT_THROW(validate_allocation_plan(false, true, false), c10::Error); +} + +int main(int argc, char* argv[]) { +// At the moment caching allocator is only exposed to mobile cpu allocator. 
+#ifdef C10_MOBILE + ::testing::InitGoogleTest(&argc, argv); + at::manual_seed(42); + return RUN_ALL_TESTS(); +#endif /* C10_Mobile */ +} diff --git a/aten/src/ATen/test/math_kernel_test.cpp b/aten/src/ATen/test/math_kernel_test.cpp new file mode 100644 index 000000000000..9a4dfd640c3e --- /dev/null +++ b/aten/src/ATen/test/math_kernel_test.cpp @@ -0,0 +1,40 @@ +#include + +#include + +using namespace at; + +#define ASSERT_ALLCLOSE_TOLERANCES(t1, t2, atol, rtol) \ + ASSERT_TRUE(t1.is_same_size(t2)); \ + ASSERT_TRUE(t1.allclose(t2, atol, rtol)); + +// Ideally we want to test both forward and backward on math kernels but I +// haven't found an easy way to do it. Currently we only test forward here +// and rely on backward tests of each at:: function used in math kernels. +TEST(MathKernelTest, NativeGroupNorm) { + int num_channels = 6; + int N = 2; + int H = 2, W = 2; + int HxW = H * W; + + const auto input = randn({N, num_channels, H, W}); + const auto weight = randn({num_channels}); + const auto bias = randn({num_channels}); + double eps = 1e-05; + for (bool undef_weight: {true, false}) { + for (int num_groups: {3, 6, 1}) { + Tensor undef; + auto out = at::native::native_group_norm( + input, undef_weight ? undef : weight, undef_weight ? undef : bias, + N, num_channels, HxW, num_groups, eps); + auto math_out = at::native::math_group_norm( + input, undef_weight ? undef : weight, undef_weight ? undef : bias, + N, num_channels, HxW, num_groups, eps); + ASSERT_ALLCLOSE_TOLERANCES(std::get<0>(out), std::get<0>(math_out), 1e-4, 1e-6); + ASSERT_ALLCLOSE_TOLERANCES(std::get<1>(out), std::get<1>(math_out), 1e-4, 1e-6); + ASSERT_ALLCLOSE_TOLERANCES(std::get<2>(out), std::get<2>(math_out), 1e-4, 1e-6); + } + } +} + + diff --git a/aten/src/ATen/test/thread_init_test.cpp b/aten/src/ATen/test/thread_init_test.cpp index 0650e9a3e6b4..55df55f3b58c 100644 --- a/aten/src/ATen/test/thread_init_test.cpp +++ b/aten/src/ATen/test/thread_init_test.cpp @@ -1,6 +1,6 @@ #include #include -#include +#include #include diff --git a/aten/src/ATen/test/vulkan_api_test.cpp b/aten/src/ATen/test/vulkan_api_test.cpp index 28c1827485b7..ebf9ffce99d0 100644 --- a/aten/src/ATen/test/vulkan_api_test.cpp +++ b/aten/src/ATen/test/vulkan_api_test.cpp @@ -6,11 +6,6 @@ namespace { -TEST(VulkanAPITest, Context) { - constexpr bool kDebug = true; - ASSERT_NO_THROW(at::native::vulkan::api::Context{kDebug}); -} - } // namespace #endif /* USE_VULKAN_API */ diff --git a/aten/src/TH/CMakeLists.txt b/aten/src/TH/CMakeLists.txt index 6a491991a090..a3ed10126b93 100644 --- a/aten/src/TH/CMakeLists.txt +++ b/aten/src/TH/CMakeLists.txt @@ -65,6 +65,7 @@ install(FILES THGenerateComplexTypes.h THGenerateIntTypes.h THGenerateQUInt8Type.h + THGenerateQUInt4x2Type.h THGenerateQInt8Type.h THGenerateQInt32Type.h THGenerateQTypes.h diff --git a/aten/src/TH/THGenerateQTypes.h b/aten/src/TH/THGenerateQTypes.h index ee958b3a3210..611b990f508f 100644 --- a/aten/src/TH/THGenerateQTypes.h +++ b/aten/src/TH/THGenerateQTypes.h @@ -10,6 +10,7 @@ #include #include #include +#include #ifdef THQLocalGenerateManyTypes #undef THQLocalGenerateManyTypes diff --git a/aten/src/TH/THGenerateQUInt4x2Type.h b/aten/src/TH/THGenerateQUInt4x2Type.h new file mode 100644 index 000000000000..4ecea4514359 --- /dev/null +++ b/aten/src/TH/THGenerateQUInt4x2Type.h @@ -0,0 +1,24 @@ +#ifndef TH_GENERIC_FILE +#error "You must define TH_GENERIC_FILE before including THGenerateQUInt4x2Type.h" +#endif + +#define quantized_t c10::quint4x2 +#define scalar_t uint8_t +#define Real QUInt4x2 
+#define RealUnderlying Byte +#define THQUANTIZED +#define THQUINT8 +#define TH_REAL_IS_BYTE +#line 1 TH_GENERIC_FILE +#include TH_GENERIC_FILE +#undef scalar_t +#undef quantized_t +#undef Real +#undef RealUnderlying +#undef TH_REAL_IS_BYTE +#undef THQUINT8 +#undef THQUANTIZED + +#ifndef THGenerateManyTypes +#undef TH_GENERIC_FILE +#endif diff --git a/aten/src/TH/generic/THBlas.cpp b/aten/src/TH/generic/THBlas.cpp index fd9fe5e6c233..64bc8106fbb3 100644 --- a/aten/src/TH/generic/THBlas.cpp +++ b/aten/src/TH/generic/THBlas.cpp @@ -14,8 +14,6 @@ TH_EXTERNC void dcopy_(int *n, double *x, int *incx, double *y, int *incy); TH_EXTERNC void scopy_(int *n, float *x, int *incx, float *y, int *incy); TH_EXTERNC void daxpy_(int *n, double *a, double *x, int *incx, double *y, int *incy); TH_EXTERNC void saxpy_(int *n, float *a, float *x, int *incx, float *y, int *incy); -TH_EXTERNC void dger_(int *m, int *n, double *alpha, double *x, int *incx, double *y, int *incy, double *a, int *lda); -TH_EXTERNC void sger_(int *m, int *n, float *alpha, float *x, int *incx, float *y, int *incy, float *a, int *lda); void THBlas_(swap)(int64_t n, scalar_t *x, int64_t incx, scalar_t *y, int64_t incy) { @@ -111,51 +109,4 @@ void THBlas_(axpy)(int64_t n, scalar_t a, scalar_t *x, int64_t incx, scalar_t *y } } -void THBlas_(ger)( - int64_t m, - int64_t n, - scalar_t alpha, - scalar_t *x, - int64_t incx, - scalar_t *y, - int64_t incy, - scalar_t *a, - int64_t lda) -{ - if(n == 1) - lda = m; - -#if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)) - if( (m <= INT_MAX) && (n <= INT_MAX) && (lda <= INT_MAX) && - (incx > 0) && (incx <= INT_MAX) && - (incy > 0) && (incy <= INT_MAX) ) - { - THArgCheck(lda >= THMax(1, m), 9, - "lda should be at least max(1, m=%d), but have %d", m, lda); - int i_m = (int)m; - int i_n = (int)n; - int i_lda = (int)lda; - int i_incx = (int)incx; - int i_incy = (int)incy; - -#if defined(TH_REAL_IS_DOUBLE) - dger_(&i_m, &i_n, &alpha, x, &i_incx, y, &i_incy, a, &i_lda); -#else - sger_(&i_m, &i_n, &alpha, x, &i_incx, y, &i_incy, a, &i_lda); -#endif - return; - } -#endif - { - int64_t i, j; - for(j = 0; j < n; j++) - { - scalar_t *column_ = a+j*lda; - scalar_t z = alpha*y[j*incy]; - for(i = 0; i < m; i++) - column_[i] += z*x[i*incx] ; - } - } -} - #endif diff --git a/aten/src/TH/generic/THBlas.h b/aten/src/TH/generic/THBlas.h index 4d3facea4d06..a70d99969d31 100644 --- a/aten/src/TH/generic/THBlas.h +++ b/aten/src/TH/generic/THBlas.h @@ -7,7 +7,4 @@ TH_API void THBlas_(swap)(int64_t n, scalar_t *x, int64_t incx, scalar_t *y, int TH_API void THBlas_(copy)(int64_t n, scalar_t *x, int64_t incx, scalar_t *y, int64_t incy); TH_API void THBlas_(axpy)(int64_t n, scalar_t a, scalar_t *x, int64_t incx, scalar_t *y, int64_t incy); -/* Level 2 */ -TH_API void THBlas_(ger)(int64_t m, int64_t n, scalar_t alpha, scalar_t *x, int64_t incx, scalar_t *y, int64_t incy, scalar_t *a, int64_t lda); - #endif diff --git a/aten/src/TH/generic/THStorage.h b/aten/src/TH/generic/THStorage.h index cd419c695ba5..a41991c469c7 100644 --- a/aten/src/TH/generic/THStorage.h +++ b/aten/src/TH/generic/THStorage.h @@ -38,6 +38,7 @@ #define THQUInt8Storage THStorage #define THQInt8Storage THStorage #define THQInt32Storage THStorage +#define THQUInt4x2Storage THStorage #define THComplexFloatStorage THStorage #define THComplexDoubleStorage THStorage diff --git a/aten/src/TH/generic/THTensorEvenMoreMath.cpp b/aten/src/TH/generic/THTensorEvenMoreMath.cpp index 764220c24673..6a79f3e14c14 100644 --- 
a/aten/src/TH/generic/THTensorEvenMoreMath.cpp +++ b/aten/src/TH/generic/THTensorEvenMoreMath.cpp @@ -216,50 +216,6 @@ static inline int64_t THTensor_(wrapLinearIndex)(int64_t linearIndex, int64_t nu return linearIndex < 0 ? linearIndex + numel : linearIndex; } -void THTensor_(take)(THTensor *r_, THTensor *src, THLongTensor *index) -{ - THTensor_(resizeNd)(r_, index->dim(), THTensor_getSizePtr(index), NULL); - THTensor* dst = THTensor_(newContiguous)(r_); - - index = THLongTensor_newContiguous(index); - int64_t* index_data = THLongTensor_data(index); - ptrdiff_t srcElements = THTensor_(nElement)(src); - scalar_t* src_data = src->data(); - scalar_t* dst_data = dst->data(); - ptrdiff_t nIndices = THLongTensor_nElement(index); - int isContiguous = THTensor_(isContiguous)(src); - - // Exceptions must not be thrown across parallel sections, so we - // record the position of the invalid index and throw the exception after the - // loop. - std::atomic invalidIdxPos(-1); - - at::parallel_for(0, nIndices, TH_OMP_OVERHEAD_THRESHOLD, - [&](int64_t start, int64_t end) { - for (auto i = start; i < end; i++) { - int64_t idx = index_data[i]; - if (idx < srcElements && idx >= -srcElements) { - idx = THTensor_(wrapLinearIndex)(idx, srcElements); - if (isContiguous) { - dst_data[i] = src_data[idx]; - } else { - dst_data[i] = src_data[THTensor_(dataOffset)(src, idx)]; - } - } else { - int64_t tmp = -1; - invalidIdxPos.compare_exchange_strong(tmp, i); - } - } - }); - - if (invalidIdxPos >= 0) { - THTensor_(checkLinearIndex)(index_data[invalidIdxPos], srcElements); - } - - THLongTensor_free(index); - THTensor_(freeCopyTo)(dst, r_); -} - void THTensor_(put)(THTensor *tensor, THLongTensor *index, THTensor *src, int accumulate) { THArgCheck(THLongTensor_nElement(index) == THTensor_(nElement)(src), 3, diff --git a/aten/src/TH/generic/THTensorMath.cpp b/aten/src/TH/generic/THTensorMath.cpp index 2450d58a7b57..eb3b593ac736 100644 --- a/aten/src/TH/generic/THTensorMath.cpp +++ b/aten/src/TH/generic/THTensorMath.cpp @@ -22,76 +22,4 @@ // sense (rather than just having cut the file down the middle, which is // what I did when I split these up originally). 
- -#if !defined(TH_REAL_IS_BOOL) /* non bool only part */ - -void THTensor_(addr)(THTensor *r_, THTensor *t, THTensor *vec1, THTensor *vec2, scalar_t beta, scalar_t alpha) -{ - if( (THTensor_nDimension(vec1) != 1) || (THTensor_nDimension(vec2) != 1) ) - THError("vector and vector expected, got %dD, %dD tensors", - THTensor_nDimension(vec1), THTensor_nDimension(vec2)); - - if(t->dim() != 2) - THError("expected matrix, got %dD tensor for t", t->dim()); - - auto vec1_size = THTensor_(size)(vec1, 0); - auto vec2_size = THTensor_(size)(vec2, 0); - auto vec1_stride = THTensor_(stride)(vec1, 0); - auto vec2_stride = THTensor_(stride)(vec2, 0); - - if( (t->size(0) != vec1_size) || (t->size(1) != vec2_size) ) { - THDescBuff bt = THTensor_(sizeDesc)(t); - THDescBuff bv1 = THTensor_(sizeDesc)(vec1); - THDescBuff bv2 = THTensor_(sizeDesc)(vec2); - THError("size mismatch, t: %s, vec1: %s, vec2: %s", bt.str, bv1.str, bv2.str); - } - - if(r_ != t) - { - THTensor_(resizeAs)(r_, t); - at::Tensor r__wrap = THTensor_wrap(r_); - at::Tensor t_wrap = THTensor_wrap(t); - at::native::copy_(r__wrap, t_wrap); - } - - if(beta == 0) { - THTensor_wrap(r_).zero_(); - } - else if(beta != 1) - THTensor_(mul)(r_, r_, beta); - - // n == 1 || lda >= max(1, m) - #define LDA_COND(M, N, LDA) ((N) == 1 || (LDA) >= THMax(1, (M))) - - if(r_->stride(0) == 1 && LDA_COND(vec1_size, vec2_size, r_->stride(1))) - { - THBlas_(ger)(vec1_size, vec2_size, - alpha, vec1->data(), vec1_stride, - vec2->data(), vec2_stride, - r_->data(), r_->stride(1)); - } - else if(r_->stride(1) == 1 && LDA_COND(vec2_size, vec1_size, r_->stride(0))) - { - THBlas_(ger)(vec2_size, vec1_size, - alpha, vec2->data(), vec2_stride, - vec1->data(), vec1_stride, - r_->data(), r_->stride(0)); - } - else - { - THTensor *cr = THTensor_(newClone)(r_); - - THBlas_(ger)(vec2_size, vec1_size, - alpha, vec2->data(), vec2_stride, - vec1->data(), vec1_stride, - cr->data(), cr->stride(0)); - - THTensor_(freeCopyTo)(cr, r_); - } - - #undef LDA_COND -} - -#endif /* !defined(TH_REAL_IS_BOOL) */ - #endif /* TH_GENERIC_FILE */ diff --git a/aten/src/TH/generic/THTensorMath.h b/aten/src/TH/generic/THTensorMath.h index 18ccaeb6eb80..1d0daf1206de 100644 --- a/aten/src/TH/generic/THTensorMath.h +++ b/aten/src/TH/generic/THTensorMath.h @@ -14,8 +14,6 @@ TH_API void THTensor_(maskedCopyBool)(THTensor *tensor, THBoolTensor *mask, THTe TH_API ptrdiff_t THTensor_(numel)(THTensor *t); -TH_API void THTensor_(addr)(THTensor *r_, THTensor *t, THTensor *vec1, THTensor *vec2, scalar_t beta, scalar_t alpha); - #if !defined(TH_REAL_IS_BOOL) TH_API void THTensor_(mul)(THTensor *r_, THTensor *t, scalar_t value); #endif diff --git a/aten/src/THC/THCBlas.cu b/aten/src/THC/THCBlas.cu index fe906ce66fa3..859d904a582b 100644 --- a/aten/src/THC/THCBlas.cu +++ b/aten/src/THC/THCBlas.cu @@ -107,30 +107,9 @@ void adjustLdLevel3(char transa, char transb, int64_t m, int64_t n, int64_t k, i } -// Check https://github.com/pytorch/pytorch/issues/22078 -// for information about the bug. We don't know the exact conditions that trigger it, -// but using Sgemm or Hgemm on Maxwell or Pascal seems to be a -// necessary condition. 
-static void checkCuda90Bug(int i_m, int i_n, int i_k) -{ -#if CUDA_VERSION < 9200 && CUDA_VERSION >= 9000 - static std::once_flag alreadyWarned; - const int LIMIT = 1 << 21; - if (i_m > LIMIT || i_n > LIMIT || i_k > LIMIT) { - cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); - if (prop->major == 5 || prop->major == 6) { - std::call_once(alreadyWarned, []() { - TORCH_WARN("Matrix multiplication for dimensions larger than 2^21 has known bugs on your combination of CUDA version and device type. Please consider upgrading to CUDA 9.2 or later."); - }); - } - } -#endif -} - /* Level 3 */ void THCudaBlas_Sgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, float alpha, float *a, int64_t lda, float *b, int64_t ldb, float beta, float *c, int64_t ldc) { - checkCuda90Bug((int)m, (int)n, (int)k); at::cuda::blas::gemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } @@ -141,23 +120,19 @@ void THCudaBlas_Sgemm(THCState *state, char transa, char transb, int64_t m, int6 void THCudaBlas_Hgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, at::Half alpha, at::Half *a, int64_t lda, at::Half *b, int64_t ldb, at::Half beta, at::Half *c, int64_t ldc) { - checkCuda90Bug((int)m, (int)n, (int)k); at::cuda::blas::gemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -#ifdef __HIP_PLATFORM_HCC__ void THCudaBlas_Bgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, at::BFloat16 alpha, at::BFloat16 *a, int64_t lda, at::BFloat16 *b, int64_t ldb, at::BFloat16 beta, at::BFloat16 *c, int64_t ldc) { at::cuda::blas::gemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -#endif void THCudaBlas_Dgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, double alpha, double *a, int64_t lda, double *b, int64_t ldb, double beta, double *c, int64_t ldc) { at::cuda::blas::gemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -#if CUDA_VERSION >= 9010 || defined __HIP_PLATFORM_HCC__ void THCudaBlas_HgemmStridedBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, at::Half alpha, const at::Half *a, int64_t lda, int64_t strideA, const at::Half *b, int64_t ldb, int64_t strideB, at::Half beta, at::Half *c, int64_t ldc, int64_t strideC, int64_t batchCount) @@ -205,13 +180,12 @@ void THCudaBlas_HgemmStridedBatched(THCState *state, char transa, char transb, i #endif // CUDA_VERSION < 11000 #endif // __HIP_PLATFORM_HCC__ } -#endif // CUDA_VERSION or __HIP_PLATFORM_HCC__ -#ifdef __HIP_PLATFORM_HCC__ void THCudaBlas_BgemmStridedBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, at::BFloat16 alpha, const at::BFloat16 *a, int64_t lda, int64_t strideA, const at::BFloat16 *b, int64_t ldb, int64_t strideB, at::BFloat16 beta, at::BFloat16 *c, int64_t ldc, int64_t strideC, int64_t batchCount) { + at::globalContext().alertCuBLASConfigNotDeterministic(); if( (m >= INT_MAX) || (n >= INT_MAX) || (k >= INT_MAX) || (lda >= INT_MAX) || (ldb >= INT_MAX) || (ldc >= INT_MAX) || (batchCount >= INT_MAX) ) { @@ -219,6 +193,7 @@ void THCudaBlas_BgemmStridedBatched(THCState *state, char transa, char transb, i "with the bound [val] <= %d", INT_MAX); } + adjustLdLevel3(transa, transb, m, n, k, &lda, &ldb, &ldc); cublasOperation_t opa = convertTransToCublasOperation(transa); cublasOperation_t opb = convertTransToCublasOperation(transb); @@ -226,15 +201,30 @@ void THCudaBlas_BgemmStridedBatched(THCState *state, char 
transa, char transb, i cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); float fAlpha = alpha; float fBeta = beta; + +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 + cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); + if (prop->major < 8) { + TORCH_CHECK(false, "BFloat16 gemm in CUDA requires Ampere or later GPU"); + } + THCublasCheck(cublasGemmStridedBatchedEx(handle, + opa, opb, (int)m, (int)n, (int)k, + (void*)&fAlpha, a, CUDA_R_16BF, (int)lda, strideA, + b, CUDA_R_16BF, (int)ldb, strideB, + (void*)&fBeta, c, CUDA_R_16BF, (int)ldc, strideC, + (int)batchCount, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); +#elif defined(__HIP_PLATFORM_HCC__) THCublasCheck(rocblas_gemm_strided_batched_ex(handle, opa, opb, (int)m, (int)n, (int)k, - (void*)&fAlpha, a, rocblas_datatype_bf16_r, (int)lda, strideA, - b, rocblas_datatype_bf16_r, (int)ldb, strideB, - (void*)&fBeta, c, rocblas_datatype_bf16_r, (int)ldc, strideC, - c, rocblas_datatype_bf16_r, (int)ldc, strideC, - (int) batchCount, rocblas_datatype_f32_r, rocblas_gemm_algo_standard, - 0, 0, NULL, NULL)); + (void*)&fAlpha, a, rocblas_datatype_bf16_r, (int)lda, strideA, + b, rocblas_datatype_bf16_r, (int)ldb, strideB, + (void*)&fBeta, c, rocblas_datatype_bf16_r, (int)ldc, strideC, + c, rocblas_datatype_bf16_r, (int)ldc, strideC, + (int) batchCount, rocblas_datatype_f32_r, rocblas_gemm_algo_standard, + 0, 0, NULL, NULL)); +#else + TORCH_CHECK(false, "THCudaBlas_BgemmStridedBatched is only available on CUDA_VERSION >= 11"); +#endif // defined(CUDA_VERSION) && CUDA_VERSION >= 11000 } -#endif // __HIP_PLATFORM_HCC__ void THCudaBlas_SgemmBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, float alpha, const float *a[], int64_t lda, const float *b[], int64_t ldb, @@ -270,7 +260,6 @@ void THCudaBlas_SgemmBatched(THCState *state, char transa, char transb, int64_t #endif } -#if CUDA_VERSION >= 8000 || defined __HIP_PLATFORM_HCC__ void THCudaBlas_SgemmStridedBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, float alpha, const float *a, int64_t lda, int64_t strideA, const float *b, int64_t ldb, int64_t strideB, float beta, float *c, int64_t ldc, int64_t strideC, int64_t batchCount) @@ -294,7 +283,6 @@ void THCudaBlas_SgemmStridedBatched(THCState *state, char transa, char transb, i &alpha, a, (int)lda, strideA, b, (int)ldb, strideB, &beta, c, (int)ldc, strideC, (int)batchCount)); } -#endif void THCudaBlas_DgemmBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, double alpha, const double *a[], int64_t lda, const double *b[], int64_t ldb, @@ -330,7 +318,6 @@ void THCudaBlas_DgemmBatched(THCState *state, char transa, char transb, int64_t #endif } -#if CUDA_VERSION >= 8000 || defined __HIP_PLATFORM_HCC__ void THCudaBlas_DgemmStridedBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, double alpha, const double *a, int64_t lda, int64_t strideA, const double *b, int64_t ldb, int64_t strideB, double beta, double *c, int64_t ldc, int64_t strideC, int64_t batchCount) @@ -353,5 +340,3 @@ void THCudaBlas_DgemmStridedBatched(THCState *state, char transa, char transb, i &alpha, a, (int)lda, strideA, b, (int)ldb, strideB, &beta, c, (int)ldc, strideC, (int)batchCount)); } -#endif - diff --git a/aten/src/THC/THCBlas.h b/aten/src/THC/THCBlas.h index cff3180a974a..4078363eb888 100644 --- a/aten/src/THC/THCBlas.h +++ b/aten/src/THC/THCBlas.h @@ -14,9 +14,8 @@ THC_API void THCudaBlas_Sgemm(THCState *state, char transa, char 
transb, int64_t THC_API void THCudaBlas_Dgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, double alpha, double *a, int64_t lda, double *b, int64_t ldb, double beta, double *c, int64_t ldc); THC_API void THCudaBlas_Hgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, THHalf alpha, THHalf *a, int64_t lda, THHalf *b, int64_t ldb, THHalf beta, THHalf *c, int64_t ldc); -#ifdef __HIP_PLATFORM_HCC__ + THC_API void THCudaBlas_Bgemm(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, at::BFloat16 alpha, at::BFloat16 *a, int64_t lda, at::BFloat16 *b, int64_t ldb, at::BFloat16 beta, at::BFloat16 *c, int64_t ldc); -#endif THC_API void THCudaBlas_SgemmBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, float alpha, const float *a[], int64_t lda, const float *b[], int64_t ldb, @@ -24,25 +23,19 @@ THC_API void THCudaBlas_SgemmBatched(THCState *state, char transa, char transb, THC_API void THCudaBlas_DgemmBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, double alpha, const double *a[], int64_t lda, const double *b[], int64_t ldb, double beta, double *c[], int64_t ldc, int64_t batchCount); -#if CUDA_VERSION >= 8000 || defined __HIP_PLATFORM_HCC__ THC_API void THCudaBlas_SgemmStridedBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, float alpha, const float *a, int64_t lda, int64_t strideA, const float *b, int64_t ldb, int64_t strideB, float beta, float *c, int64_t ldc, int64_t strideC, int64_t batchCount); THC_API void THCudaBlas_DgemmStridedBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, double alpha, const double *a, int64_t lda, int64_t strideA, const double *b, int64_t ldb, int64_t strideB, double beta, double *c, int64_t ldc, int64_t strideC, int64_t batchCount); -#endif -#if CUDA_VERSION >= 9010 || defined(__HIP_PLATFORM_HCC__) void THCudaBlas_HgemmStridedBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, THHalf alpha, const THHalf *a, int64_t lda, int64_t strideA, const THHalf *b, int64_t ldb, int64_t strideB, THHalf beta, THHalf *c, int64_t ldc, int64_t strideC, int64_t batchCount); -#endif -#ifdef __HIP_PLATFORM_HCC__ void THCudaBlas_BgemmStridedBatched(THCState *state, char transa, char transb, int64_t m, int64_t n, int64_t k, at::BFloat16 alpha, const at::BFloat16 *a, int64_t lda, int64_t strideA, const at::BFloat16 *b, int64_t ldb, int64_t strideB, at::BFloat16 beta, at::BFloat16 *c, int64_t ldc, int64_t strideC, int64_t batchCount); -#endif #endif diff --git a/aten/src/THC/THCDeviceUtils.cuh b/aten/src/THC/THCDeviceUtils.cuh index 171488d91214..5bd751a4921f 100644 --- a/aten/src/THC/THCDeviceUtils.cuh +++ b/aten/src/THC/THCDeviceUtils.cuh @@ -7,6 +7,8 @@ #include #endif +#include + /* The largest consecutive integer representable in float32 (2^24) */ #define FLOAT32_MAX_CONSECUTIVE_INT 16777216.0f @@ -32,7 +34,7 @@ __host__ __device__ __forceinline__ T THCRoundUp(T a, T b) { */ template __device__ __forceinline__ T doLdg(const T* p) { -#if __CUDA_ARCH__ >= 350 +#ifndef __HIP_PLATFORM_HCC__ return __ldg(p); #else return *p; diff --git a/aten/src/THC/THCTensorIndex.cu b/aten/src/THC/THCTensorIndex.cu index 0287f31f658e..dfd3a510e6e1 100644 --- a/aten/src/THC/THCTensorIndex.cu +++ b/aten/src/THC/THCTensorIndex.cu @@ -218,20 +218,6 @@ struct WrapIndexOp { int64_t size; }; -template -struct TensorTakeOp { - TensorTakeOp(TensorInfo info, IndexType 
numel, int64_t*, int64_t*) - : info(info), numel(numel) {} - - __device__ __forceinline__ void operator()(T* out, int64_t* index) { - auto offset = indexToOffset(info, *index, numel); - *out = info.data[offset]; - } - - const TensorInfo info; - IndexType numel; -}; - template struct TensorPutOp { TensorPutOp(TensorInfo info, IndexType numel, int64_t*, int64_t*) diff --git a/aten/src/THC/generic/THCTensorIndex.cu b/aten/src/THC/generic/THCTensorIndex.cu index a6c621c8ef15..07303fa47096 100644 --- a/aten/src/THC/generic/THCTensorIndex.cu +++ b/aten/src/THC/generic/THCTensorIndex.cu @@ -220,21 +220,6 @@ void THCTensor_(indexCopy)(THCState *state, THCTensor *dst, int dim, THCudaLongT #undef LARGE_INDEX } -void THCTensor_(take)(THCState *state, THCTensor *dst, THCTensor *src, THCudaLongTensor *index) -{ - THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, dst, src)); - THCAssertSameGPU(THCudaLongTensor_checkGPU(state, 1, index)); - - THArgCheck(THCTensor_(nDimensionLegacyNoScalars)(state, src) <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); - THArgCheck(THCTensor_(nDimensionLegacyNoScalars)(state, dst) <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); - THArgCheck(THCudaLongTensor_nDimensionLegacyNoScalars(state, index) <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); - THArgCheck(!(THCTensor_(numel)(state, src) == 0 && THCudaLongTensor_numel(state, index) != 0), 2, - "tried to take from an empty tensor"); - - THCTensor_(resizeNd)(state, dst, index->dim(), THTensor_getSizePtr(index), NULL); - dispatchTakePut(state, src, dst, index); -} - static void THCTensor_(sort_indices)(THCState *state, THCudaLongTensor *index, THCTensor *src) { THCThrustAllocator thrustAlloc(state); diff --git a/aten/src/THC/generic/THCTensorMathBlas.cu b/aten/src/THC/generic/THCTensorMathBlas.cu index 3158e0e267ed..a5d159a9cace 100644 --- a/aten/src/THC/generic/THCTensorMathBlas.cu +++ b/aten/src/THC/generic/THCTensorMathBlas.cu @@ -281,7 +281,7 @@ void THCTensor_(baddbmm)(THCState *state, THCTensor *result, THCTensor *t, #endif //CUDA_VERSION #elif defined(THC_REAL_IS_BFLOAT16) -#if defined(__HIP_PLATFORM_HCC__) +#if defined(__HIP_PLATFORM_HCC__) || defined(CUDA_VERSION) && CUDA_VERSION >= 11000 THCudaBlas_BgemmStridedBatched( state, transpose_batch1, @@ -310,15 +310,13 @@ void THCTensor_(baddbmm)(THCState *state, THCTensor *result, THCTensor *t, THCTensor_(freeCopyTo)(state, result_, result); } -#if defined(THC_REAL_IS_BFLOAT16) && !defined(__HIP_PLATFORM_HCC__) +#if defined(THC_REAL_IS_BFLOAT16) && !(defined(__HIP_PLATFORM_HCC__) || defined(CUDA_VERSION) && CUDA_VERSION >= 11000) // To avoid "variable was set but never used" warning [&transpose_batch1, &transpose_batch2, &lda, &ldb, &ldc]{}(); TORCH_CHECK(false, "BgemmStridedBatched is not supported with at::BFloat16 type"); #endif } -#if !defined(THC_REAL_IS_BFLOAT16) || defined(__HIP_PLATFORM_HCC__) at::namedinference::propagate_names_if_nonempty(result, maybe_outnames); -#endif #else ERROR_ONLY_FP_TYPES("baddbmm"); diff --git a/aten/src/THC/generic/THCTensorTopK.cu b/aten/src/THC/generic/THCTensorTopK.cu index a50f5e8f51ac..357b3f2e22f3 100644 --- a/aten/src/THC/generic/THCTensorTopK.cu +++ b/aten/src/THC/generic/THCTensorTopK.cu @@ -9,9 +9,6 @@ void THCTensor_(topk)(THCState* state, THCudaLongTensor *indices, THCTensor *input_, int64_t k, int dim, int dir, int sorted) { - #if defined(THC_REAL_IS_BFLOAT16) && !defined(__HIP_PLATFORM_HCC__) - TORCH_CHECK(false, "topk not suppported with BFloat16"); - #else THAssert(topK != NULL && indices != NULL && input_ != NULL); 
THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, topK, indices, input_)); dim = at::maybe_wrap_dim(dim, input_); @@ -186,7 +183,6 @@ void THCTensor_(topk)(THCState* state, THCudaLongTensor_free(state, input); THCudaCheck(cudaGetLastError()); - #endif // THC_REAL_IS_BFLOAT16 && !__HIP_PLATFORM_HCC__ } #endif // THC_GENERIC_FILE diff --git a/aten/src/THCUNN/generic/SpatialConvolutionMM.cu b/aten/src/THCUNN/generic/SpatialConvolutionMM.cu index 535c43636af0..44616bf4cf60 100644 --- a/aten/src/THCUNN/generic/SpatialConvolutionMM.cu +++ b/aten/src/THCUNN/generic/SpatialConvolutionMM.cu @@ -114,9 +114,6 @@ void THNN_(SpatialConvolutionMM_updateOutput)( int kW, int kH, int dW, int dH, int padW, int padH) { - #if defined(THC_REAL_IS_BFLOAT16) && !defined(__HIP_PLATFORM_HCC__) - TORCH_CHECK(false, "SpatialConvolutionMM_updateOutput not suppported with BFloat16"); - #else THCUNN_assertSameGPU(state, 5, input, output, weight, columns, ones); if (bias) { THCUNN_assertSameGPU(state, 2, weight, bias); @@ -267,7 +264,6 @@ void THNN_(SpatialConvolutionMM_updateOutput)( THCTensor_(free)(state, input); THCTensor_(free)(state, weight); - #endif // THC_REAL_IS_BFLOAT16 && !__HIP_PLATFORM_HCC__ } void THNN_(SpatialConvolutionMM_updateGradInput)( @@ -281,10 +277,6 @@ void THNN_(SpatialConvolutionMM_updateGradInput)( int kW, int kH, int dW, int dH, int padW, int padH) { - - #if defined(THC_REAL_IS_BFLOAT16) && !defined(__HIP_PLATFORM_HCC__) - TORCH_CHECK(false, "SpatialConvolutionMM_updateGradInput not suppported with BFloat16"); - #else THCUNN_assertSameGPU(state, 5, input, gradOutput, weight, gradColumns, gradInput); weight = THNN_(newViewWeightMM2d)(state, weight); @@ -380,7 +372,6 @@ void THNN_(SpatialConvolutionMM_updateGradInput)( THCTensor_(free)(state, input); THCTensor_(free)(state, gradOutput); - #endif // THC_REAL_IS_BFLOAT16 && !__HIP_PLATFORM_HCC__ } void THNN_(SpatialConvolutionMM_accGradParameters)( @@ -395,10 +386,6 @@ void THNN_(SpatialConvolutionMM_accGradParameters)( int dW, int dH, int padW, int padH, accreal scale_) { - - #if defined(THC_REAL_IS_BFLOAT16) && !defined(__HIP_PLATFORM_HCC__) - TORCH_CHECK(false, "SpatialConvolutionMM_updateGradParameters not suppported with BFloat16"); - #else scalar_t scale = ScalarConvert::to(scale_); THCUNN_assertSameGPU(state, 5, input, gradOutput, gradWeight, gradBias, columns, ones); if (gradWeight) { @@ -554,7 +541,6 @@ void THNN_(SpatialConvolutionMM_accGradParameters)( THCTensor_(free)(state, input); THCTensor_(free)(state, gradOutput); - #endif // THC_REAL_IS_BFLOAT16 && !__HIP_PLATFORM_HCC__ } #endif diff --git a/aten/src/THCUNN/generic/SpatialDepthwiseConvolution.cu b/aten/src/THCUNN/generic/SpatialDepthwiseConvolution.cu index 18d8da647d15..53eff031a822 100644 --- a/aten/src/THCUNN/generic/SpatialDepthwiseConvolution.cu +++ b/aten/src/THCUNN/generic/SpatialDepthwiseConvolution.cu @@ -13,9 +13,6 @@ void THNN_(SpatialDepthwiseConvolution_updateOutput)( int padW, int padH, int dilationW, int dilationH) { - #if defined(THC_REAL_IS_BFLOAT16) && !defined(__HIP_PLATFORM_HCC__) - TORCH_CHECK(false, "SpatialDepthwiseConvolution_updateOutput not suppported with BFloat16"); - #else THCUNN_assertSameGPU(state, 3, input, output, weight); // Only handle 4D Input Tensors for now @@ -94,7 +91,6 @@ void THNN_(SpatialDepthwiseConvolution_updateOutput)( THCTensor_(free)(state, input); THCTensor_(free)(state, weight); if (bias) THCTensor_(free)(state, bias); - #endif // THC_REAL_IS_BFLOAT16 && !__HIP_PLATFORM_HCC__ } void 
THNN_(SpatialDepthwiseConvolution_updateGradInput)( @@ -108,9 +104,6 @@ void THNN_(SpatialDepthwiseConvolution_updateGradInput)( int padW, int padH, int dilationW, int dilationH) { - #if defined(THC_REAL_IS_BFLOAT16) && !defined(__HIP_PLATFORM_HCC__) - TORCH_CHECK(false, "SpatialDepthwiseConvolution_updateGradInput not suppported with BFloat16"); - #else THCUNN_assertSameGPU(state, 3, gradOutput, gradInput, weight); // Only handle 4D Input Tensors for now @@ -203,7 +196,6 @@ void THNN_(SpatialDepthwiseConvolution_updateGradInput)( THCTensor_(free)(state, weight); THCTensor_(free)(state, gradOutput); - #endif // THC_REAL_IS_BFLOAT16 && !__HIP_PLATFORM_HCC__ } void THNN_(SpatialDepthwiseConvolution_accGradParameters)( @@ -216,9 +208,6 @@ void THNN_(SpatialDepthwiseConvolution_accGradParameters)( int padW, int padH, int dilationW, int dilationH) { - #if defined(THC_REAL_IS_BFLOAT16) && !defined(__HIP_PLATFORM_HCC__) - TORCH_CHECK(false, "SpatialDepthwiseConvolution_accGradParameters not suppported with BFloat16"); - #else THCUNN_assertSameGPU(state, 3, input, gradOutput, gradWeight); // Only handle 4D Input Tensors for now @@ -271,7 +260,6 @@ void THNN_(SpatialDepthwiseConvolution_accGradParameters)( THCudaCheck(cudaGetLastError()); THCTensor_(free)(state, gradOutput); - #endif // THC_REAL_IS_BFLOAT16 && !__HIP_PLATFORM_HCC__ } #endif diff --git a/benchmarks/fastrnns/fuser.py b/benchmarks/fastrnns/fuser.py index 620c19a13cf1..5b85f87291dc 100644 --- a/benchmarks/fastrnns/fuser.py +++ b/benchmarks/fastrnns/fuser.py @@ -1,12 +1,12 @@ import torch def set_fuser(fuser_name, executor_name): - assert fuser_name in ['te', 'old', 'none'] + assert fuser_name in ['te', 'old', 'none', 'default'] if fuser_name == 'te': torch._C._jit_set_profiling_executor(True) torch._C._jit_set_profiling_mode(True) torch._C._jit_set_bailout_depth(20) - torch._C._jit_set_num_profiled_runs(2) + torch._C._jit_set_num_profiled_runs(1) torch._C._jit_override_can_fuse_on_cpu(False) torch._C._jit_override_can_fuse_on_gpu(True) torch._C._jit_set_texpr_fuser_enabled(True) @@ -21,6 +21,8 @@ def set_fuser(fuser_name, executor_name): torch._C._jit_override_can_fuse_on_gpu(False) torch._C._jit_override_can_fuse_on_cpu(False) torch._C._jit_set_texpr_fuser_enabled(False) + elif fuser_name == 'default': + pass # --executor overrides settings of --fuser if executor_name == 'profiling': @@ -34,3 +36,5 @@ def set_fuser(fuser_name, executor_name): elif executor_name == 'legacy': torch._C._jit_set_profiling_executor(False) torch._C._jit_set_profiling_mode(False) + elif executor_name == 'default': + pass diff --git a/benchmarks/operator_benchmark/benchmark_pytorch.py b/benchmarks/operator_benchmark/benchmark_pytorch.py index 4d927d73bfc0..1c5a905f2b75 100644 --- a/benchmarks/operator_benchmark/benchmark_pytorch.py +++ b/benchmarks/operator_benchmark/benchmark_pytorch.py @@ -1,7 +1,7 @@ import time import json import torch -import torch.utils.cpp_extension as cpp_extension # noqa +import cpp_extension # noqa """PyTorch performance microbenchmarks. 
@@ -149,14 +149,14 @@ def run_forward(self, num_runs, print_per_iter, cuda_sync): for _ in range(num_runs): start_time = time.time() self.output = self.op_bench.forward() - if cuda_sync: + if cuda_sync: torch.cuda.synchronize(torch.cuda.current_device()) end_time = time.time() self.time_series.append((end_time - start_time) * 1e3) else: for _ in range(num_runs): self.output = self.op_bench.forward() - if cuda_sync: + if cuda_sync: torch.cuda.synchronize(torch.cuda.current_device()) def _output_mean(self): diff --git a/benchmarks/operator_benchmark/pt/unary_test.py b/benchmarks/operator_benchmark/pt/unary_test.py index 4a8a7865330b..1391283b1e10 100644 --- a/benchmarks/operator_benchmark/pt/unary_test.py +++ b/benchmarks/operator_benchmark/pt/unary_test.py @@ -91,6 +91,7 @@ def forward(self): ['sigmoid', torch.sigmoid], ['sigmoid_', torch.sigmoid_], ['sign', torch.sign], + ['sgn', torch.sgn], ['sin', torch.sin], ['sin_', torch.sin_], ['sinh', torch.sinh], diff --git a/benchmarks/profiler_benchmark/profiler_bench.py b/benchmarks/profiler_benchmark/profiler_bench.py index 616d1078ee7d..6b187b03522e 100644 --- a/benchmarks/profiler_benchmark/profiler_bench.py +++ b/benchmarks/profiler_benchmark/profiler_bench.py @@ -1,33 +1,23 @@ -from functools import partial -import itertools +import argparse import statistics +import sys import timeit import torch -TENSOR_SIZES = [1, 32, 128, 256, 512] -INTERNAL_ITER = 256 -PARALLEL_TASKS_NUM = 4 -N = 100 +from torch.utils._benchmark import Timer +PARALLEL_TASKS_NUM = 4 +INTERNAL_ITER = None def loop_workload(x): for i in range(INTERNAL_ITER): x = torch.mm(x, x) return x -traced_loop_workload = None -def run_profiler_benchmark_loop(input_x, use_cuda, profiling_enabled): - if profiling_enabled: - with torch.autograd.profiler.profile(use_cuda=use_cuda) as prof: - traced_loop_workload(input_x) - else: - traced_loop_workload(input_x) - -def parallel_task(x): - for i in range(int(INTERNAL_ITER / PARALLEL_TASKS_NUM)): - x = torch.mm(x, x) - return x - def parallel_workload(x): + def parallel_task(x): + for i in range(int(INTERNAL_ITER / PARALLEL_TASKS_NUM)): + x = torch.mm(x, x) + return x futs = [] for i in range(PARALLEL_TASKS_NUM): futs.append(torch.jit._fork(parallel_task, x)) @@ -35,50 +25,85 @@ def parallel_workload(x): torch.jit._wait(futs[i]) return x -traced_parallel_workload = None -def run_profiler_benchmark_parallel(input_x, use_cuda, profiling_enabled): - if profiling_enabled: - with torch.autograd.profiler.profile(use_cuda=use_cuda) as prof: - traced_parallel_workload(input_x) - else: - traced_parallel_workload(input_x) if __name__ == '__main__': - for workload_name in ["loop", "parallel"]: - print("Payload: {}; {} iterations, N = {}\n".format( - workload_name, INTERNAL_ITER, N)) - for params in itertools.product([False, True], TENSOR_SIZES, [False, True]): - use_cuda = params[0] - profiling_tensor_size = params[1] - profiling_enabled = params[2] - - if (use_cuda and not torch.cuda.is_available()): - continue - - print("Profiling {}, tensor size {}x{}, use cuda: {}".format( - "enabled" if profiling_enabled else "disabled", - profiling_tensor_size, profiling_tensor_size, use_cuda)) - - input_x = torch.rand(profiling_tensor_size, profiling_tensor_size) - if use_cuda: - input_x = input_x.cuda() - workload = None - if workload_name == "loop": - workload = partial( - run_profiler_benchmark_loop, input_x, use_cuda, profiling_enabled) - traced_loop_workload = torch.jit.trace(loop_workload, input_x) - elif workload_name == "parallel": - workload = partial( 
- run_profiler_benchmark_parallel, input_x, use_cuda, profiling_enabled) - traced_parallel_workload = torch.jit.trace( - parallel_workload, input_x) - - runtimes = timeit.repeat(workload, repeat=N, number=1) + torch._C._set_graph_executor_optimize(False) + parser = argparse.ArgumentParser( + description='Profiler benchmark') + + parser.add_argument('--with_cuda', action='store_true') + parser.add_argument('--with_stack', action='store_true') + parser.add_argument('--use_script', action='store_true') + parser.add_argument('--profiling_tensor_size', default=1, type=int) + parser.add_argument('--workload', default='loop', type=str) + parser.add_argument('--internal_iter', default=256, type=int) + parser.add_argument('--n', default=100, type=int) + parser.add_argument('--use_timer', action='store_true') + parser.add_argument('--timer_min_run_time', default=100, type=int) + + args = parser.parse_args() + + if args.with_cuda and not torch.cuda.is_available(): + print("No CUDA available") + sys.exit() + + print("Payload: {}; {} iterations, N = {}\n".format( + args.workload, args.internal_iter, args.n)) + INTERNAL_ITER = args.internal_iter + + for profiling_enabled in [False, True]: + print("Profiling {}, tensor size {}x{}, use cuda: {}, with stacks: {}, use script: {}".format( + "enabled" if profiling_enabled else "disabled", + args.profiling_tensor_size, + args.profiling_tensor_size, + args.with_cuda, + args.with_stack, + args.use_script)) + + input_x = torch.rand( + args.profiling_tensor_size, + args.profiling_tensor_size) + + if args.with_cuda: + input_x = input_x.cuda() + + workload = None + assert args.workload in ["loop", "parallel"] + if args.workload == "loop": + workload = loop_workload + else: + workload = parallel_workload + + if args.use_script: + traced_workload = torch.jit.trace(workload, (input_x,)) + workload = traced_workload + + if profiling_enabled: + def payload(): + x = None + with torch.autograd.profiler.profile( + use_cuda=args.with_cuda, + with_stack=args.with_stack) as prof: + x = workload(input_x) + return x + else: + def payload(): + return workload(input_x) + + if args.use_timer: + t = Timer( + "payload()", + globals={"payload": payload}, + timer=timeit.default_timer, + ).blocked_autorange(min_run_time=args.timer_min_run_time) + print(t) + else: + runtimes = timeit.repeat(payload, repeat=args.n, number=1) avg_time = statistics.mean(runtimes) * 1000.0 stddev_time = statistics.stdev(runtimes) * 1000.0 print("\tavg. 
time: {:.3f} ms, stddev: {:.3f} ms".format( avg_time, stddev_time)) - if workload_name == "loop": + if args.workload == "loop": print("\ttime per iteration: {:.3f} ms".format( - avg_time / INTERNAL_ITER)) - print() + avg_time / args.internal_iter)) + print() diff --git a/benchmarks/record_function_benchmark/record_function_bench.py b/benchmarks/record_function_benchmark/record_function_bench.py index ddd8243ebf0a..830328247bb5 100644 --- a/benchmarks/record_function_benchmark/record_function_bench.py +++ b/benchmarks/record_function_benchmark/record_function_bench.py @@ -1,7 +1,7 @@ import argparse import sys import torch -import torch.utils._benchmark as benchmark_utils +import torch.utils.benchmark as benchmark_utils try: diff --git a/benchmarks/static_runtime/CMakeLists.txt b/benchmarks/static_runtime/CMakeLists.txt index 6191150dc61b..0a263c2a5a91 100644 --- a/benchmarks/static_runtime/CMakeLists.txt +++ b/benchmarks/static_runtime/CMakeLists.txt @@ -1,3 +1,7 @@ -list(APPEND STATIC_RUNTIME_BENCHMARK_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/deep_wide_pt_bench.cc) list(APPEND STATIC_RUNTIME_BENCHMARK_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/deep_wide_pt.cc) +list(APPEND STATIC_RUNTIME_BENCHMARK_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/deep_wide_pt_bench.cc) set(STATIC_RUNTIME_BENCHMARK_SRCS ${STATIC_RUNTIME_BENCHMARK_SRCS} PARENT_SCOPE) + +list(APPEND STATIC_RUNTIME_TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/deep_wide_pt.cc) +list(APPEND STATIC_RUNTIME_TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/test_static_runtime.cc) +set(STATIC_RUNTIME_TEST_SRCS ${STATIC_RUNTIME_TEST_SRCS} PARENT_SCOPE) diff --git a/benchmarks/static_runtime/deep_wide_pt_bench.cc b/benchmarks/static_runtime/deep_wide_pt_bench.cc index ef960d28d7eb..21c2923f8301 100644 --- a/benchmarks/static_runtime/deep_wide_pt_bench.cc +++ b/benchmarks/static_runtime/deep_wide_pt_bench.cc @@ -60,7 +60,8 @@ static void BM_deep_wide_jit_profiling_executor(benchmark::State& state) { static void BM_deep_wide_static(benchmark::State& state) { auto mod = getDeepAndWideSciptModel(); - torch::jit::StaticRuntime runtime(mod); + auto g = torch::jit::PrepareForStaticRuntime(mod); + torch::jit::StaticRuntime runtime(g); const int batch_size = state.range(0); auto ad_emb_packed = torch::randn({batch_size, 1, embedding_size}); @@ -75,6 +76,28 @@ static void BM_deep_wide_static(benchmark::State& state) { } } +const std::shared_ptr& getStaticGraph() { + static const std::shared_ptr g = + torch::jit::PrepareForStaticRuntime(getDeepAndWideSciptModel()); + return g; +} + +static void BM_deep_wide_static_threaded(benchmark::State& state) { + auto g = getStaticGraph(); + torch::jit::StaticRuntime runtime(g); + + const int batch_size = 1; // state.range(0); + auto ad_emb_packed = torch::randn({batch_size, 1, embedding_size}); + auto user_emb = torch::randn({batch_size, 1, embedding_size}); + auto wide = torch::randn({batch_size, num_features}); + + std::vector inputs({ad_emb_packed, user_emb, wide}); + + for (auto _ : state) { + runtime.run(inputs); + } +} + BENCHMARK(BM_deep_wide_base)->RangeMultiplier(8)->Ranges({{1, 20}}); BENCHMARK(BM_deep_wide_jit_graph_executor) @@ -86,5 +109,6 @@ BENCHMARK(BM_deep_wide_jit_profiling_executor) ->Ranges({{1, 20}}); BENCHMARK(BM_deep_wide_static)->RangeMultiplier(8)->Ranges({{1, 20}}); +BENCHMARK(BM_deep_wide_static_threaded)->Threads(8); BENCHMARK_MAIN(); diff --git a/benchmarks/static_runtime/test_static_runtime.cc b/benchmarks/static_runtime/test_static_runtime.cc index 3ad0956ced73..172073705ea1 100644 --- 
a/benchmarks/static_runtime/test_static_runtime.cc +++ b/benchmarks/static_runtime/test_static_runtime.cc @@ -14,7 +14,85 @@ TEST(StaticRuntime, TrivialModel) { // run static runtime std::vector input_tensors({a, b, c}); - torch::jit::StaticRuntime runtime(mod); + auto g = torch::jit::PrepareForStaticRuntime(mod); + torch::jit::StaticRuntime runtime(g); at::Tensor output_2 = runtime.run(input_tensors)[0]; EXPECT_TRUE(output_1.equal(output_2)); } + +TEST(StaticRuntime, DeepWide) { + const int embedding_size = 32; + const int num_features = 50; + torch::jit::Module mod = getDeepAndWideSciptModel(); + auto g = torch::jit::PrepareForStaticRuntime(mod); + torch::jit::StaticRuntime runtime(g); + + for (int batch_size : {1, 8, 32}) { + for (int i = 0; i < 5; ++i) { + auto ad_emb_packed = torch::randn({batch_size, 1, embedding_size}); + auto user_emb = torch::randn({batch_size, 1, embedding_size}); + auto wide = torch::randn({batch_size, num_features}); + + // run jit graph executor + std::vector inputs({ad_emb_packed, user_emb, wide}); + at::Tensor output_1 = mod.forward(inputs).toTensor(); + + // run static runtime + std::vector input_tensors({ad_emb_packed, user_emb, wide}); + at::Tensor output_2 = runtime.run(input_tensors)[0]; + EXPECT_TRUE(output_1.equal(output_2)); + } + } +} + +TEST(StaticRuntime, KWargsAPI_1) { + const int embedding_size = 32; + const int num_features = 50; + auto module = getDeepAndWideSciptModel(); + torch::jit::StaticRuntime runtime(module); + + for (int batch_size : {1, 8, 32}) { + for (int i = 0; i < 5; ++i) { + auto ad_emb_packed = torch::randn({batch_size, 1, embedding_size}); + auto user_emb = torch::randn({batch_size, 1, embedding_size}); + auto wide = torch::randn({batch_size, num_features}); + + // run jit graph executor + std::vector inputs({ad_emb_packed, user_emb, wide}); + at::Tensor output_1 = module.forward(inputs).toTensor(); + + // run static runtime + at::Tensor output_2 = runtime.run(inputs, {}).toTensor(); + EXPECT_TRUE(output_1.equal(output_2)); + } + } +} + +TEST(StaticRuntime, KWargsAPI_2) { + const int embedding_size = 32; + const int num_features = 50; + auto module = getDeepAndWideSciptModel(); + auto g = torch::jit::PrepareForStaticRuntime(module); + torch::jit::StaticRuntime runtime(module); + + for (int batch_size : {1, 8, 32}) { + for (int i = 0; i < 5; ++i) { + auto ad_emb_packed = torch::randn({batch_size, 1, embedding_size}); + auto user_emb = torch::randn({batch_size, 1, embedding_size}); + auto wide = torch::randn({batch_size, num_features}); + + // run jit graph executor + std::vector args({ad_emb_packed, user_emb, wide}); + at::Tensor output_1 = module.forward(args).toTensor(); + + std::unordered_map kwargs( + {{"ad_emb_packed", ad_emb_packed}, + {"user_emb", user_emb}, + {"wide", wide}}); + + // run static runtime + at::Tensor output_2 = runtime.run({}, kwargs).toTensor(); + EXPECT_TRUE(output_1.equal(output_2)); + } + } +} diff --git a/binaries/record_function_benchmark.cc b/binaries/record_function_benchmark.cc index a7e3383b97f4..d924003b9270 100644 --- a/binaries/record_function_benchmark.cc +++ b/binaries/record_function_benchmark.cc @@ -9,8 +9,8 @@ C10_DEFINE_int(iter, 100, "Number of iterations"); C10_DEFINE_int(warmup_iter, 10, "Number of warmup iterations"); -C10_DEFINE_int(rec_fn_iter, 10e6, - "Number of iterations for the pure RecordFunction benchmark"); +C10_DEFINE_int(sampled_iter, 10e6, + "Number of iterations for the sampled observer benchmark"); namespace { const int kInnerIter = 100; @@ -23,6 +23,8 @@ const float 
kLowSamplingProb = 0.0001; } void setupBenchmarkCallbacks() { + at::enableRecordFunction(); + at::clearCallbacks(); // non-sampled callback at::addGlobalCallback(at::RecordFunctionCallback( [&](const at::RecordFunction& fn) {}, @@ -40,7 +42,7 @@ void setupBenchmarkCallbacks() { } } -float runBench(int tensor_size, int outer_iter) { +float runTensorBench(int tensor_size, int outer_iter) { typedef std::chrono::high_resolution_clock clock; typedef std::chrono::microseconds us; std::chrono::time_point start_time = clock::now(); @@ -54,30 +56,53 @@ float runBench(int tensor_size, int outer_iter) { return duration; } -int main(int argc, char** argv) { - if (!c10::ParseCommandLineFlags(&argc, &argv)) { - std::cout << "Failed to parse command line flags" << std::endl; - return -1; +float runPureRecordFunctionBench(int outer_iter) { + typedef std::chrono::high_resolution_clock clock; + typedef std::chrono::microseconds us; + std::chrono::time_point start_time = clock::now(); + for (auto n = 0; n < outer_iter; ++n) { + RECORD_USER_SCOPE("test"); } + auto duration = static_cast( + std::chrono::duration_cast(clock::now() - start_time).count()); + return duration; +} - at::enableRecordFunction(); - setupBenchmarkCallbacks(); - - auto duration = runBench(kSmallTensorSize, FLAGS_warmup_iter); - std::cout << "Warmup time: " << duration << " us." << std::endl; - +void runBenchmark() { + float duration = 0; for (auto tensor_size : std::set({kSmallTensorSize, kTensorSize})) { - duration = runBench(tensor_size, FLAGS_iter); - std::cout << "Time per iteration (" + duration = runTensorBench(tensor_size, FLAGS_iter); + std::cout << "Running tensor benchmark, time per iteration (" << tensor_size << "x" << tensor_size << "): " << (duration/FLAGS_iter) << " us." << std::endl; } + duration = runPureRecordFunctionBench(FLAGS_iter * 100); + std::cout << "Running pure RecordFunction benchmark, time per iteration: " + << (duration/FLAGS_iter) + << " us." << std::endl; +} + +int main(int argc, char** argv) { + if (!c10::ParseCommandLineFlags(&argc, &argv)) { + std::cout << "Failed to parse command line flags" << std::endl; + return -1; + } + + auto duration = runTensorBench(kSmallTensorSize, FLAGS_warmup_iter); + std::cout << "Warmup time: " << duration << " us." 
<< std::endl; + + setupBenchmarkCallbacks(); + std::cout << "Running with empty observers" << std::endl; + runBenchmark(); at::clearCallbacks(); + std::cout << "Running without observers" << std::endl; + runBenchmark(); + std::cout << "Running sampled observer benchmark" << std::endl; int cb_count = 0; at::addGlobalCallback(at::RecordFunctionCallback( [&](const at::RecordFunction& fn) { @@ -88,18 +113,12 @@ int main(int argc, char** argv) { .samplingProb(kLowSamplingProb) ); - typedef std::chrono::high_resolution_clock clock; - typedef std::chrono::microseconds us; - std::chrono::time_point start_time = clock::now(); - for (auto n = 0; n < FLAGS_rec_fn_iter; ++n) { - RECORD_USER_SCOPE("test"); - } - duration = static_cast( - std::chrono::duration_cast(clock::now() - start_time).count()); - std::cout << "Pure RecordFunction runtime of " << FLAGS_rec_fn_iter + runPureRecordFunctionBench(FLAGS_sampled_iter); + + std::cout << "Pure RecordFunction runtime of " << FLAGS_sampled_iter << " iterations " << duration << " us, number of callback invocations: " << cb_count - << ", expected number: ~" << (int)(FLAGS_rec_fn_iter * kLowSamplingProb) + << ", expected number: ~" << (int)(FLAGS_sampled_iter * kLowSamplingProb) << " invocations" << std::endl; at::clearCallbacks(); diff --git a/binaries/speed_benchmark_torch.cc b/binaries/speed_benchmark_torch.cc index db78467cfb43..09f1cabb8e15 100644 --- a/binaries/speed_benchmark_torch.cc +++ b/binaries/speed_benchmark_torch.cc @@ -24,7 +24,7 @@ #include "torch/csrc/jit/serialization/import.h" #include "torch/script.h" -#include "c10/core/CPUCachingAllocator.h" +#include "c10/mobile/CPUCachingAllocator.h" #include using namespace std::chrono; diff --git a/c10/CMakeLists.txt b/c10/CMakeLists.txt index 17fd7e680122..48bceb440954 100644 --- a/c10/CMakeLists.txt +++ b/c10/CMakeLists.txt @@ -17,6 +17,7 @@ set(C10_USE_GFLAGS ${USE_GFLAGS}) # used in cmake_macros.h.in set(C10_USE_GLOG ${USE_GLOG}) # used in cmake_macros.h.in set(C10_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS}) # used in cmake_macros.h.in set(C10_USE_NUMA ${USE_NUMA}) +set(C10_USE_MSVC_STATIC_RUNTIME ${CAFFE2_USE_MSVC_STATIC_RUNTIME}) configure_file( ${CMAKE_CURRENT_LIST_DIR}/macros/cmake_macros.h.in ${CMAKE_BINARY_DIR}/c10/macros/cmake_macros.h) @@ -32,6 +33,7 @@ file(GLOB C10_SRCS core/dispatch/*.cpp core/op_registration/*.cpp core/impl/*.cpp + mobile/*.cpp macros/*.cpp util/*.cpp ) diff --git a/c10/core/CPUAllocator.cpp b/c10/core/CPUAllocator.cpp index e830aa4832d0..c76fefe21d27 100644 --- a/c10/core/CPUAllocator.cpp +++ b/c10/core/CPUAllocator.cpp @@ -1,6 +1,7 @@ #include -#include #include +#include +#include // TODO: rename flags to C10 C10_DEFINE_bool( @@ -156,13 +157,20 @@ class DefaultMobileCPUAllocator final : public at::Allocator { // TODO: enable with better TLS support on mobile // profiledCPUMemoryReporter().Delete(pointer); auto allocator_ptr = GetThreadLocalCachingAllocator(); + auto profiling_allocator_ptr = GetThreadLocalProfilingAllocator(); if (allocator_ptr != nullptr) { allocator_ptr->free(pointer); + } else if (profiling_allocator_ptr != nullptr) { + profiling_allocator_ptr->free(pointer); } else { c10::free_cpu(pointer); // This adds extra cost to freeing memory to the default case when // caching allocator is not enabled. 
CPUCachingAllocator::record_free(pointer); + auto allocation_planner = GetThreadLocalAllocationPlanner(); + if (allocation_planner != nullptr) { + allocation_planner->record_free(pointer); + } } } @@ -179,10 +187,17 @@ class DefaultMobileCPUAllocator final : public at::Allocator { auto alloc_size = PreGuardBytes + nbytes + PostGuardBytes; void* data; auto allocator_ptr = GetThreadLocalCachingAllocator(); + auto profiling_allocator_ptr = GetThreadLocalProfilingAllocator(); if (allocator_ptr != nullptr) { data = allocator_ptr->allocate(alloc_size); + } else if (profiling_allocator_ptr != nullptr) { + data = profiling_allocator_ptr->allocate(alloc_size); } else { data = c10::alloc_cpu(alloc_size); + auto allocation_planner = GetThreadLocalAllocationPlanner(); + if (allocation_planner != nullptr) { + allocation_planner->record_allocation(alloc_size, data); + } } // profiledCPUMemoryReporter().New(data, alloc_size); return { diff --git a/c10/core/DefaultDtype.cpp b/c10/core/DefaultDtype.cpp index daae181db9d7..c4f420ab6e22 100644 --- a/c10/core/DefaultDtype.cpp +++ b/c10/core/DefaultDtype.cpp @@ -3,11 +3,13 @@ namespace c10 { static auto default_dtype = caffe2::TypeMeta::Make(); +static auto default_dtype_as_scalartype = typeMetaToScalarType(default_dtype); static auto default_complex_dtype = caffe2::TypeMeta::Make>(); void set_default_dtype(caffe2::TypeMeta dtype) { default_dtype = std::move(dtype); - if(dtype == caffe2::TypeMeta::Make()) { + default_dtype_as_scalartype = typeMetaToScalarType(default_dtype); + if(default_dtype_as_scalartype == ScalarType::Double) { default_complex_dtype = std::move(caffe2::TypeMeta::Make>()); } else { default_complex_dtype = std::move(caffe2::TypeMeta::Make>()); @@ -17,6 +19,9 @@ void set_default_dtype(caffe2::TypeMeta dtype) { const caffe2::TypeMeta& get_default_dtype() { return default_dtype; } +ScalarType get_default_dtype_as_scalartype() { + return default_dtype_as_scalartype; +} const caffe2::TypeMeta& get_default_complex_dtype() { return default_complex_dtype; } diff --git a/c10/core/DefaultDtype.h b/c10/core/DefaultDtype.h index 402a6069bfc3..eda34b217727 100644 --- a/c10/core/DefaultDtype.h +++ b/c10/core/DefaultDtype.h @@ -1,6 +1,7 @@ #pragma once #include +#include namespace caffe2 { class TypeMeta; @@ -9,5 +10,6 @@ class TypeMeta; namespace c10 { C10_API void set_default_dtype(caffe2::TypeMeta dtype); C10_API const caffe2::TypeMeta& get_default_dtype(); +C10_API ScalarType get_default_dtype_as_scalartype(); C10_API const caffe2::TypeMeta& get_default_complex_dtype(); } // namespace c10 diff --git a/c10/core/ScalarType.h b/c10/core/ScalarType.h index 41980540017c..8f2acebd84f0 100644 --- a/c10/core/ScalarType.h +++ b/c10/core/ScalarType.h @@ -38,7 +38,8 @@ namespace c10 { _(c10::qint8, QInt8) /* 12 */ \ _(c10::quint8, QUInt8) /* 13 */ \ _(c10::qint32, QInt32) /* 14 */ \ - _(at::BFloat16, BFloat16) /* 15 */ + _(at::BFloat16, BFloat16) /* 15 */ \ + _(c10::quint4x2, QUInt4x2) /* 16 */ // If you want to support ComplexHalf for real, add ComplexHalf @@ -154,7 +155,8 @@ AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(SPECIALIZE_CppTypeToScalarType) #define AT_FORALL_QINT_TYPES(_) \ _(c10::qint8, QInt8) \ _(c10::quint8, QUInt8) \ - _(c10::qint32, QInt32) + _(c10::qint32, QInt32) \ + _(c10::quint4x2, QUInt4x2) #define AT_FORALL_COMPLEX_TYPES(_) \ _(c10::complex, ComplexFloat) \ @@ -279,7 +281,7 @@ static inline bool isComplexType(ScalarType t) { static inline bool isQIntType(ScalarType t) { // Don't forget to extend this when adding new QInt types - return t 
== ScalarType:: QInt8 || t == ScalarType::QUInt8 || t == ScalarType::QInt32; + return t == ScalarType:: QInt8 || t == ScalarType::QUInt8 || t == ScalarType::QInt32 || t == ScalarType::QUInt4x2; } static inline ScalarType toQIntType(ScalarType t) { @@ -303,6 +305,8 @@ static inline ScalarType toUnderlying(ScalarType t) { return ScalarType::Char; case ScalarType::QInt32: return ScalarType::Int; + case ScalarType::QUInt4x2: + return ScalarType::Byte; default: return t; } diff --git a/c10/core/TensorOptions.h b/c10/core/TensorOptions.h index a42f4d4284f4..dd92f919662f 100644 --- a/c10/core/TensorOptions.h +++ b/c10/core/TensorOptions.h @@ -17,6 +17,29 @@ #include namespace c10 { + +DispatchKey computeDispatchKey(c10::optional dtype, c10::optional layout, c10::optional device); + +inline ScalarType dtype_or_default(c10::optional dtype) { + return dtype.has_value() ? *dtype : get_default_dtype_as_scalartype(); +} + +inline caffe2::TypeMeta dtype_or_default(c10::optional dtype) { + return dtype.has_value() ? *dtype : get_default_dtype(); +} + +inline Layout layout_or_default(c10::optional layout) { + return layout.has_value() ? *layout : kStrided; +} + +inline Device device_or_default(c10::optional device) { + return device.has_value() ? *device : Device(kCPU); +} + +inline bool pinned_memory_or_default(c10::optional pinned_memory) { + return pinned_memory.has_value() ? *pinned_memory : false; +} + /// A class to encapsulate construction axes of an Tensor. TensorOptions was /// designed to support the Python style API for specifying construction options /// on factory functions, e.g., @@ -228,7 +251,7 @@ struct C10_API TensorOptions { /// Returns the device of the `TensorOptions`. Device device() const noexcept { - return has_device_ ? device_ : Device(kCPU); + return device_or_default(device_opt()); } /// Returns whether the device is specified. @@ -249,7 +272,7 @@ struct C10_API TensorOptions { /// Returns the dtype of the `TensorOptions`. caffe2::TypeMeta dtype() const noexcept { - return has_dtype_ ? dtype_ : get_default_dtype(); + return dtype_or_default(dtype_opt()); } /// Returns whether the dtype is specified. @@ -265,7 +288,7 @@ struct C10_API TensorOptions { /// Returns the layout of the `TensorOptions`. Layout layout() const noexcept { - return has_layout_ ? layout_ : kStrided; + return layout_or_default(layout_opt()); } /// Returns whether the layout is specified. @@ -298,7 +321,7 @@ struct C10_API TensorOptions { /// Returns the `pinned_memory` property of the `TensorOptions`. bool pinned_memory() const noexcept { - return has_pinned_memory_ ? pinned_memory_ : false; + return pinned_memory_or_default(pinned_memory_opt()); } /// Returns whether the `pinned_memory` is specified. 
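The new `dtype_or_default` / `layout_or_default` / `device_or_default` / `pinned_memory_or_default` helpers above centralize the "use the value if set, otherwise fall back to a global default" logic that the `TensorOptions` accessors previously open-coded. A minimal sketch of the pattern, using hypothetical standalone names and `std::optional` in place of `c10::optional` so it builds outside PyTorch:

```cpp
#include <iostream>
#include <optional>

enum class Layout { Strided, Sparse, Mkldnn };

// Mirrors layout_or_default(): an unset optional falls back to the strided default.
Layout layout_or_default(std::optional<Layout> layout) {
  return layout.has_value() ? *layout : Layout::Strided;
}

int main() {
  std::optional<Layout> unset;                    // caller did not specify a layout
  std::optional<Layout> sparse = Layout::Sparse;  // caller asked for sparse
  std::cout << (layout_or_default(unset) == Layout::Strided) << "\n";  // prints 1
  std::cout << (layout_or_default(sparse) == Layout::Sparse) << "\n";  // prints 1
  return 0;
}
```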
@@ -370,65 +393,7 @@ struct C10_API TensorOptions { } inline DispatchKey computeDispatchKey() const { - switch (layout()) { - case Layout::Strided: - switch (device().type()) { - case DeviceType::CPU: { - auto dtype_tmp = typeMetaToScalarType(dtype()); - if (isQIntType(dtype_tmp)) { - return DispatchKey::QuantizedCPU; - } - return DispatchKey::CPU; - } - case DeviceType::CUDA: { - auto dtype_tmp = typeMetaToScalarType(dtype()); - if (isQIntType(dtype_tmp)) { - return DispatchKey::QuantizedCUDA; - } - return DispatchKey::CUDA; - } - case DeviceType::MKLDNN: - return DispatchKey::MKLDNN; - case DeviceType::OPENGL: - return DispatchKey::OpenGL; - case DeviceType::OPENCL: - return DispatchKey::OpenCL; - case DeviceType::IDEEP: - return DispatchKey::IDEEP; - case DeviceType::HIP: - return DispatchKey::HIP; - case DeviceType::FPGA: - return DispatchKey::FPGA; - case DeviceType::MSNPU: - return DispatchKey::MSNPU; - case DeviceType::XLA: - return DispatchKey::XLA; - case DeviceType::Vulkan: - return DispatchKey::Vulkan; - default: - AT_ERROR("Unsupported device type for dense layout: ", device().type()); - } - case Layout::Sparse: - switch (device().type()) { - case DeviceType::CPU: - return DispatchKey::SparseCPU; - case DeviceType::CUDA: - return DispatchKey::SparseCUDA; - case DeviceType::HIP: - return DispatchKey::SparseHIP; - default: - AT_ERROR("Unsupported device type for sparse layout: ", device().type()); - } - case Layout::Mkldnn: - switch (device().type()) { - case DeviceType::CPU: - return DispatchKey::MkldnnCPU; - default: - AT_ERROR("Unsupported device type for mkldnn layout: ", device().type()); - } - default: - AT_ERROR("Unsupported layout: ", layout()); - } + return c10::computeDispatchKey(optTypeMetaToScalarType(dtype_opt()), layout_opt(), device_opt()); } private: @@ -611,13 +576,68 @@ inline std::string toString(const TensorOptions options) { // This is intended to be a centralized location by which we can determine // what an appropriate DispatchKey for a tensor is. -// -// This takes a TensorOptions, rather than just a DeviceType and Layout, because -// we reserve the right to change dispatch based on *any* aspect of -// TensorOptions. 
WARNING: If you do this, you need to fix the calls -// to computeDispatchKey in caffe2/tensor.h -inline DispatchKey computeDispatchKey(TensorOptions options) { - return options.computeDispatchKey(); +inline DispatchKey computeDispatchKey(c10::optional dtype, c10::optional layout, c10::optional device) { + const auto layout_ = layout_or_default(layout); + const auto device_ = device_or_default(device); + switch (layout_) { + case Layout::Strided: { + const auto dtype_ = dtype_or_default(dtype); + switch (device_.type()) { + case DeviceType::CPU: { + if (isQIntType(dtype_)) { + return DispatchKey::QuantizedCPU; + } + return DispatchKey::CPU; + } + case DeviceType::CUDA: { + if (isQIntType(dtype_)) { + return DispatchKey::QuantizedCUDA; + } + return DispatchKey::CUDA; + } + case DeviceType::MKLDNN: + return DispatchKey::MKLDNN; + case DeviceType::OPENGL: + return DispatchKey::OpenGL; + case DeviceType::OPENCL: + return DispatchKey::OpenCL; + case DeviceType::IDEEP: + return DispatchKey::IDEEP; + case DeviceType::HIP: + return DispatchKey::HIP; + case DeviceType::FPGA: + return DispatchKey::FPGA; + case DeviceType::MSNPU: + return DispatchKey::MSNPU; + case DeviceType::XLA: + return DispatchKey::XLA; + case DeviceType::Vulkan: + return DispatchKey::Vulkan; + default: + AT_ERROR("Unsupported device type for dense layout: ", device_.type()); + } + } + case Layout::Sparse: + switch (device_.type()) { + case DeviceType::CPU: + return DispatchKey::SparseCPU; + case DeviceType::CUDA: + return DispatchKey::SparseCUDA; + case DeviceType::HIP: + return DispatchKey::SparseHIP; + default: + AT_ERROR("Unsupported device type for sparse layout: ", device_.type()); + } + case Layout::Mkldnn: + switch (device_.type()) { + case DeviceType::CPU: + return DispatchKey::MkldnnCPU; + default: + AT_ERROR("Unsupported device type for mkldnn layout: ", device_.type()); + } + default: + AT_ERROR("Unsupported layout: ", layout_); + } } // We deliberately ignore handling AutogradCPU/CUDA/XLA... keys to diff --git a/c10/core/impl/DeviceGuardImplInterface.h b/c10/core/impl/DeviceGuardImplInterface.h index 516aebba0747..f7f5b4f867a9 100644 --- a/c10/core/impl/DeviceGuardImplInterface.h +++ b/c10/core/impl/DeviceGuardImplInterface.h @@ -209,7 +209,15 @@ class C10_API DeviceGuardImplRegistrar { static ::c10::impl::DeviceGuardImplRegistrar C10_ANONYMOUS_VARIABLE(g_##DeviceType)(::c10::DeviceType::DevType, new DeviceGuardImpl()); inline const DeviceGuardImplInterface* getDeviceGuardImpl(DeviceType type) { - auto p = device_guard_impl_registry[static_cast(type)].load(); + // Two adjacent int16_t fields DeviceType and DeviceIndex has field access + // miscompiled on NVCC. To workaround this issue, we apply a mask to the + // DeviceType. First check if the DeviceType is 16-bit. + // FB employees can see + // https://fb.workplace.com/groups/llvm.gcc/permalink/4053565044692080/ + // for more details + static_assert(sizeof(DeviceType) == 2, "DeviceType is not 16-bit"); + auto p = device_guard_impl_registry[static_cast(type) & 0xFFFF].load(); + // This seems to be the first place where you make use of a device // when you pass devices to factory functions. Give a nicer error // message in this case. 
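With `computeDispatchKey` now exposed as a free function over optional dtype/layout/device (the `TensorOptions::computeDispatchKey()` method above simply forwards to it), callers can resolve a dispatch key without materializing a `TensorOptions`. A hedged usage sketch, assuming a libtorch build that contains this patch:

```cpp
#include <c10/core/TensorOptions.h>

int main() {
  using namespace c10;
  // A quantized dtype on a strided CPU tensor resolves to the quantized backend key.
  DispatchKey quantized = computeDispatchKey(
      ScalarType::QUInt8, Layout::Strided, Device(DeviceType::CPU));
  // Unspecified fields fall back to the defaults: strided layout, CPU device,
  // and the global default dtype.
  DispatchKey dense = computeDispatchKey(nullopt, nullopt, nullopt);
  return (quantized == DispatchKey::QuantizedCPU && dense == DispatchKey::CPU) ? 0 : 1;
}
```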
diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp index 2285a332f709..84542f064c2e 100644 --- a/c10/cuda/CUDACachingAllocator.cpp +++ b/c10/cuda/CUDACachingAllocator.cpp @@ -896,6 +896,19 @@ class THCCachingAllocator { THCCachingAllocator caching_allocator; +// Returns whether to force all allocations to bypass the caching allocator and +// go straight to cudaMalloc. This setting is useful when debugging GPU memory +// errors, since the caching allocator foils cuda-memcheck. +bool forceUncachedAllocator() { + static bool force_uncached = + getenv("PYTORCH_NO_CUDA_MEMORY_CACHING") != nullptr; + return force_uncached; +} + +static void uncached_delete(void* ptr) { + C10_CUDA_CHECK(cudaFree(ptr)); +} + // NB: I decided not to fold this into THCCachingAllocator, because the latter // has a lot more methods and it wasn't altogether clear that they should // actually be publicly exposed @@ -904,6 +917,10 @@ struct CudaCachingAllocator : public Allocator { int device; C10_CUDA_CHECK(cudaGetDevice(&device)); void* r = nullptr; + if (forceUncachedAllocator()) { + C10_CUDA_CHECK(cudaMalloc(&r, size)); + return {r, r, &uncached_delete, Device(DeviceType::CUDA, device)}; + } if (size != 0) { caching_allocator.malloc(&r, device, size, cuda::getCurrentCUDAStream(device)); } diff --git a/c10/macros/Export.h b/c10/macros/Export.h index 5888207c5f80..966dd22e08fa 100644 --- a/c10/macros/Export.h +++ b/c10/macros/Export.h @@ -113,8 +113,8 @@ #define TORCH_HIP_API C10_IMPORT #endif -// Enums only need to be exported on windows -#ifdef _WIN32 +// Enums only need to be exported on windows for non-CUDA files +#if defined(_WIN32) && defined(__CUDACC__) #define C10_API_ENUM C10_API #else #define C10_API_ENUM diff --git a/c10/macros/cmake_macros.h.in b/c10/macros/cmake_macros.h.in index 5e42506f20dc..2845fa1cd8d2 100644 --- a/c10/macros/cmake_macros.h.in +++ b/c10/macros/cmake_macros.h.in @@ -8,6 +8,7 @@ #cmakedefine C10_USE_GLOG #cmakedefine C10_USE_GFLAGS #cmakedefine C10_USE_NUMA +#cmakedefine C10_USE_MSVC_STATIC_RUNTIME // Used by libtorch mobile build to enable features that are not enabled by // caffe2 mobile build. Should only use it when necessary as we are committed diff --git a/c10/core/CPUCachingAllocator.cpp b/c10/mobile/CPUCachingAllocator.cpp similarity index 98% rename from c10/core/CPUCachingAllocator.cpp rename to c10/mobile/CPUCachingAllocator.cpp index 232b8f2306e2..b2f193299089 100644 --- a/c10/core/CPUCachingAllocator.cpp +++ b/c10/mobile/CPUCachingAllocator.cpp @@ -1,4 +1,4 @@ -#include +#include namespace c10 { diff --git a/c10/core/CPUCachingAllocator.h b/c10/mobile/CPUCachingAllocator.h similarity index 71% rename from c10/core/CPUCachingAllocator.h rename to c10/mobile/CPUCachingAllocator.h index ac5f3a95c881..6a748f4f1791 100644 --- a/c10/core/CPUCachingAllocator.h +++ b/c10/mobile/CPUCachingAllocator.h @@ -10,6 +10,38 @@ #include #include +/* + * CPUCachingAllocator: + * DISCLAIMER: + * This is subject to change (beta) and only supported on mobile builds. + * If code snippet such as in 'Usage pattern' is used outside of mobile + * build you will not observe the intended behavior. + * See below for more information. + * Why? + * It has been observed that some mobile platforms, such as pixel 3, return + * memory aggressively to the system. This results in page faults in some cases + * and ends up hurting performance. This caching allocator aims to address that. 
+ * Furthermore it also allows users to specify their own allocator by implementing + * allocate/free virtual interfaces. + * What are the cons? + * There are some cons that were observed where use of caching allocator led to + * worse performance on some platforms. The reason is that the caching mechanism + * used by this allocator left us worse off compared to the corresponding platform's + * tuned memory allocator. In that case it seemed better to not use this allocator. + * Note there are some ideas to fix this in the works. + * + * Usage: + * Usage pattern: + * Instantiate and own the caching allocator. + * std::unique_ptr caching_allocator = + * std::make_unique(); + * Use caching allocator with a scoped guard at inference time. + * { + * WithCPUCachingAllocatorGuard(caching_allocator.get()); + * ... model.forward(...); + * } + */ + namespace c10 { class C10_API CPUCachingAllocator { @@ -64,16 +96,6 @@ CPUCachingAllocator* GetDefaultCPUCachingAllocator(); bool ThreadLocalCachingAllocatorEnabled(); CPUCachingAllocator* GetThreadLocalCachingAllocator(); -/* - * Usage pattern: - * std::unique_ptr caching_allocator = - * std::make_unique(); - * { - * WithCPUCachingAllocatorGuard(caching_allocator.get()); - * ... - * } - */ - class C10_API WithCPUCachingAllocatorGuard { public: WithCPUCachingAllocatorGuard(CPUCachingAllocator* allocator); diff --git a/c10/mobile/CPUProfilingAllocator.cpp b/c10/mobile/CPUProfilingAllocator.cpp new file mode 100644 index 000000000000..3559c8ce280f --- /dev/null +++ b/c10/mobile/CPUProfilingAllocator.cpp @@ -0,0 +1,410 @@ +#include + +#include + +namespace c10 { + +namespace { +thread_local AllocationPlanner* allocation_planner{nullptr}; +thread_local CPUProfilingAllocator* profiling_allocator{nullptr}; + +struct MemBlock { + uint64_t start_offset, end_offset; + MemBlock(uint64_t s, uint64_t e) : start_offset(s), end_offset(e) {} + bool operator<(const MemBlock& other) const { + return end_offset <= other.start_offset; + } +}; + +bool validate_allocation_plan( + const std::vector& allocation_sizes, + const std::vector& allocation_offsets) { + std::set allocations; + for (uint64_t i = 0; i < allocation_sizes.size(); ++i) { + // Skip allocations not managed by AllocationPlan + if (allocation_offsets[i] == std::numeric_limits::max()) { + continue; + } + auto start_offset = allocation_offsets[i]; + auto end_offset = allocation_offsets[i] + allocation_sizes[i]; + if (!allocations.emplace(start_offset, end_offset).second) { + return false; + } + } + return true; +} + +enum class EventType { + Allocate = 0, + Free, + Invalid +}; + +struct MemEvent { + uint64_t time; + uint64_t allocation_id; + uint64_t size; + EventType type{EventType::Invalid}; + MemEvent(uint64_t t, uint64_t id, uint64_t s, EventType e) : + time(t), allocation_id(id), size(s), type(e) {} +}; + +std::vector create_and_sort_mem_events( + const std::vector& allocation_sizes, + const std::vector& allocation_lifetimes) { + std::vector events; + for (uint64_t i = 0; i < allocation_sizes.size(); ++i) { + // If observed allocations are freed outside the scope of + // observation, then allocations are not managed by the + // AllocationPlan.
+ if (allocation_lifetimes[i] == std::numeric_limits::max()) { + continue; + } + events.emplace_back(i, i, allocation_sizes[i], EventType::Allocate); + events.emplace_back(allocation_lifetimes[i], i, allocation_sizes[i], EventType::Free); + } + std::sort( + events.begin(), + events.end(), + [](const MemEvent& a, + const MemEvent& b) -> bool {return a.time < b.time;}); + return events; +} + +std::vector formulate_greedy_allocation_plan( + const std::vector& allocation_sizes, + const std::vector& allocation_lifetimes) { + // Step 1. Construct all allocation/free events. + // Sort these events by timestamp. + // Step 2. Iterate through all events. + // 2.1 If allocate event: + // Find all candidate in free_size_to_offset map + // Greedily pick the first one. + // Remove the entry from free_size_to_offset map. + // new_offset = offset + request_size + // new_size = size - request_size + // Add new entry to both maps + // 2.2 If free event. + // Check if the returned offset merges with another chunk. + // If so merge until no more merging is possible. + // If returned offset does not merge, then + // just return it as a chunk. + + // lower_bound on this map will get all candidates of + // the right size for allocation. + std::map free_size_to_offset; + // This provides fast lookup when we want to insert freed block + // back, especially when we want to merge blocks. + ska::flat_hash_map::iterator> free_start_offset_to_size_iter; + ska::flat_hash_map::iterator> free_end_offset_to_size_iter; + // Upon free end_ptr = offset + size + // If end_ptr exists merge freed allocation + // Also find coresponding offset in size_to_offet + // Remove that entry and update with new size and offset + // If end_ptr does not exist then just insert offset,size + // in map and correspondingly size, offset in the other map. + // Merging should always be done recursively until no more chunks + // that can be found. + // After last free we should have only one entry left in these maps. + ska::flat_hash_map allocated_offset_to_size; + + std::vector allocation_offsets( + allocation_sizes.size(), std::numeric_limits::max()); + auto mem_events = create_and_sort_mem_events(allocation_sizes, allocation_lifetimes); + uint64_t max_offset{0}; + for (const auto& mem_event : mem_events) { + uint64_t alloc_offset; + uint64_t new_offset, new_size; + if (mem_event.type == EventType::Allocate) { + auto it = free_size_to_offset.lower_bound(mem_event.size); + if (it == free_size_to_offset.end()) { + // If there is no contiguous block of the size requested + // allocate a new one. + alloc_offset = max_offset; + max_offset += mem_event.size; + allocated_offset_to_size.emplace(alloc_offset, mem_event.size); + } else { + // If we have found a block of the size we want + // 1. change the block by allocating out of it. + // 1.1 Erase the entire block + // 1.2 Erase the reverse map entries + // 2. If block still has space left insert the remainder back in map. + // Including reverse map entries. + // 3. Insert the allocated block in allocated_offset_to_size. 
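The step-by-step comments above describe first-fit reuse with block splitting and merging. As a much-simplified illustration of the core idea only (no splitting or merging; all names and the example trace are invented for this sketch and are not part of the patch):

```cpp
// Toy first-fit plan over a 3-allocation trace. A lifetime of k means the
// block becomes free right before allocation id k is made.
#include <cstdint>
#include <iostream>
#include <map>
#include <vector>

int main() {
  std::vector<uint64_t> sizes = {64, 32, 64};   // bytes, in allocation order
  std::vector<uint64_t> lifetimes = {2, 3, 3};  // id 0 dies before id 2
  std::vector<uint64_t> offsets(sizes.size(), 0);
  std::multimap<uint64_t, uint64_t> free_blocks;  // size -> offset
  uint64_t high_water = 0;
  for (uint64_t id = 0; id < sizes.size(); ++id) {
    // Return blocks whose lifetime ended at this point in the trace.
    for (uint64_t j = 0; j < id; ++j) {
      if (lifetimes[j] == id) free_blocks.emplace(sizes[j], offsets[j]);
    }
    // First fit: smallest free block that is large enough, else grow the blob.
    auto it = free_blocks.lower_bound(sizes[id]);
    if (it != free_blocks.end()) {
      offsets[id] = it->second;  // reuse the block (remainder handling omitted)
      free_blocks.erase(it);
    } else {
      offsets[id] = high_water;
      high_water += sizes[id];
    }
  }
  for (uint64_t id = 0; id < sizes.size(); ++id) {
    std::cout << "alloc " << id << " -> offset " << offsets[id] << "\n";
  }
  // Prints offsets 0, 64, 0: allocation 2 reuses allocation 0's slot, so the
  // blob stays at 96 bytes instead of 160.
  return 0;
}
```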
+ alloc_offset = it->second; + new_offset = alloc_offset + mem_event.size; + new_size = it->first - mem_event.size; + free_size_to_offset.erase(it); + free_start_offset_to_size_iter.erase(alloc_offset); + free_end_offset_to_size_iter.erase(alloc_offset + it->first); + if (new_size > 0) { + auto ref_it = free_size_to_offset.emplace(new_offset, new_size).first; + free_start_offset_to_size_iter.emplace(new_offset, ref_it); + free_end_offset_to_size_iter.emplace(new_offset + new_size, ref_it); + } + allocated_offset_to_size.emplace(alloc_offset, mem_event.size); + } + allocation_offsets[mem_event.allocation_id] = alloc_offset; + } else { + // 1. Check if freed block is adjacent to an existing free block + // at its end boundary. This is done by checking + // free_end_offset_to_size_iter. + // If we find such a block, remove it and adjust size of + // the block being freed. + // 2. Similarly check if freed block is adjacent to an existing + // free block at start boundary. This is done by checking + // free_start_offset_to_size_iter. + // If we find such a block, remove it and adjust size of + // the block being freed. + // 3. Insert the freed block in the map. + auto freed_offset = allocation_offsets[mem_event.allocation_id]; + auto freed_size = mem_event.size; + auto end_offset = freed_offset + freed_size; + // Merge when another free block exists at the end of this block + auto end_it = free_end_offset_to_size_iter.find(end_offset); + if (end_it != free_end_offset_to_size_iter.end()) { + auto size_to_end_offset_iter = end_it->second; + freed_size += size_to_end_offset_iter->first; + free_size_to_offset.erase(size_to_end_offset_iter); + free_end_offset_to_size_iter.erase(end_it); + } + // Merge when the freed block exists at the end of another free block + auto start_it = free_start_offset_to_size_iter.find(freed_offset); + if (start_it != free_start_offset_to_size_iter.end()) { + auto size_to_start_offset_iter = start_it->second; + freed_size += size_to_start_offset_iter->first; + freed_offset -= size_to_start_offset_iter->first; + free_size_to_offset.erase(size_to_start_offset_iter); + free_start_offset_to_size_iter.erase(start_it); + } + allocated_offset_to_size.erase(freed_offset); + auto freed_block_it = + free_size_to_offset.emplace(freed_size, freed_offset).first; + free_start_offset_to_size_iter.emplace(freed_offset, freed_block_it); + free_end_offset_to_size_iter.emplace( + freed_offset + freed_size, freed_block_it); + } + } + TORCH_CHECK(validate_allocation_plan(allocation_sizes, allocation_offsets), + "Allocation plan invalid."); + return allocation_offsets; +} + +} // namespace + +void AllocationPlan::clear() { + allocation_sizes.clear(); + allocation_lifetimes.clear(); + allocation_offsets.clear(); +} + +void AllocationPlanner::record_allocation( + const uint64_t size, const void* ptr) { + if (validation_mode_) { + validation_success = validation_success && validate_allocation(size, ptr); + return; + } + allocation_plan_->allocation_sizes.push_back(size); + allocation_plan_->allocation_lifetimes.push_back( + std::numeric_limits::max()); + allocation_ptr_to_id_.emplace(ptr, allocation_id_); + allocation_id_++; +} + +void AllocationPlanner::record_free(const void* ptr) { + if (validation_mode_) { + validation_success = validation_success && validate_free(ptr); + return; + } + auto it = allocation_ptr_to_id_.find(ptr); + if (it == allocation_ptr_to_id_.end()) { + // Free being recorded was allocated outside of WithProfileAllocationGuard + return; + } + auto id = it->second; + TORCH_CHECK(id <
allocation_plan_->allocation_lifetimes.size(), + "Allocation must have been recorded during record_allocation."); + allocation_plan_->allocation_lifetimes[id] = allocation_id_; +} + +bool AllocationPlanner::validate_allocation( + const uint64_t size, const void* ptr) { + if (allocation_id_ >= allocation_plan_->allocation_sizes.size() || + allocation_plan_->allocation_sizes[allocation_id_] != size) { + TORCH_WARN( + "Allocation request does not match plan:", + "Allocation id:", + allocation_id_, + ", Number of recorded allocations:", + allocation_plan_->allocation_sizes.size(), + ", Recorded size of the requested allocation:", + allocation_plan_->allocation_sizes[allocation_id_], + ", but got:", + size); + + return false; + } + allocation_ptr_to_id_.emplace(ptr, allocation_id_); + allocation_id_++; + return true; +} + +bool AllocationPlanner::validate_free(const void* ptr) { + auto it = allocation_ptr_to_id_.find(ptr); + if (it == allocation_ptr_to_id_.end()) { + // Allocation that was made outside the validation scope is being freed here + return true; + } + auto id = (*it).second; + TORCH_CHECK(id < allocation_plan_->allocation_lifetimes.size(), + "Allocation must have been recorded during validate_allocation."); + auto lifetime_id = allocation_plan_->allocation_lifetimes[id]; + return (lifetime_id == allocation_id_); +} + +void AllocationPlanner::formulate_plan() { + allocation_plan_->allocation_offsets = + formulate_greedy_allocation_plan( + allocation_plan_->allocation_sizes, allocation_plan_->allocation_lifetimes); + allocation_plan_->total_size = 0; + for (auto i = 0; i < allocation_plan_->allocation_sizes.size(); ++i) { + if (allocation_plan_->allocation_lifetimes[i] == + std::numeric_limits::max()) { + continue; + } + auto limit = allocation_plan_->allocation_offsets[i] + allocation_plan_->allocation_sizes[i]; + allocation_plan_->total_size = std::max(allocation_plan_->total_size, limit); + } +} + +void AllocationPlanner::clear() { + allocation_plan_->clear(); + allocation_ptr_to_id_.clear(); +} + +void CPUProfilingAllocator::set_plan(const AllocationPlan* plan) { + TORCH_CHECK(plan != nullptr, "Allocation plan is nullptr."); + plan_ = plan; + allocation_id_ = 0; + allocation_ptr_to_id_.clear(); + if (current_size_ < plan->total_size) { + // Free existing memory and reallocate for larger size. + c10::free_cpu(blob_); + blob_ = c10::alloc_cpu(plan->total_size); + current_size_ = plan->total_size; + } +} + +void CPUProfilingAllocator::unset_plan() { + allocation_id_ = 0; + allocation_ptr_to_id_.clear(); + plan_ = nullptr; +} + +void* CPUProfilingAllocator::allocate(const size_t bytes) { + TORCH_CHECK(bytes == plan_->allocation_sizes[allocation_id_], + "Got allocation request that does not match with the plan."); + if (plan_->allocation_lifetimes[allocation_id_] == + std::numeric_limits::max()) { + // This allocation is not managed by ProfilingAllocator. + allocation_id_++; + return c10::alloc_cpu(bytes); + } + void* ptr = + reinterpret_cast(blob_) + + plan_->allocation_offsets[allocation_id_]; + TORCH_CHECK(allocation_ptr_to_id_.emplace(ptr, allocation_id_).second); + allocation_id_++; + return ptr; +} + +void CPUProfilingAllocator::free(void* const ptr) { + auto it = allocation_ptr_to_id_.find(ptr); + if (it == allocation_ptr_to_id_.end()) { + // Either + // 1. Allocation that was made outside the validation scope is being freed here + // or + // 2. Allocation that is not managed by profiling allocator is being freed. + // Example of the second type + // Tensor out; + // for (....) 
{ + // { + // CPUProfilingAllocator + // out = ...some op (This also frees previous memory held by out) + // } + // out is used.. + // } + c10::free_cpu(ptr); + return; + } + auto id = it->second; + TORCH_CHECK(id < plan_->allocation_lifetimes.size(), + "Freeing allocation that is not according to the plan."); + auto lifetime_id = plan_->allocation_lifetimes[id]; + TORCH_CHECK( + lifetime_id == allocation_id_, + "Lifetime of allocation does not match: allocation_id ", + id, + ", expected:", + lifetime_id, + ", got:", + allocation_id_); +} + +CPUProfilingAllocator::~CPUProfilingAllocator() { + c10::free_cpu(blob_); +} + +WithProfileAllocationsGuard::WithProfileAllocationsGuard( + AllocationPlan* plan) { + // Nesting of allocation profiling does not seem meaningful. + TORCH_CHECK(allocation_planner == nullptr, + "Nesting profiling allocations is not supported."); + planner_ = std::make_unique(plan); + planner_->clear(); + allocation_planner = planner_.get(); +} + +WithProfileAllocationsGuard::~WithProfileAllocationsGuard() { + planner_->formulate_plan(); + allocation_planner = nullptr; +} + +WithValidateAllocationPlanGuard::WithValidateAllocationPlanGuard( + AllocationPlan* plan, bool* success) { + // Nesting of allocation profiling does not seem meaningful. + TORCH_CHECK(allocation_planner == nullptr, + "Nesting profiling allocations is not supported."); + planner_ = std::make_unique(plan, true); + success_ = success; + allocation_planner = planner_.get(); +} + +WithValidateAllocationPlanGuard::~WithValidateAllocationPlanGuard() { + *success_ = planner_->validation_success; + allocation_planner = nullptr; +} + +AllocationPlanner* GetThreadLocalAllocationPlanner() { + return allocation_planner; +} + +WithProfilingAllocatorGuard::WithProfilingAllocatorGuard( + CPUProfilingAllocator* allocator, const AllocationPlan* plan) { + // Nesting of profiling allocator is not supported. + TORCH_CHECK(profiling_allocator == nullptr, + "Nesting profiling allocators is not supported."); + profiling_allocator = allocator; + profiling_allocator->set_plan(plan); +} + +WithProfilingAllocatorGuard::~WithProfilingAllocatorGuard() { + profiling_allocator->unset_plan(); + profiling_allocator = nullptr; +} + +CPUProfilingAllocator* GetThreadLocalProfilingAllocator() { + return profiling_allocator; +} + +} // namespace c10 diff --git a/c10/mobile/CPUProfilingAllocator.h b/c10/mobile/CPUProfilingAllocator.h new file mode 100644 index 000000000000..4a7e79fe2857 --- /dev/null +++ b/c10/mobile/CPUProfilingAllocator.h @@ -0,0 +1,149 @@ +#pragma once + +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace c10 { + +/* + * Given a sequence of allocations in a thread, AllocationPlan records + * 1. size of each allocation + * 2. Lifetime of each allocation. + * 3. allocation offsets: Memory offset for each allocation in a single blob of memory + * 4. Total size of a blob of memory required to satisfy all the allocations. + */ +class C10_API AllocationPlan { + private: + // Records size of each allocation by their sequential allocation ids. + std::vector allocation_sizes; + // This maps one allocation id (X) to another allocation id (Y). + // Allocation X is alive until allocation Y. From allocation Y onwards + // allocation X is not referenced. + // Thus Y is the id of the first allocation after X is freed.
+ // NB: When an allocation is recorded, along with recording its size, + // we also set the lifetime to be numeric_limits::max() + // This is to track allocations that are made during the scope of + // profiling but were not freed until after the scope ended. + // Such allocations are not managed by profiling allocator. + std::vector allocation_lifetimes; + // Maps an allocation to some offset in a blob of memory. + std::vector allocation_offsets; + uint64_t total_size{0}; + void clear(); + friend class AllocationPlanner; + friend class CPUProfilingAllocator; +}; + +/* + * Map of memory ptr to allocation id. This is auxiliary information only + * used to establish lifetime of allocations. + */ +class C10_API AllocationPlanner { + private: + AllocationPlan* allocation_plan_{nullptr}; + // Maps allocated ptr to its allocation id. + // This is used when freeing the memory to lookup the allocation id + // in order to establish the lifetime of a particular allocation. + ska::flat_hash_map allocation_ptr_to_id_; + uint64_t allocation_id_{0}; + bool validation_mode_{false}; + + bool validate_allocation(const uint64_t size, const void* ptr); + bool validate_free(const void* ptr); + public: + bool validation_success{true}; + + AllocationPlanner() = delete; + AllocationPlanner(AllocationPlan* plan, bool validate = false) : + allocation_plan_(plan), validation_mode_(validate) {} + void record_allocation(const uint64_t size, const void* ptr); + void record_free(const void* ptr); + void formulate_plan(); + void clear(); +}; + +// NOT THREAD SAFE profiling allocator. +class C10_API CPUProfilingAllocator { + private: + const AllocationPlan* plan_{nullptr}; + uint64_t allocation_id_{0}; + uint64_t current_size_{0}; + void* blob_{nullptr}; + ska::flat_hash_map allocation_ptr_to_id_; + public: + ~CPUProfilingAllocator(); + void set_plan(const AllocationPlan* plan); + void unset_plan(); + void* allocate(const size_t bytes); + void free(void* const ptr); +}; + +/* + * Usage: Profile allocations made by one run of the model. + * AllocationPlan plan; + * { + * WithProfileAllocationGuard profile_guard(&plan); + * module.forward(...); + * } + * plan now contains allocation plan. + */ +class C10_API WithProfileAllocationsGuard { + public: + WithProfileAllocationsGuard(AllocationPlan* plan); + ~WithProfileAllocationsGuard(); + private: + std::unique_ptr planner_; +}; + +/* + * Usage: Validate allocation plan made with WithProfileAllocationGuard + * bool plan_validation_success, success = true; + * for (some number of representative inputs) + * { + * WithValidateAllocationPlanGuard(&plan, &plan_validation_success); + * module.forward(...); + * success = success && plan_validation_success; + * } + * success == true means allocations are according to plan + * else for some inputs allocation pattern changed. + */ +class C10_API WithValidateAllocationPlanGuard { + public: + WithValidateAllocationPlanGuard(AllocationPlan* plan, bool* success); + ~WithValidateAllocationPlanGuard(); + private: + std::unique_ptr planner_; + bool* success_; +}; + +AllocationPlanner* GetThreadLocalAllocationPlanner(); + +/* + * Usage: Allocate tensors accordingly to allocation plan + * First make allocation plan. + * See WithProfileAllocationsGuard usage. + * Second validate allocation plan. + * See WithValidateAllocationPlanGuard usage. 
+ * CPUProfilingAllocator profiling_allocator; + * { + * WithProfilingAllocatorGuard allocator_guard(&profiling_allocator, &plan); + * module.forward(...); + * } + */ +class C10_API WithProfilingAllocatorGuard { + public: + WithProfilingAllocatorGuard( + CPUProfilingAllocator* allocator, const AllocationPlan* plan); + ~WithProfilingAllocatorGuard(); +}; + +CPUProfilingAllocator* GetThreadLocalProfilingAllocator(); + +} // namespace c10 diff --git a/c10/util/BFloat16-inl.h b/c10/util/BFloat16-inl.h index da6ce3859552..57e2a69b86fb 100644 --- a/c10/util/BFloat16-inl.h +++ b/c10/util/BFloat16-inl.h @@ -7,15 +7,44 @@ namespace c10 { /// Constructors inline C10_HOST_DEVICE BFloat16::BFloat16(float value) { +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + x = __bfloat16_as_ushort(__float2bfloat16(value)); +#else // RNE by default x = detail::round_to_nearest_even(value); +#endif } /// Implicit conversions inline C10_HOST_DEVICE BFloat16::operator float() const { +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 + return __bfloat162float(*reinterpret_cast(&x)); +#else return detail::f32_from_bits(x); +#endif } +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 +inline C10_HOST_DEVICE BFloat16::BFloat16(const __nv_bfloat16& value) { + x = *reinterpret_cast(&value); +} +inline C10_HOST_DEVICE BFloat16::operator __nv_bfloat16() const { + return *reinterpret_cast(&x); +} +#endif + +// CUDA intrinsics + +#if defined(__CUDACC__) || defined(__HIPCC__) +inline C10_DEVICE BFloat16 __ldg(const BFloat16* ptr) { +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + return __ldg(reinterpret_cast(ptr)); +#else + return *ptr; +#endif +} +#endif + /// Arithmetic inline C10_HOST_DEVICE BFloat16 operator+(const BFloat16& a, const BFloat16& b) { diff --git a/c10/util/BFloat16.h b/c10/util/BFloat16.h index 375b1086e073..0bd115d568f6 100644 --- a/c10/util/BFloat16.h +++ b/c10/util/BFloat16.h @@ -7,6 +7,10 @@ #include #include +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 +#include +#endif + namespace c10 { namespace detail { @@ -84,6 +88,11 @@ struct alignas(2) BFloat16 { constexpr C10_HOST_DEVICE BFloat16(unsigned short bits, from_bits_t) : x(bits){}; inline C10_HOST_DEVICE BFloat16(float value); inline C10_HOST_DEVICE operator float() const; + +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 + inline C10_HOST_DEVICE BFloat16(const __nv_bfloat16& value); + explicit inline C10_HOST_DEVICE operator __nv_bfloat16() const; +#endif }; } // namespace c10 diff --git a/c10/util/ThreadLocalDebugInfo.cpp b/c10/util/ThreadLocalDebugInfo.cpp index a9cdc26b5934..20d473667a8d 100644 --- a/c10/util/ThreadLocalDebugInfo.cpp +++ b/c10/util/ThreadLocalDebugInfo.cpp @@ -51,6 +51,15 @@ std::shared_ptr ThreadLocalDebugInfo::_pop(DebugInfoKind kind) { return res->info_; } +/* static */ +std::shared_ptr ThreadLocalDebugInfo::_peek(DebugInfoKind kind) { + TORCH_CHECK( + debug_info && debug_info->kind_ == kind, + "Expected debug info of type ", + (size_t)kind); + return debug_info->info_; +} + DebugInfoGuard::DebugInfoGuard( DebugInfoKind kind, std::shared_ptr info) { diff --git a/c10/util/ThreadLocalDebugInfo.h b/c10/util/ThreadLocalDebugInfo.h index 207abed781b0..9620cfb9fdea 100644 --- a/c10/util/ThreadLocalDebugInfo.h +++ b/c10/util/ThreadLocalDebugInfo.h @@ -46,6 +46,9 @@ class C10_API ThreadLocalDebugInfo { // Pop debug info, throws in case the last pushed // debug info is not of a given kind static std::shared_ptr 
_pop(DebugInfoKind kind); + // Peek debug info, throws in case the last pushed debug info is not of the + // given kind + static std::shared_ptr _peek(DebugInfoKind kind); private: std::shared_ptr info_; diff --git a/c10/util/complex.h b/c10/util/complex.h index 53ec4f30e539..9c63a2b296fb 100644 --- a/c10/util/complex.h +++ b/c10/util/complex.h @@ -257,6 +257,11 @@ struct alignas(sizeof(T) * 2) complex { } #endif + // consistent with NumPy behavior + explicit constexpr operator bool() const { + return real() || imag(); + } + constexpr T real() const { return real_; } diff --git a/c10/util/quint4x2.h b/c10/util/quint4x2.h new file mode 100644 index 000000000000..c2502b561409 --- /dev/null +++ b/c10/util/quint4x2.h @@ -0,0 +1,18 @@ +#pragma once +#include + +#include + +namespace c10 { + +/** + * quint4x2 is for un-signed 4 bit quantized Tensors that are packed to byte boundary. + */ +struct alignas(1) quint4x2 { + using underlying = uint8_t; + uint8_t val_; + quint4x2() = default; + C10_HOST_DEVICE explicit quint4x2(uint8_t val) : val_(val) {} +}; + +} // namespace c10 diff --git a/c10/util/typeid.cpp b/c10/util/typeid.cpp index e97eaa843979..e2070a1584a2 100644 --- a/c10/util/typeid.cpp +++ b/c10/util/typeid.cpp @@ -88,5 +88,6 @@ CAFFE_KNOWN_TYPE(c10::qint8) CAFFE_KNOWN_TYPE(c10::quint8) CAFFE_KNOWN_TYPE(c10::qint32) CAFFE_KNOWN_TYPE(at::BFloat16) +CAFFE_KNOWN_TYPE(c10::quint4x2) } // namespace caffe2 diff --git a/c10/util/typeid.h b/c10/util/typeid.h index 62a0bdfc6644..51833fb545ad 100644 --- a/c10/util/typeid.h +++ b/c10/util/typeid.h @@ -29,6 +29,7 @@ #include #include #include +#include #include #include diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 65f072b6f29d..318e46a44f54 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -6,7 +6,7 @@ if(USE_VULKAN) include(../cmake/VulkanCodegen.cmake) endif() -# ---[ MSVC OpenMP modification +# ---[ MSVC OpenMP modification if(MSVC) include(../cmake/public/utils.cmake) endif() @@ -111,7 +111,7 @@ endif() add_subdirectory(core) add_subdirectory(serialize) add_subdirectory(utils) -if(BUILD_CAFFE2) +if(BUILD_CAFFE2 OR (NOT USE_FBGEMM)) add_subdirectory(perfkernels) endif() @@ -291,26 +291,29 @@ endif() if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) if(USE_DISTRIBUTED) - add_library(process_group_agent "${TORCH_SRC_DIR}/csrc/distributed/rpc/process_group_agent.cpp" "${TORCH_SRC_DIR}/csrc/distributed/rpc/process_group_agent.h") - target_link_libraries(process_group_agent PRIVATE torch c10d fmt::fmt-header-only) - add_dependencies(process_group_agent torch c10d) # Define this target even if we're building without TensorPipe, to make life # easier to other targets that depend on this. However, in that case, by not # setting the USE_TENSORPIPE compile definition, this target will just end # up being empty. Downstream targets should also add a #ifdef guard. 
- add_library(tensorpipe_agent - "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_agent.cpp" - "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_agent.h" - "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_utils.cpp" - "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_utils.h" - ) - target_link_libraries(tensorpipe_agent PRIVATE torch c10d tensorpipe fmt::fmt-header-only) - add_dependencies(tensorpipe_agent torch c10d) - if(USE_TENSORPIPE) - target_compile_definitions(tensorpipe_agent PUBLIC USE_TENSORPIPE) - target_link_libraries(tensorpipe_agent PRIVATE tensorpipe) - add_dependencies(tensorpipe_agent tensorpipe) + if(NOT WIN32) + add_library(process_group_agent "${TORCH_SRC_DIR}/csrc/distributed/rpc/process_group_agent.cpp" "${TORCH_SRC_DIR}/csrc/distributed/rpc/process_group_agent.h") + target_link_libraries(process_group_agent PRIVATE torch c10d fmt::fmt-header-only) + add_dependencies(process_group_agent torch c10d) + + add_library(tensorpipe_agent + "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_agent.cpp" + "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_agent.h" + "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_utils.cpp" + "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_utils.h" + ) + target_link_libraries(tensorpipe_agent PRIVATE torch c10d tensorpipe fmt::fmt-header-only) + add_dependencies(tensorpipe_agent torch c10d) + if(USE_TENSORPIPE) + target_compile_definitions(tensorpipe_agent PUBLIC USE_TENSORPIPE) + target_link_libraries(tensorpipe_agent PRIVATE tensorpipe) + add_dependencies(tensorpipe_agent tensorpipe) + endif() endif() endif() @@ -493,7 +496,7 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) PROPERTIES COMPILE_FLAGS "-DC10_DISABLE_LEGACY_IMPORT" ) endif() - if(USE_DISTRIBUTED) + if(USE_DISTRIBUTED AND NOT WIN32) append_filelist("libtorch_distributed_sources" TORCH_SRCS) endif() endif() @@ -506,6 +509,7 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) ${TORCH_SRC_DIR}/csrc/cuda/comm.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/arith.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/compute_at.cpp + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/codegen.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/dispatch.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/expr_evaluator.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/executor.cpp @@ -515,6 +519,7 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/fusion.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/graph_fuser.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/index_compute.cpp + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/instrumentation.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/ir_base_nodes.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/ir_cloner.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/ir_graphviz.cpp @@ -524,7 +529,9 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/kernel.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/kernel_cache.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/kernel_ir.cpp + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/kernel_ir_builder.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/lower_index.cpp + ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/lower_insert_syncs.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/lower_loops.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/lower_thread_predicate.cpp ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/lower_unroll.cpp @@ -834,10 +841,10 @@ endif() DESTINATION ${TORCH_INSTALL_INCLUDE_DIR}/torch) - if(BUILD_TEST AND NOT USE_ROCM) + if(BUILD_TEST) add_subdirectory(${TORCH_ROOT}/test/cpp/jit ${CMAKE_BINARY_DIR}/test_jit) 
add_subdirectory(${TORCH_ROOT}/test/cpp/tensorexpr ${CMAKE_BINARY_DIR}/test_tensorexpr) - if(USE_DISTRIBUTED) + if(USE_DISTRIBUTED AND NOT WIN32) add_subdirectory(${TORCH_ROOT}/test/cpp/rpc ${CMAKE_BINARY_DIR}/test_cpp_rpc) endif() endif() @@ -889,9 +896,7 @@ endif() DESTINATION share/cmake/Torch) if(USE_DISTRIBUTED) - if(NOT MSVC) - add_subdirectory(${TORCH_SRC_DIR}/lib/c10d lib_c10d) - endif() + add_subdirectory(${TORCH_SRC_DIR}/lib/c10d lib_c10d) endif() @@ -966,6 +971,14 @@ if(USE_DISTRIBUTED) target_compile_definitions(torch_cpu PRIVATE USE_DISTRIBUTED ) + # Pass USE_RPC in order to reduce use of + # #if defined(USE_DISTRIBUTED) && !defined(_WIN32) + # need to be removed when RPC is supported + if(NOT WIN32) + target_compile_definitions(torch_cpu PRIVATE + USE_RPC + ) + endif() # Pass USE_TENSORPIPE to torch_cpu as some parts of rpc/utils.cpp # can only be compiled with USE_TENSORPIPE is set. if(USE_TENSORPIPE) @@ -1243,7 +1256,9 @@ endif() if(BUILD_STATIC_RUNTIME_BENCHMARK) add_subdirectory(${TORCH_ROOT}/benchmarks/static_runtime ${PROJECT_BINARY_DIR}/bin) add_executable(static_runtime_bench "${STATIC_RUNTIME_BENCHMARK_SRCS}") + add_executable(static_runtime_test "${STATIC_RUNTIME_TEST_SRCS}") target_link_libraries(static_runtime_bench torch_library benchmark) + target_link_libraries(static_runtime_test torch_library gtest_main) endif() if(BUILD_MOBILE_BENCHMARK) @@ -1276,8 +1291,8 @@ if(BUILD_TEST) foreach(test_src ${ATen_VEC256_TEST_SRCS}) foreach(i RANGE ${NUM_CPU_CAPABILITY_NAMES}) get_filename_component(test_name ${test_src} NAME_WE) - list(GET CPU_CAPABILITY_NAMES ${i} CPU_CAPABILITY) - list(GET CPU_CAPABILITY_FLAGS ${i} FLAGS) + list(GET CPU_CAPABILITY_NAMES ${i} CPU_CAPABILITY) + list(GET CPU_CAPABILITY_FLAGS ${i} FLAGS) separate_arguments(FLAGS UNIX_COMMAND "${FLAGS}") add_executable(${test_name}_${CPU_CAPABILITY} "${test_src}") target_link_libraries(${test_name}_${CPU_CAPABILITY} torch_library gtest_main) @@ -1287,7 +1302,7 @@ if(BUILD_TEST) target_compile_definitions(${test_name}_${CPU_CAPABILITY} PRIVATE CPU_CAPABILITY=${CPU_CAPABILITY} CPU_CAPABILITY_${CPU_CAPABILITY}) target_compile_options(${test_name}_${CPU_CAPABILITY} PRIVATE ${FLAGS}) if(NOT MSVC) - target_compile_options(${test_name}_${CPU_CAPABILITY} PRIVATE -Wno-ignored-qualifiers) + target_compile_options(${test_name}_${CPU_CAPABILITY} PRIVATE -Wno-ignored-qualifiers) endif(NOT MSVC) add_test(NAME ${test_name}_${CPU_CAPABILITY} COMMAND $) endforeach() diff --git a/caffe2/contrib/aten/aten_test.py b/caffe2/contrib/aten/aten_test.py index 92448fe355de..d9d99a1c1ae9 100644 --- a/caffe2/contrib/aten/aten_test.py +++ b/caffe2/contrib/aten/aten_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, dyndep from hypothesis import given diff --git a/caffe2/contrib/fakelowp/test/test_batchmatmul_nnpi_fp16.py b/caffe2/contrib/fakelowp/test/test_batchmatmul_nnpi_fp16.py index a8979ca63aa6..94a76fed85f5 100644 --- a/caffe2/contrib/fakelowp/test/test_batchmatmul_nnpi_fp16.py +++ b/caffe2/contrib/fakelowp/test/test_batchmatmul_nnpi_fp16.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import unittest diff --git a/caffe2/contrib/fakelowp/test/test_batchnorm_nnpi_fp16.py 
b/caffe2/contrib/fakelowp/test/test_batchnorm_nnpi_fp16.py index 1a4f57b6aa05..7b1b5f070171 100644 --- a/caffe2/contrib/fakelowp/test/test_batchnorm_nnpi_fp16.py +++ b/caffe2/contrib/fakelowp/test/test_batchnorm_nnpi_fp16.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import unittest diff --git a/caffe2/contrib/fakelowp/test/test_deq_swish_quant_nnpi.py b/caffe2/contrib/fakelowp/test/test_deq_swish_quant_nnpi.py index 511c29884288..b7a9fc810cfc 100644 --- a/caffe2/contrib/fakelowp/test/test_deq_swish_quant_nnpi.py +++ b/caffe2/contrib/fakelowp/test/test_deq_swish_quant_nnpi.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import numpy as np import caffe2.python.fakelowp.init_shared_libs # noqa diff --git a/caffe2/contrib/fakelowp/test/test_fc_nnpi_fp16.py b/caffe2/contrib/fakelowp/test/test_fc_nnpi_fp16.py index bb013a26a609..7a68af63a84b 100644 --- a/caffe2/contrib/fakelowp/test/test_fc_nnpi_fp16.py +++ b/caffe2/contrib/fakelowp/test/test_fc_nnpi_fp16.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import unittest diff --git a/caffe2/contrib/fakelowp/test/test_fusions.py b/caffe2/contrib/fakelowp/test/test_fusions.py index 22e78b0756c0..45757badba43 100644 --- a/caffe2/contrib/fakelowp/test/test_fusions.py +++ b/caffe2/contrib/fakelowp/test/test_fusions.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + # Must happen before importing caffe2.python.* import caffe2.python.fakelowp.init_shared_libs # noqa diff --git a/caffe2/contrib/fakelowp/test/test_int8_ops_nnpi.py b/caffe2/contrib/fakelowp/test/test_int8_ops_nnpi.py index 4c82917f042c..5a91a00706ff 100644 --- a/caffe2/contrib/fakelowp/test/test_int8_ops_nnpi.py +++ b/caffe2/contrib/fakelowp/test/test_int8_ops_nnpi.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import caffe2.python.fakelowp.init_shared_libs # noqa import numpy as np diff --git a/caffe2/contrib/fakelowp/test/test_int8_quant.py b/caffe2/contrib/fakelowp/test/test_int8_quant.py index 83d0cc176def..02095286e1ee 100644 --- a/caffe2/contrib/fakelowp/test/test_int8_quant.py +++ b/caffe2/contrib/fakelowp/test/test_int8_quant.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + # Must happen before importing caffe2.python.* import caffe2.python.fakelowp.init_shared_libs # noqa diff --git a/caffe2/contrib/fakelowp/test/test_layernorm_nnpi_fp16.py b/caffe2/contrib/fakelowp/test/test_layernorm_nnpi_fp16.py index 698b839f3785..9ff0986116b6 100644 --- a/caffe2/contrib/fakelowp/test/test_layernorm_nnpi_fp16.py +++ b/caffe2/contrib/fakelowp/test/test_layernorm_nnpi_fp16.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import caffe2.python.fakelowp.init_shared_libs # noqa diff --git a/caffe2/contrib/fakelowp/test/test_op_nnpi_fp16.py b/caffe2/contrib/fakelowp/test/test_op_nnpi_fp16.py index 58161409fa80..e8512b4dcd74 100644 --- a/caffe2/contrib/fakelowp/test/test_op_nnpi_fp16.py +++ 
b/caffe2/contrib/fakelowp/test/test_op_nnpi_fp16.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/contrib/fakelowp/test/test_sls_4bit_nnpi_fp16.py b/caffe2/contrib/fakelowp/test/test_sls_4bit_nnpi_fp16.py index 0ca76bd86ba9..a8d6640fa58e 100644 --- a/caffe2/contrib/fakelowp/test/test_sls_4bit_nnpi_fp16.py +++ b/caffe2/contrib/fakelowp/test/test_sls_4bit_nnpi_fp16.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import unittest diff --git a/caffe2/contrib/fakelowp/test/test_sls_8bit_nnpi_fp16.py b/caffe2/contrib/fakelowp/test/test_sls_8bit_nnpi_fp16.py index da7eae2708f3..f8fd03cbfb73 100644 --- a/caffe2/contrib/fakelowp/test/test_sls_8bit_nnpi_fp16.py +++ b/caffe2/contrib/fakelowp/test/test_sls_8bit_nnpi_fp16.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import unittest diff --git a/caffe2/contrib/fakelowp/test/test_sls_8bit_nnpi_fp32.py b/caffe2/contrib/fakelowp/test/test_sls_8bit_nnpi_fp32.py index ad26952a901c..207403f1bd0d 100644 --- a/caffe2/contrib/fakelowp/test/test_sls_8bit_nnpi_fp32.py +++ b/caffe2/contrib/fakelowp/test/test_sls_8bit_nnpi_fp32.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import unittest diff --git a/caffe2/contrib/gloo/gloo_test.py b/caffe2/contrib/gloo/gloo_test.py index 8eaff9e137ae..fbca9b8fe64c 100644 --- a/caffe2/contrib/gloo/gloo_test.py +++ b/caffe2/contrib/gloo/gloo_test.py @@ -1,9 +1,9 @@ #!/usr/bin/env python -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from hypothesis import given, settings import hypothesis.strategies as st diff --git a/caffe2/contrib/nccl/nccl_ops_test.py b/caffe2/contrib/nccl/nccl_ops_test.py index 3f4685548281..2d4e9b518b9b 100644 --- a/caffe2/contrib/nccl/nccl_ops_test.py +++ b/caffe2/contrib/nccl/nccl_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/contrib/nnpack/nnpack_ops_test.py b/caffe2/contrib/nnpack/nnpack_ops_test.py index b12acd151a71..4bedf0e0ecd6 100644 --- a/caffe2/contrib/nnpack/nnpack_ops_test.py +++ b/caffe2/contrib/nnpack/nnpack_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/contrib/playground/AnyExp.py b/caffe2/contrib/playground/AnyExp.py index 5d968b0455fc..b8e2f8b37b2a 100644 --- a/caffe2/contrib/playground/AnyExp.py +++ b/caffe2/contrib/playground/AnyExp.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from abc import abstractmethod diff --git a/caffe2/contrib/playground/AnyExpOnTerm.py b/caffe2/contrib/playground/AnyExpOnTerm.py index b269777da675..dcfe61f14545 100644 --- a/caffe2/contrib/playground/AnyExpOnTerm.py +++ 
b/caffe2/contrib/playground/AnyExpOnTerm.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import argparse import json diff --git a/caffe2/contrib/playground/ModuleRegister.py b/caffe2/contrib/playground/ModuleRegister.py index 89a9deb8989e..27e0c07f6384 100644 --- a/caffe2/contrib/playground/ModuleRegister.py +++ b/caffe2/contrib/playground/ModuleRegister.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import inspect import logging diff --git a/caffe2/contrib/playground/checkpoint.py b/caffe2/contrib/playground/checkpoint.py index 9887a408cc01..5ea3d2a9035c 100644 --- a/caffe2/contrib/playground/checkpoint.py +++ b/caffe2/contrib/playground/checkpoint.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import pickle diff --git a/caffe2/contrib/playground/compute_loss.py b/caffe2/contrib/playground/compute_loss.py index 53eb77d77701..2965ff3895ac 100644 --- a/caffe2/contrib/playground/compute_loss.py +++ b/caffe2/contrib/playground/compute_loss.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import caffe2.contrib.playground.meter as Meter from caffe2.python import workspace diff --git a/caffe2/contrib/playground/compute_topk_accuracy.py b/caffe2/contrib/playground/compute_topk_accuracy.py index 396b797ed1b6..e2f148231c6d 100644 --- a/caffe2/contrib/playground/compute_topk_accuracy.py +++ b/caffe2/contrib/playground/compute_topk_accuracy.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import caffe2.contrib.playground.meter as Meter from caffe2.python import workspace diff --git a/caffe2/contrib/playground/meter.py b/caffe2/contrib/playground/meter.py index 7e109e445d04..ed0158bbf087 100644 --- a/caffe2/contrib/playground/meter.py +++ b/caffe2/contrib/playground/meter.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from abc import abstractmethod diff --git a/caffe2/contrib/playground/module_map.py b/caffe2/contrib/playground/module_map.py index 0f5de5943a36..8eb1a3a00cdc 100644 --- a/caffe2/contrib/playground/module_map.py +++ b/caffe2/contrib/playground/module_map.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + # Input import caffe2.contrib.playground.resnetdemo.\ diff --git a/caffe2/contrib/playground/output_generator.py b/caffe2/contrib/playground/output_generator.py index 41d8e3fdfae4..aaa977c08faa 100644 --- a/caffe2/contrib/playground/output_generator.py +++ b/caffe2/contrib/playground/output_generator.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import timeout_guard diff --git 
a/caffe2/contrib/playground/resnetdemo/IN1k_resnet.py b/caffe2/contrib/playground/resnetdemo/IN1k_resnet.py index 52ce95ed5dab..58085dbc3721 100644 --- a/caffe2/contrib/playground/resnetdemo/IN1k_resnet.py +++ b/caffe2/contrib/playground/resnetdemo/IN1k_resnet.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/contrib/playground/resnetdemo/IN1k_resnet_no_test_model.py b/caffe2/contrib/playground/resnetdemo/IN1k_resnet_no_test_model.py index cf893b598446..480070752e63 100644 --- a/caffe2/contrib/playground/resnetdemo/IN1k_resnet_no_test_model.py +++ b/caffe2/contrib/playground/resnetdemo/IN1k_resnet_no_test_model.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/contrib/playground/resnetdemo/caffe2_resnet50_default_forward.py b/caffe2/contrib/playground/resnetdemo/caffe2_resnet50_default_forward.py index 174ffe1e034a..fa0fedd84a8c 100644 --- a/caffe2/contrib/playground/resnetdemo/caffe2_resnet50_default_forward.py +++ b/caffe2/contrib/playground/resnetdemo/caffe2_resnet50_default_forward.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import caffe2.python.models.resnet as resnet diff --git a/caffe2/contrib/playground/resnetdemo/caffe2_resnet50_default_param_update.py b/caffe2/contrib/playground/resnetdemo/caffe2_resnet50_default_param_update.py index 974653446a22..5697d1301b8a 100644 --- a/caffe2/contrib/playground/resnetdemo/caffe2_resnet50_default_param_update.py +++ b/caffe2/contrib/playground/resnetdemo/caffe2_resnet50_default_param_update.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + def gen_param_update_builder_fun(self, model, dataset, is_train): diff --git a/caffe2/contrib/playground/resnetdemo/explicit_resnet_forward.py b/caffe2/contrib/playground/resnetdemo/explicit_resnet_forward.py index 01b51fa8450c..056ddd8c9ea0 100644 --- a/caffe2/contrib/playground/resnetdemo/explicit_resnet_forward.py +++ b/caffe2/contrib/playground/resnetdemo/explicit_resnet_forward.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import logging logging.basicConfig() diff --git a/caffe2/contrib/playground/resnetdemo/explicit_resnet_param_update.py b/caffe2/contrib/playground/resnetdemo/explicit_resnet_param_update.py index 8a86289778ee..5378acd61886 100644 --- a/caffe2/contrib/playground/resnetdemo/explicit_resnet_param_update.py +++ b/caffe2/contrib/playground/resnetdemo/explicit_resnet_param_update.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import workspace, core from caffe2.proto import caffe2_pb2 diff --git a/caffe2/contrib/playground/resnetdemo/gfs_IN1k.py b/caffe2/contrib/playground/resnetdemo/gfs_IN1k.py index 8b2647114b63..496ac22ffde5 100644 --- a/caffe2/contrib/playground/resnetdemo/gfs_IN1k.py +++ 
b/caffe2/contrib/playground/resnetdemo/gfs_IN1k.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + # # example1 using gfs as input source. diff --git a/caffe2/contrib/playground/resnetdemo/override_no_test_model_no_checkpoint.py b/caffe2/contrib/playground/resnetdemo/override_no_test_model_no_checkpoint.py index 4cc2d68cbfd7..419d6a25e95b 100644 --- a/caffe2/contrib/playground/resnetdemo/override_no_test_model_no_checkpoint.py +++ b/caffe2/contrib/playground/resnetdemo/override_no_test_model_no_checkpoint.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + def checkpoint(self, epoch): self.model_path = None diff --git a/caffe2/contrib/playground/resnetdemo/rendezvous_filestore.py b/caffe2/contrib/playground/resnetdemo/rendezvous_filestore.py index d757896793ff..0a56d68257ee 100644 --- a/caffe2/contrib/playground/resnetdemo/rendezvous_filestore.py +++ b/caffe2/contrib/playground/resnetdemo/rendezvous_filestore.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python import dyndep diff --git a/caffe2/contrib/prof/cuda_profile_ops_test.py b/caffe2/contrib/prof/cuda_profile_ops_test.py index 2953503bbea5..c77b7ae88ba6 100644 --- a/caffe2/contrib/prof/cuda_profile_ops_test.py +++ b/caffe2/contrib/prof/cuda_profile_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest from caffe2.proto import caffe2_pb2 diff --git a/caffe2/contrib/tensorboard/tensorboard.py b/caffe2/contrib/tensorboard/tensorboard.py index 9aece77bc09a..6f5ad1896e35 100644 --- a/caffe2/contrib/tensorboard/tensorboard.py +++ b/caffe2/contrib/tensorboard/tensorboard.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import click import collections diff --git a/caffe2/contrib/tensorboard/tensorboard_exporter.py b/caffe2/contrib/tensorboard/tensorboard_exporter.py index a3c0e0e59723..ef12ce563cde 100644 --- a/caffe2/contrib/tensorboard/tensorboard_exporter.py +++ b/caffe2/contrib/tensorboard/tensorboard_exporter.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from builtins import bytes import copy diff --git a/caffe2/contrib/tensorboard/tensorboard_exporter_test.py b/caffe2/contrib/tensorboard/tensorboard_exporter_test.py index 6b9c894e16fb..31ef8180fb57 100644 --- a/caffe2/contrib/tensorboard/tensorboard_exporter_test.py +++ b/caffe2/contrib/tensorboard/tensorboard_exporter_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest diff --git a/caffe2/contrib/tensorboard/tensorboard_test.py b/caffe2/contrib/tensorboard/tensorboard_test.py index 494cb6fc7d12..8751be14ead5 100644 --- a/caffe2/contrib/tensorboard/tensorboard_test.py +++ 
b/caffe2/contrib/tensorboard/tensorboard_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import click.testing import numpy as np diff --git a/caffe2/contrib/warpctc/ctc_ops_test.py b/caffe2/contrib/warpctc/ctc_ops_test.py index 3b21c8b66747..013e80a98773 100644 --- a/caffe2/contrib/warpctc/ctc_ops_test.py +++ b/caffe2/contrib/warpctc/ctc_ops_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + import numpy as np from caffe2.proto import caffe2_pb2 diff --git a/caffe2/core/nomnigraph/op_gen.py b/caffe2/core/nomnigraph/op_gen.py index 49cd2abb2cef..fbe1c8da377e 100755 --- a/caffe2/core/nomnigraph/op_gen.py +++ b/caffe2/core/nomnigraph/op_gen.py @@ -1,9 +1,9 @@ #!/usr/bin/env python -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import argparse from textwrap import dedent diff --git a/caffe2/core/tensor.h b/caffe2/core/tensor.h index 18a7be64d670..27f8b471b71b 100644 --- a/caffe2/core/tensor.h +++ b/caffe2/core/tensor.h @@ -70,7 +70,7 @@ class CAFFE2_API Tensor final { explicit Tensor(at::Device device) : impl_(c10::make_intrusive( Storage::create_legacy(device), - c10::computeDispatchKey(at::device(device).layout(at::kStrided)), + c10::computeDispatchKey(c10::nullopt, at::kStrided, device), TypeMeta())) {} /** diff --git a/caffe2/distributed/file_store_handler.cc b/caffe2/distributed/file_store_handler.cc index 5a749c304d2b..5a34e53b6947 100644 --- a/caffe2/distributed/file_store_handler.cc +++ b/caffe2/distributed/file_store_handler.cc @@ -122,6 +122,16 @@ int64_t FileStoreHandler::add( return 0; } +int64_t FileStoreHandler::getNumKeys() { + CHECK(false) << "getNumKeys not implemented for FileStoreHandler"; + return 0; +} + +bool FileStoreHandler::deleteKey(const std::string& /* unused */) { + CHECK(false) << "deleteKey not implemented for FileStoreHandler"; + return false; +} + bool FileStoreHandler::check(const std::vector& names) { std::vector paths; for (const auto& name : names) { diff --git a/caffe2/distributed/file_store_handler.h b/caffe2/distributed/file_store_handler.h index b58b156e51b0..9ca81e4c2c7d 100644 --- a/caffe2/distributed/file_store_handler.h +++ b/caffe2/distributed/file_store_handler.h @@ -17,6 +17,10 @@ class CAFFE2_API FileStoreHandler : public StoreHandler { virtual int64_t add(const std::string& name, int64_t value) override; + virtual bool deleteKey(const std::string& key) override; + + virtual int64_t getNumKeys() override; + virtual bool check(const std::vector& names) override; virtual void wait( diff --git a/caffe2/distributed/file_store_handler_op_test.py b/caffe2/distributed/file_store_handler_op_test.py index 2e90c548d50f..427b68420d39 100644 --- a/caffe2/distributed/file_store_handler_op_test.py +++ b/caffe2/distributed/file_store_handler_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import errno import os diff --git a/caffe2/distributed/redis_store_handler.cc b/caffe2/distributed/redis_store_handler.cc index 7caaa6c79de7..e424c0e719fd 100644 --- a/caffe2/distributed/redis_store_handler.cc +++ b/caffe2/distributed/redis_store_handler.cc @@ -76,6 +76,16 @@ int64_t 
RedisStoreHandler::add(const std::string& name, int64_t value) { return reply->integer; } +int64_t RedisStoreHandler::getNumKeys() { + CHECK(false) << "getNumKeys not implemented for RedisStoreHandler"; + return 0; +} + +bool RedisStoreHandler::deleteKey(const std::string& /* unused */) { + CHECK(false) << "deleteKey not implemented for RedisStoreHandler"; + return false; +} + bool RedisStoreHandler::check(const std::vector& names) { std::vector args; args.push_back("EXISTS"); diff --git a/caffe2/distributed/redis_store_handler.h b/caffe2/distributed/redis_store_handler.h index 0caa888a6629..d5fa76741578 100644 --- a/caffe2/distributed/redis_store_handler.h +++ b/caffe2/distributed/redis_store_handler.h @@ -23,6 +23,10 @@ class CAFFE2_API RedisStoreHandler : public StoreHandler { virtual int64_t add(const std::string& name, int64_t value) override; + virtual int64_t getNumKeys() override; + + virtual bool deleteKey(const std::string& key) override; + virtual bool check(const std::vector& names) override; virtual void wait( diff --git a/caffe2/distributed/redis_store_handler_op_test.py b/caffe2/distributed/redis_store_handler_op_test.py index 3df69bf2701a..8f5d58e85185 100644 --- a/caffe2/distributed/redis_store_handler_op_test.py +++ b/caffe2/distributed/redis_store_handler_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import os import uuid diff --git a/caffe2/distributed/store_handler.h b/caffe2/distributed/store_handler.h index e11ea57aea3d..951fe26c6ec6 100644 --- a/caffe2/distributed/store_handler.h +++ b/caffe2/distributed/store_handler.h @@ -41,6 +41,16 @@ class CAFFE2_API StoreHandler { */ virtual int64_t add(const std::string& name, int64_t value) = 0; + /* + * Returns the number of keys in this store. + */ + virtual int64_t getNumKeys() = 0; + + /* + * Removes the specified key from the store. + */ + virtual bool deleteKey(const std::string& key) = 0; + /* * Check if a keys exist in the store. 
*/ diff --git a/caffe2/distributed/store_ops_test_util.py b/caffe2/distributed/store_ops_test_util.py index 2abe697cface..05245be9b210 100644 --- a/caffe2/distributed/store_ops_test_util.py +++ b/caffe2/distributed/store_ops_test_util.py @@ -1,9 +1,9 @@ ## @package store_ops_test_util # Module caffe2.distributed.store_ops_test_util -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from multiprocessing import Process, Queue diff --git a/caffe2/experiments/python/SparseTransformer.py b/caffe2/experiments/python/SparseTransformer.py index ff9ab7715c33..d97f076a7bb3 100644 --- a/caffe2/experiments/python/SparseTransformer.py +++ b/caffe2/experiments/python/SparseTransformer.py @@ -15,10 +15,10 @@ ## @package SparseTransformer # Module caffe2.experiments.python.SparseTransformer -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import workspace import scipy.sparse diff --git a/caffe2/experiments/python/convnet_benchmarks.py b/caffe2/experiments/python/convnet_benchmarks.py index 386c9c4b7ebc..ff9b7a20bc73 100644 --- a/caffe2/experiments/python/convnet_benchmarks.py +++ b/caffe2/experiments/python/convnet_benchmarks.py @@ -15,10 +15,10 @@ ## @package convnet_benchmarks # Module caffe2.experiments.python.convnet_benchmarks -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + """ Benchmark for common convnets. diff --git a/caffe2/experiments/python/device_reduce_sum_bench.py b/caffe2/experiments/python/device_reduce_sum_bench.py index dbe0dae4f0c2..1a795e2fcf0e 100644 --- a/caffe2/experiments/python/device_reduce_sum_bench.py +++ b/caffe2/experiments/python/device_reduce_sum_bench.py @@ -15,10 +15,10 @@ ## @package device_reduce_sum_bench # Module caffe2.experiments.python.device_reduce_sum_bench -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import argparse import itertools diff --git a/caffe2/experiments/python/funhash_op_test.py b/caffe2/experiments/python/funhash_op_test.py index 6a4eb0e6b5b5..3fc4c8bf54fd 100644 --- a/caffe2/experiments/python/funhash_op_test.py +++ b/caffe2/experiments/python/funhash_op_test.py @@ -13,10 +13,10 @@ # limitations under the License. 
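The StoreHandler interface earlier in this section gains two pure virtuals, getNumKeys() and deleteKey(); the file- and Redis-backed handlers shown above satisfy them by failing loudly with CHECK(false). A minimal sketch of what a backend that does support them might look like (StoreHandlerLike and InMemoryStore are illustrative stand-ins under assumed names, not caffe2 classes):

#include <cassert>
#include <cstdint>
#include <string>
#include <unordered_map>

// Reduced interface carrying only the two new virtuals plus a setter.
class StoreHandlerLike {
 public:
  virtual ~StoreHandlerLike() = default;
  virtual void set(const std::string& key, const std::string& value) = 0;
  virtual int64_t getNumKeys() = 0;
  virtual bool deleteKey(const std::string& key) = 0;
};

// Hypothetical in-memory backend that can actually count and delete keys.
class InMemoryStore : public StoreHandlerLike {
 public:
  void set(const std::string& key, const std::string& value) override {
    data_[key] = value;
  }
  int64_t getNumKeys() override {
    return static_cast<int64_t>(data_.size());
  }
  bool deleteKey(const std::string& key) override {
    return data_.erase(key) > 0;  // true only if the key existed
  }

 private:
  std::unordered_map<std::string, std::string> data_;
};

int main() {
  InMemoryStore store;
  store.set("rank0", "addr");
  assert(store.getNumKeys() == 1);
  assert(store.deleteKey("rank0"));
  assert(store.getNumKeys() == 0);
  return 0;
}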
############################################################################## -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np from scipy.sparse import coo_matrix diff --git a/caffe2/experiments/python/net_construct_bench.py b/caffe2/experiments/python/net_construct_bench.py index b7cf605c0c04..ec12517c03be 100644 --- a/caffe2/experiments/python/net_construct_bench.py +++ b/caffe2/experiments/python/net_construct_bench.py @@ -15,10 +15,10 @@ ## @package net_construct_bench # Module caffe2.experiments.python.net_construct_bench -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import argparse import logging diff --git a/caffe2/experiments/python/sparse_funhash_op_test.py b/caffe2/experiments/python/sparse_funhash_op_test.py index 2af006249c7d..cfc7a0bb6165 100644 --- a/caffe2/experiments/python/sparse_funhash_op_test.py +++ b/caffe2/experiments/python/sparse_funhash_op_test.py @@ -13,10 +13,10 @@ # limitations under the License. ############################################################################## -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np from scipy.sparse import coo_matrix diff --git a/caffe2/experiments/python/sparse_reshape_op_test.py b/caffe2/experiments/python/sparse_reshape_op_test.py index 5849580f09e1..a22bf561ce86 100644 --- a/caffe2/experiments/python/sparse_reshape_op_test.py +++ b/caffe2/experiments/python/sparse_reshape_op_test.py @@ -13,10 +13,10 @@ # limitations under the License. ############################################################################## -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np from scipy.sparse import coo_matrix diff --git a/caffe2/experiments/python/tt_contraction_op_test.py b/caffe2/experiments/python/tt_contraction_op_test.py index 4cd04a16ea23..1e41e9ed8ddd 100644 --- a/caffe2/experiments/python/tt_contraction_op_test.py +++ b/caffe2/experiments/python/tt_contraction_op_test.py @@ -13,10 +13,10 @@ # limitations under the License. ############################################################################## -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/experiments/python/tt_pad_op_test.py b/caffe2/experiments/python/tt_pad_op_test.py index 10be7adcb453..27d13543348b 100644 --- a/caffe2/experiments/python/tt_pad_op_test.py +++ b/caffe2/experiments/python/tt_pad_op_test.py @@ -13,10 +13,10 @@ # limitations under the License. 
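The bucketize_op.cu hunk just below replaces the binary-search midpoint (high + low) / 2 with low + (high - low) / 2, which cannot overflow when low and high are both large. A small host-side sketch of the same search over sorted bucket boundaries (illustrative only, not the CUDA kernel):

#include <cassert>
#include <cstdint>
#include <vector>

// Returns the index of the first boundary >= x, i.e. the bucket id.
int32_t bucketize(const std::vector<float>& bounds, float x) {
  int32_t low = -1;
  int32_t high = static_cast<int32_t>(bounds.size());
  while (high - low > 1) {
    const int32_t median = low + (high - low) / 2;  // overflow-safe midpoint
    if (bounds[median] < x) {
      low = median;
    } else {
      high = median;
    }
  }
  return high;
}

int main() {
  const std::vector<float> bounds = {1.f, 3.f, 5.f};
  assert(bucketize(bounds, 0.5f) == 0);   // below the first boundary
  assert(bucketize(bounds, 3.f) == 1);    // lands exactly on bounds[1]
  assert(bucketize(bounds, 10.f) == 3);   // past the last boundary
  return 0;
}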
############################################################################## -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/operators/bucketize_op.cu b/caffe2/operators/bucketize_op.cu index 1d48013e771d..5d3049f239fb 100644 --- a/caffe2/operators/bucketize_op.cu +++ b/caffe2/operators/bucketize_op.cu @@ -15,7 +15,7 @@ __global__ void BucketizeOpKernel( CUDA_1D_KERNEL_LOOP(i, N) { int32_t low = -1, high = M; while (high - low > 1) { - int32_t median = (high + low) / 2; + const int32_t median = low + (high - low) / 2; if (bounds[median] < X[i]) { low = median; } else { diff --git a/caffe2/operators/gather_ranges_to_dense_op.cc b/caffe2/operators/gather_ranges_to_dense_op.cc index 10396aafc97e..aa31ef12b36a 100644 --- a/caffe2/operators/gather_ranges_to_dense_op.cc +++ b/caffe2/operators/gather_ranges_to_dense_op.cc @@ -104,3 +104,11 @@ NO_GRADIENT(GatherRangesToDense); } // namespace } // namespace caffe2 + +using GatherRangesToDenseCPUOp = + caffe2::GatherRangesToDenseOp; + +C10_EXPORT_CAFFE2_OP_TO_C10_CPU( + GatherRangesToDense, + "_caffe2::GatherRangesToDense(Tensor data, Tensor ranges, Tensor? key, int[] lengths, int min_observation, float max_mismatched_ratio, float max_empty_ratio) -> Tensor[] outputs", + GatherRangesToDenseCPUOp); diff --git a/caffe2/operators/gather_ranges_to_dense_op.h b/caffe2/operators/gather_ranges_to_dense_op.h index c1dd5a527005..217a61b25129 100644 --- a/caffe2/operators/gather_ranges_to_dense_op.h +++ b/caffe2/operators/gather_ranges_to_dense_op.h @@ -5,6 +5,7 @@ #include "caffe2/core/common_omp.h" #include "caffe2/core/context.h" +#include "caffe2/core/export_caffe2_op_to_c10.h" #include "caffe2/core/logging.h" #include "caffe2/core/operator.h" #include "caffe2/core/types.h" @@ -15,6 +16,8 @@ #include #include +C10_DECLARE_EXPORT_CAFFE2_OP_TO_C10(GatherRangesToDense); + namespace caffe2 { template class GatherRangesToDenseOp final : public Operator { diff --git a/caffe2/operators/layer_norm_op.h b/caffe2/operators/layer_norm_op.h index e1e8ec0693d6..543ad8dd0b34 100644 --- a/caffe2/operators/layer_norm_op.h +++ b/caffe2/operators/layer_norm_op.h @@ -52,6 +52,11 @@ class LayerNormOp final : public Operator { T* sigma_data = sigma->template mutable_data(); T* scale_data = scale_.template mutable_data(); T* bias_data = bias_.template mutable_data(); + + if (M == 0) { + return true; + } + const std::array X_dims = {M, N}; const std::array Y_dims = {M, 1}; math::Moments( @@ -174,6 +179,16 @@ class LayerNormGradientOp final : public Operator { g_scale_data = g_scale_.template mutable_data(); } + if (M == 0) { + if (N > 0 && dgamma_data != nullptr) { + math::Set(N, T(0), dgamma_data, &context_); + } + if (N > 0 && dbeta_data != nullptr) { + math::Set(N, T(0), dbeta_data, &context_); + } + return true; + } + ComputeInternalGradients( M, N, dY_data, X_data, gamma_data, dX_data, ds_data, db_data); ComputeFusedParams( diff --git a/caffe2/operators/mean_op.h b/caffe2/operators/mean_op.h index f16914f4a894..beb0b0440505 100644 --- a/caffe2/operators/mean_op.h +++ b/caffe2/operators/mean_op.h @@ -65,9 +65,11 @@ class MeanOp final : public Operator { bool RunOnDevice() override { if (Input(0).template IsType()) { return DoRunWithType(); + } else if (Input(0).template IsType()) { + return DoRunWithType(); } else { CAFFE_THROW( - "Mean operator only supports 32-bit float, but", + "Mean operator only supports 32-bit float 
or 64-bit double, but", " input was of type ", Input(0).dtype().name()); } @@ -111,9 +113,11 @@ class MeanGradientOp : public Operator { bool RunOnDevice() override { if (Input(0).template IsType()) { return DoRunWithType(); + } else if (Input(0).template IsType()) { + return DoRunWithType(); } else { CAFFE_THROW( - "Mean operator only supports 32-bit float, but", + "Mean operator only supports 32-bit float or 64-bit double, but", " input was of type ", Input(0).dtype().name()); } diff --git a/caffe2/operators/roi_align_gradient_op.cc b/caffe2/operators/roi_align_gradient_op.cc index 7f3b1155e1b3..6a9b2bab0ec3 100644 --- a/caffe2/operators/roi_align_gradient_op.cc +++ b/caffe2/operators/roi_align_gradient_op.cc @@ -191,7 +191,7 @@ void ROIAlignBackwardFeature( } // namespace template <> -bool RoIAlignGradientOp::RunOnDevice() { +C10_EXPORT bool RoIAlignGradientOp::RunOnDevice() { auto& X = Input(0); // Input data to pool auto& R = Input(1); // RoIs auto& dY = Input(2); // Gradient of net w.r.t. output of "forward" op diff --git a/caffe2/operators/roi_align_gradient_op.cu b/caffe2/operators/roi_align_gradient_op.cu index babf06d759eb..09f56e3269e7 100644 --- a/caffe2/operators/roi_align_gradient_op.cu +++ b/caffe2/operators/roi_align_gradient_op.cu @@ -190,7 +190,7 @@ __global__ void RoIAlignBackwardFeature( } // namespace template <> -bool RoIAlignGradientOp::RunOnDevice() { +C10_EXPORT bool RoIAlignGradientOp::RunOnDevice() { auto& X = Input(0); // Input data to pool auto& R = Input(1); // RoIs auto& dY = Input(2); // Gradient of net w.r.t. output of "forward" op diff --git a/caffe2/operators/roi_align_op.cc b/caffe2/operators/roi_align_op.cc index 997eb1404b2e..55cbb47be81c 100644 --- a/caffe2/operators/roi_align_op.cc +++ b/caffe2/operators/roi_align_op.cc @@ -84,7 +84,7 @@ std::vector> MakeBilinearInterpolationParams( } // namespace template <> -bool RoIAlignOp::RunOnDeviceWithOrderNCHW( +C10_EXPORT bool RoIAlignOp::RunOnDeviceWithOrderNCHW( int64_t N, int64_t C, int64_t H, @@ -170,7 +170,7 @@ bool RoIAlignOp::RunOnDeviceWithOrderNCHW( } template <> -bool RoIAlignOp::RunOnDeviceWithOrderNHWC( +C10_EXPORT bool RoIAlignOp::RunOnDeviceWithOrderNHWC( int64_t N, int64_t C, int64_t H, diff --git a/caffe2/operators/roi_align_op.cu b/caffe2/operators/roi_align_op.cu index 62d7842e2ae3..4d0edd3a408c 100644 --- a/caffe2/operators/roi_align_op.cu +++ b/caffe2/operators/roi_align_op.cu @@ -149,7 +149,7 @@ __global__ void RoIAlignForward( } // namespace template <> -bool RoIAlignOp::RunOnDevice() { +C10_EXPORT bool RoIAlignOp::RunOnDevice() { auto& X = Input(0); // Input data to pool auto& R = Input(1); // RoIs // RoI pooled data diff --git a/caffe2/operators/roi_align_rotated_gradient_op.cu b/caffe2/operators/roi_align_rotated_gradient_op.cu index 1ca0b73c72fa..cc16a828858f 100644 --- a/caffe2/operators/roi_align_rotated_gradient_op.cu +++ b/caffe2/operators/roi_align_rotated_gradient_op.cu @@ -198,7 +198,7 @@ __global__ void RoIAlignRotatedBackward( } // namespace template <> -bool RoIAlignRotatedGradientOp::RunOnDevice() { +C10_EXPORT bool RoIAlignRotatedGradientOp::RunOnDevice() { auto& X = Input(0); // Input data to pool auto& R = Input(1); // RoIs auto& dY = Input(2); // Gradient of net w.r.t. 
output of "forward" op diff --git a/caffe2/operators/roi_align_rotated_op.cc b/caffe2/operators/roi_align_rotated_op.cc index c94d0f11bd1f..73464f1fe6ee 100644 --- a/caffe2/operators/roi_align_rotated_op.cc +++ b/caffe2/operators/roi_align_rotated_op.cc @@ -291,7 +291,7 @@ void ROIAlignRotatedForward( } // namespace template <> -bool RoIAlignRotatedOp::RunOnDevice() { +C10_EXPORT bool RoIAlignRotatedOp::RunOnDevice() { auto& X = Input(0); // Input data to pool auto& R = Input(1); // RoIs diff --git a/caffe2/operators/roi_align_rotated_op.cu b/caffe2/operators/roi_align_rotated_op.cu index 96e4797c597c..67c1d38f51b4 100644 --- a/caffe2/operators/roi_align_rotated_op.cu +++ b/caffe2/operators/roi_align_rotated_op.cu @@ -158,7 +158,7 @@ __global__ void RoIAlignRotatedForward( } // namespace template <> -bool RoIAlignRotatedOp::RunOnDevice() { +C10_EXPORT bool RoIAlignRotatedOp::RunOnDevice() { auto& X = Input(0); // Input data to pool auto& R = Input(1); // RoIs diff --git a/caffe2/operators/roi_pool_op.cc b/caffe2/operators/roi_pool_op.cc index 95a6cbfa386c..d0018b03f4a6 100644 --- a/caffe2/operators/roi_pool_op.cc +++ b/caffe2/operators/roi_pool_op.cc @@ -8,7 +8,7 @@ using std::max; using std::min; template <> -bool RoIPoolOp::RunOnDevice() { +C10_EXPORT bool RoIPoolOp::RunOnDevice() { const auto& X = Input(0); // Input data to pool const auto& R = Input(1); // RoIs auto* Y = Output(0); // RoI pooled data diff --git a/caffe2/operators/roi_pool_op.cu b/caffe2/operators/roi_pool_op.cu index af479f8a5881..7c1ef1316623 100644 --- a/caffe2/operators/roi_pool_op.cu +++ b/caffe2/operators/roi_pool_op.cu @@ -167,7 +167,7 @@ bool RoIPoolOp::RunOnDevice() { } template <> -bool RoIPoolGradientOp::RunOnDevice() { +C10_EXPORT bool RoIPoolGradientOp::RunOnDevice() { auto& X = Input(0); // Input data to pool auto& R = Input(1); // RoIs auto& A = Input(2); // argmaxes diff --git a/caffe2/operators/sparse_to_dense_mask_op.cc b/caffe2/operators/sparse_to_dense_mask_op.cc index d968112c9ecc..b842d09e068d 100644 --- a/caffe2/operators/sparse_to_dense_mask_op.cc +++ b/caffe2/operators/sparse_to_dense_mask_op.cc @@ -45,21 +45,21 @@ Convert sparse representations to dense with given indices. Transforms a sparse representation of map represented as `indices` vector and `values` tensor into a compacted tensor where the first dimension -corresponds to each id provided in mask argument. Missing values are filled with -the value of `default_value`. After running this op: +corresponds to each id provided in the mask argument. Missing values are filled +with the value of `default_value`. After running this op: output[j, :] = values[i] // where mask[j] == indices[i] output[j, ...] = default_value // when mask[j] doesn't appear in indices -If `lengths` is provided and not empty, and extra "batch" dimension is prepended +If `lengths` is provided and not empty, an extra "batch" dimension is prepended to the output. -`values` and `default_value` can have additional matching dimensions, operation -is performed on the entire subtensor in thise case. +`values` and `default_value` can have additional matching dimensions +(the operation is performed on the entire subtensor in this case). 
-For example, if `lengths` is supplied and `values` is 1-D vector of floats and -`default_value` is a float scalar, the output is going to be a float matrix -of size `len(lengths) X len(mask)` +For example, if `lengths` is supplied and `values` is a 1-D vector of floats +and `default_value` is a float scalar, the output is going to be a float +matrix of size `len(lengths) X len(mask)`. )DOC") .Arg( "mask", @@ -67,6 +67,10 @@ of size `len(lengths) X len(mask)` .Arg( "return_presence_mask", "bool whether to return presence mask, false by default") + .Arg( + "max_skipped_indices", + "int argument representing the maximum number of invalid row ids that " + "can be skipped before returning an error. 50 by default") .Input(0, "indices", "1-D int32/int64 tensor of concatenated ids of data") .Input(1, "values", "Data tensor, first dimension has to match `indices`") .Input( @@ -117,3 +121,18 @@ class GetSparseToDenseMaskGradient : public GradientMakerBase { REGISTER_GRADIENT(SparseToDenseMask, GetSparseToDenseMaskGradient); } // namespace } // namespace caffe2 + +// clang-format off +C10_EXPORT_CAFFE2_OP_TO_C10_CPU( + SparseToDenseMask, + "_caffe2::SparseToDenseMask(" + "Tensor indices, " + "Tensor values, " + "Tensor default_value, " + "Tensor? lengths, " + "int[] mask, " + "bool? return_presence_mask = False, " + "int? max_skipped_indices = 50" + ") -> (Tensor output, Tensor presence_mask)", + caffe2::SparseToDenseMaskOp); +// clang-format on diff --git a/caffe2/operators/sparse_to_dense_mask_op.h b/caffe2/operators/sparse_to_dense_mask_op.h index 8ed589c6d734..26213c0cff33 100644 --- a/caffe2/operators/sparse_to_dense_mask_op.h +++ b/caffe2/operators/sparse_to_dense_mask_op.h @@ -5,10 +5,13 @@ #include #include #include "caffe2/core/context.h" +#include "caffe2/core/export_caffe2_op_to_c10.h" #include "caffe2/core/operator.h" #include "caffe2/core/tensor.h" #include "caffe2/utils/math.h" +C10_DECLARE_EXPORT_CAFFE2_OP_TO_C10(SparseToDenseMask); + namespace caffe2 { template diff --git a/caffe2/opt/bound_shape_inferencer.cc b/caffe2/opt/bound_shape_inferencer.cc index d37717d5b957..d8fe956a0ddd 100644 --- a/caffe2/opt/bound_shape_inferencer.cc +++ b/caffe2/opt/bound_shape_inferencer.cc @@ -857,7 +857,8 @@ void BoundShapeInferencer::InferCommonOp(const OperatorDef& op) { try { const static std::unordered_set types_with_independent_output_shape = {"Int8GenQuantParams", - "Int8QuantSchemeBlobFill"}; + "Int8QuantSchemeBlobFill", + "ComputeEqualizationScale"}; std::vector input_shapes; for (const auto& input : op.input()) { const auto it = shape_info_.find(input); @@ -883,6 +884,7 @@ void BoundShapeInferencer::InferCommonOp(const OperatorDef& op) { bool is_quantized = !(op.type().compare(0, 4, "Int8")) && (op.type() != "Int8Dequantize") && (op.type() != "Int8QuantSchemeBlobFill") && + (op.type() != "ComputeEqualizationScale") && (op.type() != "Int8GenQuantParams"); float scale = 1; int offset = 0; diff --git a/caffe2/opt/onnxifi_op.h b/caffe2/opt/onnxifi_op.h index 6a211a604d52..f19403a14e58 100644 --- a/caffe2/opt/onnxifi_op.h +++ b/caffe2/opt/onnxifi_op.h @@ -263,10 +263,13 @@ class OnnxifiOp final : public Operator { defered_blob_reader = ws->GetBlob("__DEFERRED_BLOB_READER__"); } onnxGraph graph{nullptr}; + + static const uint64_t auxPropertiesListAOT[] = { + ONNXIFI_OPTIMIZATION_AOT, ONNXIFI_GRAPH_PROPERTY_NONE}; CAFFE_ENFORCE_EQ( lib_->onnxInitGraph( backend, - nullptr, + use_glow_aot_ ? 
auxPropertiesListAOT : nullptr, onnx_model_str.size(), (const void*)(onnx_model_str.c_str()), weight_descs.size(), diff --git a/caffe2/opt/shape_info.cc b/caffe2/opt/shape_info.cc index 0ff55693395f..dfcdeb0356bd 100644 --- a/caffe2/opt/shape_info.cc +++ b/caffe2/opt/shape_info.cc @@ -5,6 +5,63 @@ namespace caffe2 { +namespace { +bool isNumber(const std::string& s) { + bool empty = true; + for (const char c : s) { + if (std::isalpha(c)) { + return false; + } + if (!std::isspace(c)) { + empty = false; + } + } + return !empty; +} + +std::string toLower(const std::string& s) { + std::string t; + t.resize(s.size()); + for (size_t i = 0; i < t.size(); i++) { + t[i] = std::tolower(s[i]); + } + return t; +} + +TensorProto_DataType toTensorProtoDataType(const std::string& in) { + std::string s = toLower(in); + if (s == "uint8") { + return TensorProto_DataType_UINT8; + } else if (s == "int8") { + return TensorProto_DataType_INT8; + } else if (s == "uint16") { + return TensorProto_DataType_UINT16; + } else if (s == "int16") { + return TensorProto_DataType_INT16; + } else if (s == "int32") { + return TensorProto_DataType_INT32; + } else if (s == "int64") { + return TensorProto_DataType_INT64; + } else if (s == "float16" || s == "half") { + return TensorProto_DataType_FLOAT16; + } else if (s == "float") { + return TensorProto_DataType_FLOAT; + } else if (s == "double") { + return TensorProto_DataType_DOUBLE; + } else if (s == "byte") { + return TensorProto_DataType_BYTE; + } else if (s == "string") { + return TensorProto_DataType_STRING; + } else if (s == "bool") { + return TensorProto_DataType_BOOL; + } else if (s == "hash") { + return TensorProto_DataType_ZERO_COLLISION_HASH; + } + // return default data type, float + return TensorProto_DataType_FLOAT; +} +} // namespace + ShapeInfo getShapeInfoFromBlob(const Blob* blob) { ShapeInfo shape_info; shape_info.shape = GetTensorShapeOfBlob(blob); @@ -138,14 +195,24 @@ void parseShapeInfoMapFromString( const auto& name = kv[0]; TensorShape shape; - if (name.find("int8") != std::string::npos) { - shape.set_data_type(TensorProto_DataType_UINT8); + size_t size = kv.size(); + CAFFE_ENFORCE_GT(size, 1); + if (!isNumber(kv[size - 1])) { + // last value is the type + shape.set_data_type(toTensorProtoDataType(kv[size - 1])); + size--; } else { - shape.set_data_type(TensorProto_DataType_FLOAT); + if (name.find("int8") != std::string::npos) { + // Kept for backwards compatibility. + // Set type explicitly to overwrite it. 
+ shape.set_data_type(TensorProto_DataType_UINT8); + } else { + shape.set_data_type(TensorProto_DataType_FLOAT); + } } bool valid = true; - for (int i = 1; i < kv.size(); i++) { + for (int i = 1; i < size; i++) { auto dim = kv[i]; try { shape.add_dims(std::stoi(dim)); diff --git a/caffe2/perfkernels/fused_nbit_rowwise_conversion.cc b/caffe2/perfkernels/fused_nbit_rowwise_conversion.cc index 528bbee3c2ca..35b9605021e6 100644 --- a/caffe2/perfkernels/fused_nbit_rowwise_conversion.cc +++ b/caffe2/perfkernels/fused_nbit_rowwise_conversion.cc @@ -6,6 +6,10 @@ #include "common.h" +#ifdef USE_FBGEMM +#include "fbgemm/QuantUtils.h" +#endif + namespace caffe2 { void FloatToFused8BitRowwiseQuantized__base( @@ -58,46 +62,32 @@ void Fused8BitRowwiseQuantizedToFloat__base( } } -decltype(FloatToFused8BitRowwiseQuantized__base) - FloatToFused8BitRowwiseQuantized__avx2_fma; void FloatToFused8BitRowwiseQuantized( const float* input, int input_rows, int input_columns, std::uint8_t* output) { - AVX2_FMA_DO( - FloatToFused8BitRowwiseQuantized, - input, - input_rows, - input_columns, - output); - BASE_DO( - FloatToFused8BitRowwiseQuantized, - input, - input_rows, - input_columns, - output); +#ifdef USE_FBGEMM + fbgemm::FloatToFused8BitRowwiseQuantizedSBFloat( + input, input_rows, input_columns, output); +#else + FloatToFused8BitRowwiseQuantized__base( + input, input_rows, input_columns, output); +#endif } -decltype(Fused8BitRowwiseQuantizedToFloat__base) - Fused8BitRowwiseQuantizedToFloat__avx2_fma; void Fused8BitRowwiseQuantizedToFloat( const std::uint8_t* input, int input_rows, int input_columns, float* output) { - AVX2_FMA_DO( - Fused8BitRowwiseQuantizedToFloat, - input, - input_rows, - input_columns, - output); - BASE_DO( - Fused8BitRowwiseQuantizedToFloat, - input, - input_rows, - input_columns, - output); +#ifdef USE_FBGEMM + fbgemm::Fused8BitRowwiseQuantizedSBFloatToFloat( + input, input_rows, input_columns, output); +#else + Fused8BitRowwiseQuantizedToFloat__base( + input, input_rows, input_columns, output); +#endif } void FloatToFusedNBitRowwiseQuantizedSBHalf__base( @@ -184,52 +174,34 @@ void FusedNBitRowwiseQuantizedSBHalfToFloat__base( } } -decltype(FloatToFusedNBitRowwiseQuantizedSBHalf__base) - FloatToFusedNBitRowwiseQuantizedSBHalf__avx2_fma; void FloatToFusedNBitRowwiseQuantizedSBHalf( int bit_rate, const float* input, int input_rows, int input_columns, std::uint8_t* output) { - AVX2_FMA_DO( - FloatToFusedNBitRowwiseQuantizedSBHalf, - bit_rate, - input, - input_rows, - input_columns, - output); - BASE_DO( - FloatToFusedNBitRowwiseQuantizedSBHalf, - bit_rate, - input, - input_rows, - input_columns, - output); +#ifdef USE_FBGEMM + fbgemm::FloatToFusedNBitRowwiseQuantizedSBHalf( + bit_rate, input, input_rows, input_columns, output); +#else + FloatToFusedNBitRowwiseQuantizedSBHalf__base( + bit_rate, input, input_rows, input_columns, output); +#endif } -decltype(FusedNBitRowwiseQuantizedSBHalfToFloat__base) - FusedNBitRowwiseQuantizedSBHalfToFloat__avx2_fma; void FusedNBitRowwiseQuantizedSBHalfToFloat( int bit_rate, const std::uint8_t* input, int input_rows, int input_columns, float* output) { - AVX2_FMA_DO( - FusedNBitRowwiseQuantizedSBHalfToFloat, - bit_rate, - input, - input_rows, - input_columns, - output); - BASE_DO( - FusedNBitRowwiseQuantizedSBHalfToFloat, - bit_rate, - input, - input_rows, - input_columns, - output); +#ifdef USE_FBGEMM + fbgemm::FusedNBitRowwiseQuantizedSBHalfToFloat( + bit_rate, input, input_rows, input_columns, output); +#else + 
FusedNBitRowwiseQuantizedSBHalfToFloat__base( + bit_rate, input, input_rows, input_columns, output); +#endif } } // namespace caffe2 diff --git a/caffe2/perfkernels/fused_nbit_rowwise_conversion_avx2.cc b/caffe2/perfkernels/fused_nbit_rowwise_conversion_avx2.cc deleted file mode 100644 index e7053b5136c0..000000000000 --- a/caffe2/perfkernels/fused_nbit_rowwise_conversion_avx2.cc +++ /dev/null @@ -1,534 +0,0 @@ -#include "./fused_nbit_rowwise_conversion.h" - -#include -#include -#include // for FLT_MAX -#include - -#include "./cvtsh_ss_bugfix.h" - -namespace caffe2 { - -constexpr int VLEN = 8; - -void FloatToFused8BitRowwiseQuantized__avx2_fma( - const float* input, - int input_rows, - int input_columns, - std::uint8_t* output) { - constexpr float kEpsilon = 1e-8f; - - __m256i permute_mask1_v = - _mm256_set_epi32(0x07, 0x03, 0x06, 0x02, 0x05, 0x01, 0x04, 0x00); - __m256i shuffle_mask_v = _mm256_set_epi8( - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0c, - 0x08, - 0x04, - 0x00, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0xff, - 0x0c, - 0x08, - 0x04, - 0x00); - __m256i permute_mask2_v = - _mm256_set_epi32(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00); - - int output_columns = input_columns + 2 * sizeof(float); - for (std::size_t row = 0; row < input_rows; ++row) { - const float* input_row = input + row * input_columns; - std::uint8_t* output_row = output + row * output_columns; - float* output_row_scale_bias = - reinterpret_cast(output_row + input_columns); - - float minimum_element = FLT_MAX; - float maximum_element = -FLT_MAX; - __m256 min_v = _mm256_set1_ps(minimum_element); - __m256 max_v = _mm256_set1_ps(maximum_element); - std::size_t col; - for (col = 0; col < input_columns / VLEN * VLEN; col += VLEN) { - __m256 in_v = _mm256_loadu_ps(input_row + col); - min_v = _mm256_min_ps(min_v, in_v); - max_v = _mm256_max_ps(max_v, in_v); - } - alignas(64) float min_buf[VLEN], max_buf[VLEN]; - _mm256_store_ps(min_buf, min_v); - _mm256_store_ps(max_buf, max_v); - for (int i = 0; i < VLEN; ++i) { - minimum_element = std::min(minimum_element, min_buf[i]); - maximum_element = std::max(maximum_element, max_buf[i]); - } - for (; col < input_columns; ++col) { - minimum_element = std::min(minimum_element, input_row[col]); - maximum_element = std::max(maximum_element, input_row[col]); - } - - float range = maximum_element - minimum_element; - - output_row_scale_bias[0] = range / 255.0f; - output_row_scale_bias[1] = minimum_element; - const auto inverse_scale = 255.0f / (range + kEpsilon); - min_v = _mm256_set1_ps(minimum_element); - __m256 inverse_scale_v = _mm256_set1_ps(inverse_scale); - - for (col = 0; col < input_columns / (4 * VLEN) * (4 * VLEN); - col += 4 * VLEN) { - __m256i x_rounded_v = _mm256_cvtps_epi32(_mm256_mul_ps( - _mm256_sub_ps(_mm256_loadu_ps(input_row + col), min_v), - inverse_scale_v)); - __m256i y_rounded_v = _mm256_cvtps_epi32(_mm256_mul_ps( - _mm256_sub_ps(_mm256_loadu_ps(input_row + col + VLEN), min_v), - inverse_scale_v)); - __m256i z_rounded_v = _mm256_cvtps_epi32(_mm256_mul_ps( - _mm256_sub_ps(_mm256_loadu_ps(input_row + col + 2 * VLEN), min_v), - inverse_scale_v)); - __m256i w_rounded_v = _mm256_cvtps_epi32(_mm256_mul_ps( - _mm256_sub_ps(_mm256_loadu_ps(input_row + col + 3 * VLEN), min_v), - inverse_scale_v)); - - // An instruction sequence to save 32 32-bit integers as 8-bit integers - __m256i xy_packed_v = _mm256_packs_epi32(x_rounded_v, y_rounded_v); - __m256i zw_packed_v = 
_mm256_packs_epi32(z_rounded_v, w_rounded_v); - __m256i xyzw_packed_v = _mm256_packus_epi16(xy_packed_v, zw_packed_v); - xyzw_packed_v = - _mm256_permutevar8x32_epi32(xyzw_packed_v, permute_mask1_v); - _mm256_storeu_si256( - reinterpret_cast<__m256i*>(output_row + col), xyzw_packed_v); - } - for (; col < input_columns / VLEN * VLEN; col += VLEN) { - __m256i rounded_v = _mm256_cvtps_epi32(_mm256_mul_ps( - _mm256_sub_ps(_mm256_loadu_ps(input_row + col), min_v), - inverse_scale_v)); - - // An instruction sequence to save 8 32-bit integers as 8-bit integers - rounded_v = _mm256_shuffle_epi8(rounded_v, shuffle_mask_v); - rounded_v = _mm256_permutevar8x32_epi32(rounded_v, permute_mask2_v); - _mm_storel_epi64( - reinterpret_cast<__m128i*>(output_row + col), - _mm256_castsi256_si128(rounded_v)); - } - for (; col < input_columns; ++col) { - output_row[col] = - std::lrintf((input_row[col] - minimum_element) * inverse_scale); - } - } -} - -void Fused8BitRowwiseQuantizedToFloat__avx2_fma( - const std::uint8_t* input, - int input_rows, - int input_columns, - float* output) { - int output_columns = input_columns - 2 * sizeof(float); - - for (std::size_t row = 0; row < input_rows; ++row) { - const std::uint8_t* input_row = input + row * input_columns; - const float* input_row_scale_bias = - reinterpret_cast(input_row + output_columns); - float* output_row = output + row * output_columns; - - __m256 scale_v = _mm256_set1_ps(input_row_scale_bias[0]); - __m256 bias_v = _mm256_set1_ps(input_row_scale_bias[1]); - - std::size_t col; - for (col = 0; col < output_columns / VLEN * VLEN; col += VLEN) { - __m256 in_v = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32( - _mm_loadl_epi64(reinterpret_cast(input_row + col)))); - _mm256_storeu_ps( - output_row + col, - _mm256_add_ps(_mm256_mul_ps(in_v, scale_v), bias_v)); - } - - for (; col < output_columns; ++col) { - output_row[col] = - input_row[col] * input_row_scale_bias[0] + input_row_scale_bias[1]; - } - } -} - -namespace { - -template -void FloatToFusedNBitRowwiseQuantizedSBHalf_( - const float* input, - int input_rows, - int input_columns, - std::uint8_t* output) { - __m256i permute_mask1_v = - _mm256_set_epi32(0x07, 0x03, 0x06, 0x02, 0x05, 0x01, 0x04, 0x00); - - int NUM_ELEM_PER_BYTE = 8 / BIT_RATE; - int output_columns = - (input_columns + NUM_ELEM_PER_BYTE - 1) / NUM_ELEM_PER_BYTE + - 2 * sizeof(std::uint16_t); - for (std::size_t row = 0; row < input_rows; ++row) { - const float* input_row = input + row * input_columns; - std::uint8_t* output_row = output + row * output_columns; - std::uint16_t* output_row_scale_bias = reinterpret_cast( - output_row + - (input_columns + NUM_ELEM_PER_BYTE - 1) / NUM_ELEM_PER_BYTE); - - float minimum_element = FLT_MAX; - float maximum_element = -FLT_MAX; - __m256 min_v = _mm256_set1_ps(minimum_element); - __m256 max_v = _mm256_set1_ps(maximum_element); - std::size_t col; - for (col = 0; col < input_columns / VLEN * VLEN; col += VLEN) { - __m256 in_v = _mm256_loadu_ps(input_row + col); - min_v = _mm256_min_ps(min_v, in_v); - max_v = _mm256_max_ps(max_v, in_v); - } - alignas(64) float min_buf[VLEN], max_buf[VLEN]; - _mm256_store_ps(min_buf, min_v); - _mm256_store_ps(max_buf, max_v); - for (int i = 0; i < VLEN; ++i) { - minimum_element = std::min(minimum_element, min_buf[i]); - maximum_element = std::max(maximum_element, max_buf[i]); - } - for (; col < input_columns; ++col) { - minimum_element = std::min(minimum_element, input_row[col]); - maximum_element = std::max(maximum_element, input_row[col]); - } - - output_row_scale_bias[1] = 
_cvtss_sh( - minimum_element, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); - minimum_element = _cvtsh_ss(output_row_scale_bias[1]); - const float range = maximum_element - minimum_element; - - float scale = range == 0 ? 1.0f : range / ((1 << BIT_RATE) - 1); - std::uint16_t scale_fp16 = - _cvtss_sh(scale, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); - scale = _cvtsh_ss(scale_fp16); - if (scale == 0) { - // Corner case handling when maximum_element == minimum_element - // Any scale would work because maximum_element - minimum_element will be - // 0 for all X - scale = 1.0f; - } - float inverse_scale = 1.0f / scale; - if (std::isinf(inverse_scale)) { - scale = 1.0f; - inverse_scale = 1.0f; - } - - output_row_scale_bias[0] = - _cvtss_sh(scale, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); - - __m256 inverse_scale_v = _mm256_set1_ps(inverse_scale); - min_v = _mm256_set1_ps(minimum_element); - - col = 0; - - if (BIT_RATE == 2 || BIT_RATE == 4) { - for (; col + 4 * VLEN <= input_columns; col += 4 * VLEN) { - __m256i x_rounded_v = _mm256_cvtps_epi32(_mm256_mul_ps( - _mm256_sub_ps(_mm256_loadu_ps(input_row + col), min_v), - inverse_scale_v)); - __m256i y_rounded_v = _mm256_cvtps_epi32(_mm256_mul_ps( - _mm256_sub_ps(_mm256_loadu_ps(input_row + col + VLEN), min_v), - inverse_scale_v)); - __m256i z_rounded_v = _mm256_cvtps_epi32(_mm256_mul_ps( - _mm256_sub_ps(_mm256_loadu_ps(input_row + col + 2 * VLEN), min_v), - inverse_scale_v)); - __m256i w_rounded_v = _mm256_cvtps_epi32(_mm256_mul_ps( - _mm256_sub_ps(_mm256_loadu_ps(input_row + col + 3 * VLEN), min_v), - inverse_scale_v)); - - // An instruction sequence to save 32 32-bit integers as 8-bit integers - __m256i xy_packed_v = _mm256_packs_epi32(x_rounded_v, y_rounded_v); - __m256i zw_packed_v = _mm256_packs_epi32(z_rounded_v, w_rounded_v); - __m256i xyzw_packed_v = _mm256_packus_epi16(xy_packed_v, zw_packed_v); - xyzw_packed_v = - _mm256_permutevar8x32_epi32(xyzw_packed_v, permute_mask1_v); - - // saturate to BIT_RATE - xyzw_packed_v = _mm256_min_epu8( - xyzw_packed_v, - _mm256_set1_epi8(static_cast((1 << BIT_RATE) - 1))); - - if (BIT_RATE == 4) { - // pack into lower 8-bit of each 16-bit - xyzw_packed_v = _mm256_and_si256( - _mm256_or_si256( - xyzw_packed_v, _mm256_srli_epi16(xyzw_packed_v, 4)), - _mm256_set1_epi16(0x00ff)); - } else { - // pack into lower 8-bit of each 32-bit - xyzw_packed_v = _mm256_and_si256( - _mm256_or_si256( - _mm256_or_si256( - xyzw_packed_v, _mm256_srli_epi32(xyzw_packed_v, 6)), - _mm256_or_si256( - _mm256_srli_epi32(xyzw_packed_v, 8 + 4), - _mm256_srli_epi32(xyzw_packed_v, 2 * 8 + 2))), - _mm256_set1_epi32(0x00ff)); - } - - __m128i out_v; - if (BIT_RATE == 4) { - // avx2 doesn't have _mm256_cvtepi16_epi8 - out_v = _mm_packus_epi16( - _mm256_castsi256_si128(xyzw_packed_v), - _mm256_extractf128_si256(xyzw_packed_v, 1)); - _mm_storeu_si128( - reinterpret_cast<__m128i*>(output_row + col / NUM_ELEM_PER_BYTE), - out_v); - } else { - // avx2 doesn't have _mm256_cvtepi32_epi8 - out_v = _mm_packus_epi32( - _mm256_castsi256_si128(xyzw_packed_v), - _mm256_extractf128_si256(xyzw_packed_v, 1)); - out_v = _mm_packus_epi16(out_v, out_v); - _mm_storel_epi64( - reinterpret_cast<__m128i*>(output_row + col / NUM_ELEM_PER_BYTE), - out_v); - } - } - } - - for (; col < input_columns; ++col) { - float X = input_row[col]; - std::uint8_t quantized = std::max( - 0, - std::min( - std::lrintf((X - minimum_element) * inverse_scale), - (1 << BIT_RATE) - 1)); - if (col % NUM_ELEM_PER_BYTE == 0) { - output_row[col / NUM_ELEM_PER_BYTE] = quantized; - 
} else { - output_row[col / NUM_ELEM_PER_BYTE] |= - (quantized << ((col % NUM_ELEM_PER_BYTE) * BIT_RATE)); - } - } - } -} - -template -void FusedNBitRowwiseQuantizedSBHalfToFloat_( - const std::uint8_t* input, - int input_rows, - int input_columns, - float* output) { - constexpr int NUM_ELEM_PER_BYTE = 8 / BIT_RATE; - int output_columns = - (input_columns - 2 * sizeof(std::uint16_t)) * NUM_ELEM_PER_BYTE; - - // mask can be accessed by avx2_ps_or_epi32_combined_mask[(8 - remainder) % 8] - static const int avx2_ps_or_epi32_combined_mask[16] = { - -1, - -1, - -1, - -1, - -1, - -1, - -1, - -1, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - }; - - // Compute a remainder for vector load - // Since every row is followed by 2 fp16 (scale and bias), luckily - // we don't need mask at bit-rate granularity but just at 32-bit - // granularity. - constexpr int NUM_ELEM_PER_32BIT = 32 / BIT_RATE; - // multiply by 4 because we're handling 4 vlen per iteration - constexpr int NUM_OF_32BIT_PER_VLOAD = VLEN * 4 / NUM_ELEM_PER_32BIT; - int remainder_32bit_granularity = (output_columns + NUM_ELEM_PER_32BIT - 1) / - NUM_ELEM_PER_32BIT % NUM_OF_32BIT_PER_VLOAD; - __m128i vmask_load = _mm_lddqu_si128(reinterpret_cast( - avx2_ps_or_epi32_combined_mask + NUM_OF_32BIT_PER_VLOAD + - (NUM_OF_32BIT_PER_VLOAD - remainder_32bit_granularity) % - NUM_OF_32BIT_PER_VLOAD)); - int remainder = output_columns % (4 * VLEN); - __m256i vmask_store0 = _mm256_loadu_si256(reinterpret_cast( - avx2_ps_or_epi32_combined_mask + - (VLEN - std::min(output_columns % (4 * VLEN), VLEN) % (VLEN + 1)))); - __m256i vmask_store1 = _mm256_loadu_si256(reinterpret_cast( - avx2_ps_or_epi32_combined_mask + - (VLEN - - std::max(0, std::min(output_columns % (4 * VLEN) - VLEN, VLEN)) % - (VLEN + 1)))); - __m256i vmask_store2 = _mm256_loadu_si256(reinterpret_cast( - avx2_ps_or_epi32_combined_mask + - (VLEN - - std::max(0, std::min(output_columns % (4 * VLEN) - 2 * VLEN, VLEN)) % - (VLEN + 1)))); - __m256i vmask_store3 = _mm256_loadu_si256(reinterpret_cast( - avx2_ps_or_epi32_combined_mask + - (VLEN - - std::max(0, std::min(output_columns % (4 * VLEN) - 3 * VLEN, VLEN)) % - (VLEN + 1)))); - - for (std::size_t row = 0; row < input_rows; ++row) { - const std::uint8_t* input_row = input + row * input_columns; - const std::uint16_t* input_row_scale_bias = - reinterpret_cast( - input_row + - (output_columns + NUM_ELEM_PER_BYTE - 1) / NUM_ELEM_PER_BYTE); - float scale = _cvtsh_ss(input_row_scale_bias[0]); - float bias = _cvtsh_ss(input_row_scale_bias[1]); - float* output_row = output + row * output_columns; - - std::size_t col = 0; - if (BIT_RATE == 4 || BIT_RATE == 2) { - __m256 vscale = _mm256_set1_ps(scale); - __m256 vbias = _mm256_set1_ps(bias); - for (; col + 4 * VLEN <= output_columns; col += 4 * VLEN) { - __m256i vinq; - // unpack to 8-bit integers - if (BIT_RATE == 4) { - vinq = _mm256_cvtepu8_epi16( - _mm_loadu_si128(reinterpret_cast( - input_row + col / NUM_ELEM_PER_BYTE))); - vinq = _mm256_and_si256( - _mm256_or_si256(vinq, _mm256_slli_epi32(vinq, 4)), - _mm256_set1_epi16(0x0f0f)); - } else { - vinq = _mm256_cvtepu8_epi32( - _mm_loadl_epi64(reinterpret_cast( - input_row + col / NUM_ELEM_PER_BYTE))); - vinq = _mm256_and_si256( - _mm256_or_si256( - _mm256_or_si256( - _mm256_slli_epi32(vinq, 2 * 8 + 2), - _mm256_slli_epi32(vinq, 8 + 4)), - _mm256_or_si256(_mm256_slli_epi32(vinq, 6), vinq)), - _mm256_set1_epi32(0x03030303)); - } - __m256 vinq0 = _mm256_cvtepi32_ps( - _mm256_cvtepi8_epi32(_mm256_castsi256_si128(vinq))); - __m256 vinq1 = 
_mm256_cvtepi32_ps(_mm256_cvtepi8_epi32( - _mm_set1_epi64x(_mm256_extract_epi64(vinq, 1)))); - __m256 vinq2 = _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32( - _mm_set1_epi64x(_mm256_extract_epi64(vinq, 2)))); - __m256 vinq3 = _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32( - _mm_set1_epi64x(_mm256_extract_epi64(vinq, 3)))); - vinq0 = _mm256_fmadd_ps(vscale, vinq0, vbias); - vinq1 = _mm256_fmadd_ps(vscale, vinq1, vbias); - vinq2 = _mm256_fmadd_ps(vscale, vinq2, vbias); - vinq3 = _mm256_fmadd_ps(vscale, vinq3, vbias); - _mm256_storeu_ps(output_row + col, vinq0); - _mm256_storeu_ps(output_row + col + VLEN, vinq1); - _mm256_storeu_ps(output_row + col + 2 * VLEN, vinq2); - _mm256_storeu_ps(output_row + col + 3 * VLEN, vinq3); - } - - if (remainder) { - __m256i vinq; - if (BIT_RATE == 4) { - vinq = _mm256_cvtepu8_epi16(_mm_maskload_epi32( - reinterpret_cast(input_row + col / NUM_ELEM_PER_BYTE), - vmask_load)); - vinq = _mm256_and_si256( - _mm256_or_si256(vinq, _mm256_slli_epi32(vinq, 4)), - _mm256_set1_epi16(0x0f0f)); - } else { - vinq = _mm256_cvtepu8_epi32(_mm_maskload_epi32( - reinterpret_cast(input_row + col / NUM_ELEM_PER_BYTE), - vmask_load)); - vinq = _mm256_and_si256( - _mm256_or_si256( - _mm256_or_si256( - _mm256_slli_epi32(vinq, 2 * 8 + 2), - _mm256_slli_epi32(vinq, 8 + 4)), - _mm256_or_si256(_mm256_slli_epi32(vinq, 6), vinq)), - _mm256_set1_epi32(0x03030303)); - } - - __m256 vinq0 = _mm256_cvtepi32_ps( - _mm256_cvtepi8_epi32(_mm256_castsi256_si128(vinq))); - __m256 vinq1 = _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32( - _mm_set1_epi64x(_mm256_extract_epi64(vinq, 1)))); - __m256 vinq2 = _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32( - _mm_set1_epi64x(_mm256_extract_epi64(vinq, 2)))); - __m256 vinq3 = _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32( - _mm_set1_epi64x(_mm256_extract_epi64(vinq, 3)))); - - vinq0 = _mm256_fmadd_ps(vscale, vinq0, vbias); - vinq1 = _mm256_fmadd_ps(vscale, vinq1, vbias); - vinq2 = _mm256_fmadd_ps(vscale, vinq2, vbias); - vinq3 = _mm256_fmadd_ps(vscale, vinq3, vbias); - - _mm256_maskstore_ps(output_row + col, vmask_store0, vinq0); - _mm256_maskstore_ps(output_row + col + VLEN, vmask_store1, vinq1); - _mm256_maskstore_ps(output_row + col + 2 * VLEN, vmask_store2, vinq2); - _mm256_maskstore_ps(output_row + col + 3 * VLEN, vmask_store3, vinq3); - } - } else { - for (; col < output_columns; ++col) { - std::uint8_t quantized = input_row[col / NUM_ELEM_PER_BYTE]; - quantized >>= (col % NUM_ELEM_PER_BYTE) * BIT_RATE; - quantized &= (1 << BIT_RATE) - 1; - output_row[col] = scale * quantized + bias; - } - } - } -} -} // namespace - -void FloatToFusedNBitRowwiseQuantizedSBHalf__avx2_fma( - int bit_rate, - const float* input, - int input_rows, - int input_columns, - std::uint8_t* output) { - if (bit_rate == 2) { - FloatToFusedNBitRowwiseQuantizedSBHalf_<2>( - input, input_rows, input_columns, output); - } else if (bit_rate == 4) { - FloatToFusedNBitRowwiseQuantizedSBHalf_<4>( - input, input_rows, input_columns, output); - } else if (bit_rate == 8) { - FloatToFusedNBitRowwiseQuantizedSBHalf_<8>( - input, input_rows, input_columns, output); - } -} - -void FusedNBitRowwiseQuantizedSBHalfToFloat__avx2_fma( - int bit_rate, - const std::uint8_t* input, - int input_rows, - int input_columns, - float* output) { - if (bit_rate == 2) { - FusedNBitRowwiseQuantizedSBHalfToFloat_<2>( - input, input_rows, input_columns, output); - } else if (bit_rate == 4) { - FusedNBitRowwiseQuantizedSBHalfToFloat_<4>( - input, input_rows, input_columns, output); - } else { - FusedNBitRowwiseQuantizedSBHalfToFloat_<8>( - input, 
input_rows, input_columns, output); - } -} - -} // namespace caffe2 diff --git a/caffe2/perfkernels/hp_emblookup_codegen.py b/caffe2/perfkernels/hp_emblookup_codegen.py index f79b7c8e7d9c..75b0c8b583be 100644 --- a/caffe2/perfkernels/hp_emblookup_codegen.py +++ b/caffe2/perfkernels/hp_emblookup_codegen.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import argparse import sys diff --git a/caffe2/proto/caffe2_pb2.pyi b/caffe2/proto/caffe2_pb2.pyi new file mode 100644 index 000000000000..060f60fc6c88 --- /dev/null +++ b/caffe2/proto/caffe2_pb2.pyi @@ -0,0 +1,18 @@ + +# Defined in caffe2/proto/caffe2_pb2.h +class DeviceType: + ... + +CPU: DeviceType = ... +CUDA: DeviceType = ... +OPENGL: DeviceType = ... +OPENCL: DeviceType = ... +MKLDNN: DeviceType = ... +IDEEP: DeviceType = ... +HIP: DeviceType = ... + +class NetDef: + ... + +class OperatorDef: + ... \ No newline at end of file diff --git a/caffe2/python/__init__.py b/caffe2/python/__init__.py index 09b5652e61f2..8582eff9ce19 100644 --- a/caffe2/python/__init__.py +++ b/caffe2/python/__init__.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + from caffe2.proto import caffe2_pb2 import os import sys diff --git a/caffe2/python/allcompare_test.py b/caffe2/python/allcompare_test.py index 663cc9e02864..22038715f289 100644 --- a/caffe2/python/allcompare_test.py +++ b/caffe2/python/allcompare_test.py @@ -1,9 +1,9 @@ #!/usr/bin/env python -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from hypothesis import given, settings import hypothesis.strategies as st diff --git a/caffe2/python/attention.py b/caffe2/python/attention.py index 73be94feaf2b..59f4a5adb6a5 100644 --- a/caffe2/python/attention.py +++ b/caffe2/python/attention.py @@ -1,9 +1,9 @@ ## @package attention # Module caffe2.python.attention -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import brew diff --git a/caffe2/python/benchmark_generator.py b/caffe2/python/benchmark_generator.py index 8393ca7875aa..84d0d46490b0 100644 --- a/caffe2/python/benchmark_generator.py +++ b/caffe2/python/benchmark_generator.py @@ -1,9 +1,9 @@ #!/usr/bin/env python -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import string import argparse diff --git a/caffe2/python/benchmarks/fused_rowwise_nbit_conversion_bench.py b/caffe2/python/benchmarks/fused_rowwise_nbit_conversion_bench.py index 9b9a196e9770..ce96dbc1dd63 100644 --- a/caffe2/python/benchmarks/fused_rowwise_nbit_conversion_bench.py +++ b/caffe2/python/benchmarks/fused_rowwise_nbit_conversion_bench.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import argparse diff --git a/caffe2/python/benchmarks/sparse_lengths_sum_nbit_benchmark.py b/caffe2/python/benchmarks/sparse_lengths_sum_nbit_benchmark.py index bdba35545255..1b683be0d51e 100644 --- a/caffe2/python/benchmarks/sparse_lengths_sum_nbit_benchmark.py +++ b/caffe2/python/benchmarks/sparse_lengths_sum_nbit_benchmark.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import argparse import datetime diff --git 
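The fused_nbit_rowwise_conversion_avx2.cc kernels deleted above all encode the same fused 8-bit rowwise format: each output row holds the uint8-quantized values followed by a float32 scale (range / 255) and a float32 bias (the row minimum). A minimal NumPy sketch of that round trip, assuming a 2-D float input; the helper names are illustrative and not part of this patch:

import numpy as np

def float_to_fused_8bit_rowwise(x):
    # Per row: [uint8 values | scale as float32 | bias as float32],
    # so output_columns = input_columns + 2 * sizeof(float).
    eps = 1e-8  # same role as kEpsilon in the deleted kernel
    rows, cols = x.shape
    out = np.empty((rows, cols + 8), dtype=np.uint8)
    for r in range(rows):
        row = x[r].astype(np.float32)
        minimum = row.min()
        value_range = row.max() - minimum
        scale = value_range / 255.0
        inverse_scale = 255.0 / (value_range + eps)
        # quantize, then append scale and bias as raw float32 bytes
        out[r, :cols] = np.rint((row - minimum) * inverse_scale).astype(np.uint8)
        out[r, cols:cols + 4] = np.frombuffer(np.float32(scale).tobytes(), dtype=np.uint8)
        out[r, cols + 4:] = np.frombuffer(np.float32(minimum).tobytes(), dtype=np.uint8)
    return out

def fused_8bit_rowwise_to_float(packed):
    # Inverse: dequantize each value as value * scale + bias.
    rows, packed_cols = packed.shape
    cols = packed_cols - 8
    out = np.empty((rows, cols), dtype=np.float32)
    for r in range(rows):
        scale = np.frombuffer(packed[r, cols:cols + 4].tobytes(), dtype=np.float32)[0]
        bias = np.frombuffer(packed[r, cols + 4:].tobytes(), dtype=np.float32)[0]
        out[r] = packed[r, :cols].astype(np.float32) * scale + bias
    return out

The N-bit variants (BIT_RATE 2 and 4) follow the same per-row idea but pack 8 / BIT_RATE values into each byte and store scale and bias as fp16, which is why the deleted code round-trips them through _cvtss_sh / _cvtsh_ss.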
a/caffe2/python/binarysize.py b/caffe2/python/binarysize.py index 802d61025e30..39dba40df8a0 100644 --- a/caffe2/python/binarysize.py +++ b/caffe2/python/binarysize.py @@ -15,10 +15,10 @@ green, assuming that you have a xterm connection that supports color. """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import argparse import subprocess import sys diff --git a/caffe2/python/brew.py b/caffe2/python/brew.py index 2722c21d84d0..0e050ec32c44 100644 --- a/caffe2/python/brew.py +++ b/caffe2/python/brew.py @@ -1,9 +1,9 @@ ## @package model_helper_api # Module caffe2.python.model_helper_api -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import sys import copy diff --git a/caffe2/python/brew_test.py b/caffe2/python/brew_test.py index 8b3d08977c2c..4973876a8008 100644 --- a/caffe2/python/brew_test.py +++ b/caffe2/python/brew_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import brew, core, scope, workspace from caffe2.python.modeling.parameter_info import ParameterTags diff --git a/caffe2/python/build.py b/caffe2/python/build.py index 0f447265d5f4..862c031004c5 100644 --- a/caffe2/python/build.py +++ b/caffe2/python/build.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import caffe2.python._import_c_extension as C diff --git a/caffe2/python/cached_reader.py b/caffe2/python/cached_reader.py index 1dd179c71caf..980c4fe40e08 100644 --- a/caffe2/python/cached_reader.py +++ b/caffe2/python/cached_reader.py @@ -1,9 +1,9 @@ ## @package cached_reader # Module caffe2.python.cached_reader -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import os diff --git a/caffe2/python/checkpoint.py b/caffe2/python/checkpoint.py index cdd96eb1f492..9d7797fc3ada 100644 --- a/caffe2/python/checkpoint.py +++ b/caffe2/python/checkpoint.py @@ -1,9 +1,9 @@ ## @package checkpoint # Module caffe2.python.checkpoint -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import os import logging diff --git a/caffe2/python/checkpoint_test.py b/caffe2/python/checkpoint_test.py index a91bbf9910e2..90746747dd98 100644 --- a/caffe2/python/checkpoint_test.py +++ b/caffe2/python/checkpoint_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python.schema import Struct, ConstRecord from caffe2.python import core, workspace, model_helper diff --git a/caffe2/python/cnn.py b/caffe2/python/cnn.py index aead1d599474..a0fd52e1fdbc 100644 --- a/caffe2/python/cnn.py +++ b/caffe2/python/cnn.py @@ -1,9 +1,9 @@ ## @package cnn # Module caffe2.python.cnn -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import brew, workspace from caffe2.python.model_helper 
import ModelHelper diff --git a/caffe2/python/context.py b/caffe2/python/context.py index 928807ba2805..28815bb7f36b 100644 --- a/caffe2/python/context.py +++ b/caffe2/python/context.py @@ -1,9 +1,9 @@ ## @package context # Module caffe2.python.context -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import threading import six diff --git a/caffe2/python/context_test.py b/caffe2/python/context_test.py index 6a1f77f5ecf8..6c259d326a19 100644 --- a/caffe2/python/context_test.py +++ b/caffe2/python/context_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import context, test_util from threading import Thread diff --git a/caffe2/python/control.py b/caffe2/python/control.py index dd332f745f9a..6b0654d6f26e 100644 --- a/caffe2/python/control.py +++ b/caffe2/python/control.py @@ -11,10 +11,10 @@ If """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from future.utils import viewitems diff --git a/caffe2/python/control_ops_grad.py b/caffe2/python/control_ops_grad.py index 5a8d24cf55d8..a0e85f4d0bc1 100644 --- a/caffe2/python/control_ops_grad.py +++ b/caffe2/python/control_ops_grad.py @@ -1,9 +1,9 @@ ## @package control_ops_grad # Module caffe2.python.control_ops_grad -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/control_ops_grad_test.py b/caffe2/python/control_ops_grad_test.py index a84b9ca0a168..f637e38a5e33 100644 --- a/caffe2/python/control_ops_grad_test.py +++ b/caffe2/python/control_ops_grad_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest from caffe2.python import core, test_util, workspace diff --git a/caffe2/python/control_ops_util.py b/caffe2/python/control_ops_util.py index 76ab14a7bc65..cfff82de318b 100644 --- a/caffe2/python/control_ops_util.py +++ b/caffe2/python/control_ops_util.py @@ -1,9 +1,9 @@ ## @package control_ops_util # Module caffe2.python.control_ops_util -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core diff --git a/caffe2/python/control_test.py b/caffe2/python/control_test.py index e51aeffa8b04..3f9df172d2b7 100644 --- a/caffe2/python/control_test.py +++ b/caffe2/python/control_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import control, core, test_util, workspace diff --git a/caffe2/python/convert.py b/caffe2/python/convert.py index 44f81d6e2d13..18033661a69e 100644 --- a/caffe2/python/convert.py +++ b/caffe2/python/convert.py @@ -1,9 +1,9 @@ ## @package workspace # Module caffe2.python.workspace -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + 
+ + + from caffe2.proto import caffe2_pb2, torch_pb2 diff --git a/caffe2/python/convert_test.py b/caffe2/python/convert_test.py index 82c969c901ea..a1dc52aad2d9 100644 --- a/caffe2/python/convert_test.py +++ b/caffe2/python/convert_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import convert, workspace from caffe2.proto import caffe2_pb2, torch_pb2 diff --git a/caffe2/python/core.py b/caffe2/python/core.py index 3b493277a182..6d7c503e2c81 100644 --- a/caffe2/python/core.py +++ b/caffe2/python/core.py @@ -1,9 +1,9 @@ ## @package core # Module caffe2.python.core -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from collections import namedtuple, OrderedDict, defaultdict from past.builtins import basestring diff --git a/caffe2/python/core_gradients_test.py b/caffe2/python/core_gradients_test.py index 8b229029f5f7..3674b7aa4585 100644 --- a/caffe2/python/core_gradients_test.py +++ b/caffe2/python/core_gradients_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from future.utils import bytes_to_native_str from hypothesis import given, settings diff --git a/caffe2/python/core_test.py b/caffe2/python/core_test.py index 8660f5cc2106..b0f5b11f0d1c 100644 --- a/caffe2/python/core_test.py +++ b/caffe2/python/core_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from inspect import currentframe, getframeinfo import unittest diff --git a/caffe2/python/crf.py b/caffe2/python/crf.py index a009f8f0fa31..703ae604c654 100644 --- a/caffe2/python/crf.py +++ b/caffe2/python/crf.py @@ -1,6 +1,6 @@ ## @package crf # Module caffe2.python.crf -from __future__ import absolute_import, division, print_function, unicode_literals + import numpy as np from caffe2.python import brew, core, model_helper, recurrent diff --git a/caffe2/python/crf_predict.py b/caffe2/python/crf_predict.py index dd1c8720bfb1..9bc0372c50c0 100644 --- a/caffe2/python/crf_predict.py +++ b/caffe2/python/crf_predict.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import numpy as np from caffe2.python.crf import CRFWithLoss diff --git a/caffe2/python/crf_viterbi_test.py b/caffe2/python/crf_viterbi_test.py index 970a7c6d4a8f..052bbbf4e6bf 100644 --- a/caffe2/python/crf_viterbi_test.py +++ b/caffe2/python/crf_viterbi_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import workspace, crf from caffe2.python.cnn import CNNModelHelper diff --git a/caffe2/python/data_parallel_model.py b/caffe2/python/data_parallel_model.py index 7f5527472cc2..95abb7159d42 100644 --- a/caffe2/python/data_parallel_model.py +++ b/caffe2/python/data_parallel_model.py @@ -1,8 +1,8 @@ ## @package data_parallel_model # Module caffe2.python.data_parallel_model -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from collections import OrderedDict from future.utils import 
viewitems, viewkeys, viewvalues diff --git a/caffe2/python/data_parallel_model_test.py b/caffe2/python/data_parallel_model_test.py index e106dee97039..a0dbb3037c2c 100644 --- a/caffe2/python/data_parallel_model_test.py +++ b/caffe2/python/data_parallel_model_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from future.utils import viewkeys from multiprocessing import Process, Queue diff --git a/caffe2/python/data_workers.py b/caffe2/python/data_workers.py index eb49da78c0af..698a8953ef13 100644 --- a/caffe2/python/data_workers.py +++ b/caffe2/python/data_workers.py @@ -1,9 +1,9 @@ ## @package data_workers # Module caffe2.python.data_workers -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + ''' diff --git a/caffe2/python/data_workers_test.py b/caffe2/python/data_workers_test.py index 1abd8dfa28d7..4669aaf59476 100644 --- a/caffe2/python/data_workers_test.py +++ b/caffe2/python/data_workers_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import unittest diff --git a/caffe2/python/dataio.py b/caffe2/python/dataio.py index 5221262582ee..ff6e9c6860f6 100644 --- a/caffe2/python/dataio.py +++ b/caffe2/python/dataio.py @@ -15,10 +15,10 @@ See `dataset.py` for an example of implementation. """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from caffe2.python.schema import Field, Struct, from_blob_list diff --git a/caffe2/python/dataio_test.py b/caffe2/python/dataio_test.py index 26f1c0902f71..0c45fb50aed9 100644 --- a/caffe2/python/dataio_test.py +++ b/caffe2/python/dataio_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python.dataio import ( CompositeReader, diff --git a/caffe2/python/dataset.py b/caffe2/python/dataset.py index 387dbbaead58..4c2d4c806476 100644 --- a/caffe2/python/dataset.py +++ b/caffe2/python/dataset.py @@ -10,10 +10,10 @@ is stored as a set of native Caffe2 tensors, thus no type conversion or deserialization is necessary. 
""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python.dataio import Reader, Writer diff --git a/caffe2/python/db_file_reader.py b/caffe2/python/db_file_reader.py index 9296f1c6b7db..265b19251717 100644 --- a/caffe2/python/db_file_reader.py +++ b/caffe2/python/db_file_reader.py @@ -1,9 +1,9 @@ ## @package db_file_reader # Module caffe2.python.db_file_reader -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, scope, workspace, _import_c_extension as C from caffe2.python.dataio import Reader diff --git a/caffe2/python/db_test.py b/caffe2/python/db_test.py index f642202b36f0..f0f5d2770dc0 100644 --- a/caffe2/python/db_test.py +++ b/caffe2/python/db_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import workspace diff --git a/caffe2/python/docs/formatter.py b/caffe2/python/docs/formatter.py index 0a16420f6d5a..904f1731e960 100644 --- a/caffe2/python/docs/formatter.py +++ b/caffe2/python/docs/formatter.py @@ -1,9 +1,9 @@ ## @package formatter # Module caffe2.python.docs.formatter -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python.docs.parser import Parser diff --git a/caffe2/python/docs/generator.py b/caffe2/python/docs/generator.py index 1bc41b7d1ccb..c5a7df369bc2 100644 --- a/caffe2/python/docs/generator.py +++ b/caffe2/python/docs/generator.py @@ -1,9 +1,9 @@ ## @package generator # Module caffe2.python.docs.generator -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import argparse import os from caffe2.python import core, workspace diff --git a/caffe2/python/docs/github.py b/caffe2/python/docs/github.py index 5cb1fdcf5d7b..3fd78507346e 100644 --- a/caffe2/python/docs/github.py +++ b/caffe2/python/docs/github.py @@ -1,9 +1,9 @@ ## @package github # Module caffe2.python.docs.github -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import argparse import os from caffe2.python.docs.formatter import Markdown diff --git a/caffe2/python/docs/parser.py b/caffe2/python/docs/parser.py index 024989c97e25..a4edb6e07246 100644 --- a/caffe2/python/docs/parser.py +++ b/caffe2/python/docs/parser.py @@ -1,9 +1,9 @@ ## @package parser # Module caffe2.python.docs.parser -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import re diff --git a/caffe2/python/dyndep.py b/caffe2/python/dyndep.py index 8bea14423875..0382cc3a8212 100644 --- a/caffe2/python/dyndep.py +++ b/caffe2/python/dyndep.py @@ -1,9 +1,9 @@ ## @package dyndep # Module caffe2.python.dyndep -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import ctypes import os diff --git a/caffe2/python/embedding_generation_benchmark.py 
b/caffe2/python/embedding_generation_benchmark.py index a4d66036b93d..33dbf757dda4 100644 --- a/caffe2/python/embedding_generation_benchmark.py +++ b/caffe2/python/embedding_generation_benchmark.py @@ -1,9 +1,9 @@ ## @package embedding_generation_benchmark # Module caffe2.python.embedding_generation_benchmark -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python import workspace, core, utils, model_helper diff --git a/caffe2/python/examples/char_rnn.py b/caffe2/python/examples/char_rnn.py index fb2059f94868..59e85431e8bf 100644 --- a/caffe2/python/examples/char_rnn.py +++ b/caffe2/python/examples/char_rnn.py @@ -1,9 +1,9 @@ ## @package char_rnn # Module caffe2.python.examples.char_rnn -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace, model_helper, utils, brew from caffe2.python.rnn_cell import LSTM diff --git a/caffe2/python/examples/lmdb_create_example.py b/caffe2/python/examples/lmdb_create_example.py index b29b3b806001..af56069a7be0 100644 --- a/caffe2/python/examples/lmdb_create_example.py +++ b/caffe2/python/examples/lmdb_create_example.py @@ -1,9 +1,9 @@ ## @package lmdb_create_example # Module caffe2.python.examples.lmdb_create_example -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import argparse import numpy as np diff --git a/caffe2/python/experiment_util.py b/caffe2/python/experiment_util.py index cbe9491d9cf6..822a0a2950ba 100644 --- a/caffe2/python/experiment_util.py +++ b/caffe2/python/experiment_util.py @@ -1,9 +1,9 @@ ## @package experiment_util # Module caffe2.python.experiment_util -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import datetime import time diff --git a/caffe2/python/extension_loader.py b/caffe2/python/extension_loader.py index c533ae6d77bc..06c6707dcce9 100644 --- a/caffe2/python/extension_loader.py +++ b/caffe2/python/extension_loader.py @@ -1,9 +1,9 @@ ## @package extension_loader # Module caffe2.python.extension_loader -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import contextlib import ctypes import sys diff --git a/caffe2/python/fakefp16_transform_lib.py b/caffe2/python/fakefp16_transform_lib.py index 885f15732055..c3f142061479 100644 --- a/caffe2/python/fakefp16_transform_lib.py +++ b/caffe2/python/fakefp16_transform_lib.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + import caffe2.python._import_c_extension as C from caffe2.proto.caffe2_pb2 import NetDef diff --git a/caffe2/python/fakelowp/init_shared_libs.py b/caffe2/python/fakelowp/init_shared_libs.py index d289c7c4a97d..2a98de4571aa 100644 --- a/caffe2/python/fakelowp/init_shared_libs.py +++ b/caffe2/python/fakelowp/init_shared_libs.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import ctypes import os diff --git a/caffe2/python/fakelowp/test_utils.py 
b/caffe2/python/fakelowp/test_utils.py index 75e4422f3ccc..4a31a92e5bce 100644 --- a/caffe2/python/fakelowp/test_utils.py +++ b/caffe2/python/fakelowp/test_utils.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import sys import numpy as np diff --git a/caffe2/python/filler_test.py b/caffe2/python/filler_test.py index 52ea756d5bea..9aff384e99af 100644 --- a/caffe2/python/filler_test.py +++ b/caffe2/python/filler_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import core, test_util, workspace diff --git a/caffe2/python/functional.py b/caffe2/python/functional.py index 7c26f69a0c43..d32acb3d8a90 100644 --- a/caffe2/python/functional.py +++ b/caffe2/python/functional.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/functional_test.py b/caffe2/python/functional_test.py index e7803e829bb4..d90943761aa4 100644 --- a/caffe2/python/functional_test.py +++ b/caffe2/python/functional_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest diff --git a/caffe2/python/fused_8bit_rowwise_conversion_ops_test.py b/caffe2/python/fused_8bit_rowwise_conversion_ops_test.py index d2ecf118ea27..a7e5d714b63c 100644 --- a/caffe2/python/fused_8bit_rowwise_conversion_ops_test.py +++ b/caffe2/python/fused_8bit_rowwise_conversion_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/gradient_check_test.py b/caffe2/python/gradient_check_test.py index 1b492229a433..3f8dd83b5538 100644 --- a/caffe2/python/gradient_check_test.py +++ b/caffe2/python/gradient_check_test.py @@ -2,10 +2,10 @@ # can gradually remove this test script. DO NOT ADD MORE TESTS TO THIS # FILE. 
-from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np from caffe2.python import ( brew, diff --git a/caffe2/python/gradient_checker.py b/caffe2/python/gradient_checker.py index b1cdcc2bbb56..afb8d5071492 100644 --- a/caffe2/python/gradient_checker.py +++ b/caffe2/python/gradient_checker.py @@ -1,9 +1,9 @@ ## @package gradient_checker # Module caffe2.python.gradient_checker -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/python/gru_cell.py b/caffe2/python/gru_cell.py index e6caa2cae1eb..049a9152878a 100644 --- a/caffe2/python/gru_cell.py +++ b/caffe2/python/gru_cell.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import functools from caffe2.python import brew, rnn_cell diff --git a/caffe2/python/helpers/algebra.py b/caffe2/python/helpers/algebra.py index 6bc3779a4ca1..948c55ac88ce 100644 --- a/caffe2/python/helpers/algebra.py +++ b/caffe2/python/helpers/algebra.py @@ -1,9 +1,9 @@ ## @package algebra # Module caffe2.python.helpers.algebra -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + def transpose(model, blob_in, blob_out, use_cudnn=False, **kwargs): diff --git a/caffe2/python/helpers/arg_scope.py b/caffe2/python/helpers/arg_scope.py index ac6978be8064..a112e9b84c5d 100644 --- a/caffe2/python/helpers/arg_scope.py +++ b/caffe2/python/helpers/arg_scope.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + import contextlib import copy import threading diff --git a/caffe2/python/helpers/array_helpers.py b/caffe2/python/helpers/array_helpers.py index 3f8955331d4e..fae0011bf1f6 100644 --- a/caffe2/python/helpers/array_helpers.py +++ b/caffe2/python/helpers/array_helpers.py @@ -1,9 +1,9 @@ ## @package arra_helpers # Module caffe2.python.helpers.array_helpers -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + def concat(model, blobs_in, blob_out, **kwargs): diff --git a/caffe2/python/helpers/control_ops.py b/caffe2/python/helpers/control_ops.py index a738a71fe44c..c6f71d0761a5 100644 --- a/caffe2/python/helpers/control_ops.py +++ b/caffe2/python/helpers/control_ops.py @@ -1,9 +1,9 @@ ## @package control_ops # Module caffe2.python.helpers.control_ops -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python.control_ops_util import add_if_op, add_while_op diff --git a/caffe2/python/helpers/conv.py b/caffe2/python/helpers/conv.py index bb88b2e3757f..dfca165084df 100644 --- a/caffe2/python/helpers/conv.py +++ b/caffe2/python/helpers/conv.py @@ -1,9 +1,9 @@ ## @package conv # Module caffe2.python.helpers.conv -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from caffe2.python.modeling import initializers diff --git 
a/caffe2/python/helpers/db_input.py b/caffe2/python/helpers/db_input.py index 6e642a393da4..d5772cb7653e 100644 --- a/caffe2/python/helpers/db_input.py +++ b/caffe2/python/helpers/db_input.py @@ -1,9 +1,9 @@ ## @package db_input # Module caffe2.python.helpers.db_input -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + def db_input(model, blobs_out, batch_size, db, db_type): dbreader_name = "dbreader_" + db diff --git a/caffe2/python/helpers/dropout.py b/caffe2/python/helpers/dropout.py index 6fbb5bcda99a..d7280318f60d 100644 --- a/caffe2/python/helpers/dropout.py +++ b/caffe2/python/helpers/dropout.py @@ -1,9 +1,9 @@ ## @package dropout # Module caffe2.python.helpers.dropout -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + def dropout(model, blob_in, blob_out, use_cudnn=False, **kwargs): diff --git a/caffe2/python/helpers/elementwise_linear.py b/caffe2/python/helpers/elementwise_linear.py index 55fbd708489c..ef9184d00dd2 100644 --- a/caffe2/python/helpers/elementwise_linear.py +++ b/caffe2/python/helpers/elementwise_linear.py @@ -1,9 +1,9 @@ ## @package elementwise_linear # Module caffe2.python.helpers.elementwise_linear -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from caffe2.python.modeling.parameter_info import ParameterTags diff --git a/caffe2/python/helpers/fc.py b/caffe2/python/helpers/fc.py index 9d61dc7ac145..0feb2b65745e 100644 --- a/caffe2/python/helpers/fc.py +++ b/caffe2/python/helpers/fc.py @@ -1,9 +1,9 @@ ## @package fc # Module caffe2.python.helpers.fc -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from caffe2.python.modeling import initializers diff --git a/caffe2/python/helpers/nonlinearity.py b/caffe2/python/helpers/nonlinearity.py index f773cc3114de..3a8be3bb056a 100644 --- a/caffe2/python/helpers/nonlinearity.py +++ b/caffe2/python/helpers/nonlinearity.py @@ -1,9 +1,9 @@ ## @package nonlinearity # Module caffe2.python.helpers.nonlinearity -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core diff --git a/caffe2/python/helpers/normalization.py b/caffe2/python/helpers/normalization.py index 621f565b5455..b13b43f6859a 100644 --- a/caffe2/python/helpers/normalization.py +++ b/caffe2/python/helpers/normalization.py @@ -1,9 +1,9 @@ ## @package normalization # Module caffe2.python.helpers.normalization -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import scope from caffe2.python.modeling.parameter_info import ParameterTags diff --git a/caffe2/python/helpers/pooling.py b/caffe2/python/helpers/pooling.py index 412d55434d16..9e6fc784f289 100644 --- a/caffe2/python/helpers/pooling.py +++ b/caffe2/python/helpers/pooling.py @@ -2,10 +2,10 @@ # Module caffe2.python.helpers.pooling ## @package fc # Module caffe2.python.helpers.pooling -from __future__ import absolute_import -from __future__ import division -from 
__future__ import print_function -from __future__ import unicode_literals + + + + def max_pool(model, blob_in, blob_out, use_cudnn=False, order="NCHW", **kwargs): diff --git a/caffe2/python/helpers/tools.py b/caffe2/python/helpers/tools.py index 59defe9e236b..178620eab593 100644 --- a/caffe2/python/helpers/tools.py +++ b/caffe2/python/helpers/tools.py @@ -1,9 +1,9 @@ ## @package tools # Module caffe2.python.helpers.tools -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + def image_input( diff --git a/caffe2/python/helpers/train.py b/caffe2/python/helpers/train.py index bee36347808a..02883af7402d 100644 --- a/caffe2/python/helpers/train.py +++ b/caffe2/python/helpers/train.py @@ -1,9 +1,9 @@ ## @package train # Module caffe2.python.helpers.train -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, scope from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/hip_test_util.py b/caffe2/python/hip_test_util.py index 3910c9e5c2ce..beab3be1c40a 100644 --- a/caffe2/python/hip_test_util.py +++ b/caffe2/python/hip_test_util.py @@ -6,10 +6,10 @@ operators. """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/hsm_util.py b/caffe2/python/hsm_util.py index e98056f9cd88..ec465c12240e 100644 --- a/caffe2/python/hsm_util.py +++ b/caffe2/python/hsm_util.py @@ -1,9 +1,9 @@ ## @package hsm_util # Module caffe2.python.hsm_util -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import hsm_pb2 diff --git a/caffe2/python/hypothesis_test.py b/caffe2/python/hypothesis_test.py index 897be5fab44a..045677f8422a 100644 --- a/caffe2/python/hypothesis_test.py +++ b/caffe2/python/hypothesis_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + import numpy as np import copy @@ -10,7 +10,7 @@ from hypothesis import assume, given, settings, HealthCheck import hypothesis.strategies as st import unittest -import os +import threading from caffe2.python import core, workspace, tt_core, dyndep import caffe2.python.hypothesis_test_util as hu @@ -2695,6 +2695,60 @@ def histogram(X): self.assertDeviceChecks(dc, op, [X], [0, 1]) self.assertReferenceChecks(gc, op, [X], histogram) + @settings(max_examples=1, deadline=None) + @given( + queue_capacity=st.integers(2, 2), + time_sleep=st.integers(5, 10), + num_blobs_to_equeue=st.integers(1, 1), + num_blobs_to_dequeue=st.integers(2, 2), + ) + def test_safe_dequeue_blob__raises_exception_when_hang( + self, + queue_capacity, + time_sleep, + num_blobs_to_equeue, + num_blobs_to_dequeue, + ): + r""" + Tests that SafeDequeueBlobsOp is cancellable. + + Create a BlobsQueue that holds fewer blobs than SafeDequeueBlobs + requests, so running the net hangs on the dequeue. + + Then cancel the net from a background thread after a short sleep and + check that an exception is raised.
+ """ + + def _net_instance_cancel(net_instance): + time.sleep(time_sleep) + net_instance.cancel() + + init_net = core.Net("init_net") + init_net.Proto().type = "async_scheduling" + + queue = init_net.CreateBlobsQueue( + [], + "queue_name", + capacity=queue_capacity, + num_blobs=num_blobs_to_equeue, + ) + + ws = workspace.Workspace() + ws.create_net(init_net).run() + + net = core.Net("net") + net.Proto().type = "async_scheduling" + + blobs = net.SafeDequeueBlobs([queue], num_blobs_to_dequeue) + + net_instance = ws.create_net(net) + + t = threading.Thread(target=_net_instance_cancel, args=[net_instance]) + t.start() + + with self.assertRaises(Exception): + net_instance.run() + t.join() if __name__ == "__main__": diff --git a/caffe2/python/hypothesis_test_util.py b/caffe2/python/hypothesis_test_util.py index 797010b46890..2000e269969e 100644 --- a/caffe2/python/hypothesis_test_util.py +++ b/caffe2/python/hypothesis_test_util.py @@ -34,10 +34,10 @@ implemented on the CPU. """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python import ( workspace, device_checker, gradient_checker, test_util, core) diff --git a/caffe2/python/ideep/LRN_op_test.py b/caffe2/python/ideep/LRN_op_test.py index 956f10be8831..23ecd79062f7 100644 --- a/caffe2/python/ideep/LRN_op_test.py +++ b/caffe2/python/ideep/LRN_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/ideep/adam_op_test.py b/caffe2/python/ideep/adam_op_test.py index a0d9b2ce014f..5ac0395bff63 100644 --- a/caffe2/python/ideep/adam_op_test.py +++ b/caffe2/python/ideep/adam_op_test.py @@ -1,7 +1,7 @@ -from __future__ import unicode_literals -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + + import numpy as np import hypothesis.strategies as st diff --git a/caffe2/python/ideep/blobs_queue_db_test.py b/caffe2/python/ideep/blobs_queue_db_test.py index ded18e89c5ae..966fcc23d47d 100644 --- a/caffe2/python/ideep/blobs_queue_db_test.py +++ b/caffe2/python/ideep/blobs_queue_db_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import numpy as np diff --git a/caffe2/python/ideep/channel_shuffle_op_test.py b/caffe2/python/ideep/channel_shuffle_op_test.py index 8c3eea3d8618..b4cedca61061 100644 --- a/caffe2/python/ideep/channel_shuffle_op_test.py +++ b/caffe2/python/ideep/channel_shuffle_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/ideep/concat_split_op_test.py b/caffe2/python/ideep/concat_split_op_test.py index c28a7f1fe52c..75c9ceeba0e4 100644 --- a/caffe2/python/ideep/concat_split_op_test.py +++ b/caffe2/python/ideep/concat_split_op_test.py @@ -1,7 +1,7 @@ -from __future__ import unicode_literals -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + + import numpy as np import hypothesis.strategies 
as st diff --git a/caffe2/python/ideep/conv_op_test.py b/caffe2/python/ideep/conv_op_test.py index e82d8aec5515..ae4473ea4864 100644 --- a/caffe2/python/ideep/conv_op_test.py +++ b/caffe2/python/ideep/conv_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import sys diff --git a/caffe2/python/ideep/conv_transpose_test.py b/caffe2/python/ideep/conv_transpose_test.py index be35dbd8a382..eeda2ea43a2d 100644 --- a/caffe2/python/ideep/conv_transpose_test.py +++ b/caffe2/python/ideep/conv_transpose_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + import unittest import numpy as np diff --git a/caffe2/python/ideep/convfusion_op_test.py b/caffe2/python/ideep/convfusion_op_test.py index f24333745741..18ce574b623b 100644 --- a/caffe2/python/ideep/convfusion_op_test.py +++ b/caffe2/python/ideep/convfusion_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/ideep/copy_op_test.py b/caffe2/python/ideep/copy_op_test.py index 4b0a15bd999a..668282f2e159 100644 --- a/caffe2/python/ideep/copy_op_test.py +++ b/caffe2/python/ideep/copy_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import numpy as np diff --git a/caffe2/python/ideep/dropout_op_test.py b/caffe2/python/ideep/dropout_op_test.py index efecfb501bff..33b0a52a7421 100644 --- a/caffe2/python/ideep/dropout_op_test.py +++ b/caffe2/python/ideep/dropout_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest from hypothesis import given diff --git a/caffe2/python/ideep/elementwise_sum_op_test.py b/caffe2/python/ideep/elementwise_sum_op_test.py index 9daf34088fc0..11a35d6b2b28 100644 --- a/caffe2/python/ideep/elementwise_sum_op_test.py +++ b/caffe2/python/ideep/elementwise_sum_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/ideep/expanddims_squeeze_op_test.py b/caffe2/python/ideep/expanddims_squeeze_op_test.py index 4a4fb7319b25..3693a217bb4b 100644 --- a/caffe2/python/ideep/expanddims_squeeze_op_test.py +++ b/caffe2/python/ideep/expanddims_squeeze_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/ideep/fc_op_test.py b/caffe2/python/ideep/fc_op_test.py index 9e29bfaed919..6549bb6ad6bb 100644 --- a/caffe2/python/ideep/fc_op_test.py +++ b/caffe2/python/ideep/fc_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest from functools 
import reduce diff --git a/caffe2/python/ideep/leaky_relu_op_test.py b/caffe2/python/ideep/leaky_relu_op_test.py index 8a68d2e608ef..6d84f88f4fe2 100644 --- a/caffe2/python/ideep/leaky_relu_op_test.py +++ b/caffe2/python/ideep/leaky_relu_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/ideep/moment_sgd_op_test.py b/caffe2/python/ideep/moment_sgd_op_test.py index 06d0e9be0e57..596bab0ad3cc 100644 --- a/caffe2/python/ideep/moment_sgd_op_test.py +++ b/caffe2/python/ideep/moment_sgd_op_test.py @@ -1,7 +1,7 @@ -from __future__ import unicode_literals -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + + import numpy as np import hypothesis.strategies as st diff --git a/caffe2/python/ideep/operator_fallback_op_test.py b/caffe2/python/ideep/operator_fallback_op_test.py index 6d40a88b5c13..dc928c264082 100644 --- a/caffe2/python/ideep/operator_fallback_op_test.py +++ b/caffe2/python/ideep/operator_fallback_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/ideep/order_switch_op_test.py b/caffe2/python/ideep/order_switch_op_test.py index 8a967dcf9c08..a259e01bab10 100644 --- a/caffe2/python/ideep/order_switch_op_test.py +++ b/caffe2/python/ideep/order_switch_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import numpy as np diff --git a/caffe2/python/ideep/pool_op_test.py b/caffe2/python/ideep/pool_op_test.py index 9659d3961338..9ab3fcddbadb 100644 --- a/caffe2/python/ideep/pool_op_test.py +++ b/caffe2/python/ideep/pool_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/ideep/pre_convert_test.py b/caffe2/python/ideep/pre_convert_test.py index a32eedd74469..6c0b7ca5d7a7 100644 --- a/caffe2/python/ideep/pre_convert_test.py +++ b/caffe2/python/ideep/pre_convert_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/ideep/relu_op_test.py b/caffe2/python/ideep/relu_op_test.py index bd05c69381c5..e2fda68aed2b 100644 --- a/caffe2/python/ideep/relu_op_test.py +++ b/caffe2/python/ideep/relu_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/ideep/reshape_op_test.py b/caffe2/python/ideep/reshape_op_test.py index c9714f6eb4a5..c2bca948a52c 100644 --- a/caffe2/python/ideep/reshape_op_test.py +++ b/caffe2/python/ideep/reshape_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ 
import print_function -from __future__ import unicode_literals + + + + from caffe2.python.test_util import TestCase from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/ideep/shape_op_test.py b/caffe2/python/ideep/shape_op_test.py index e1ab30c12e45..47114832f85d 100644 --- a/caffe2/python/ideep/shape_op_test.py +++ b/caffe2/python/ideep/shape_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/ideep/sigmoid_op_test.py b/caffe2/python/ideep/sigmoid_op_test.py index b67932108084..2b5eb0e3a2b5 100644 --- a/caffe2/python/ideep/sigmoid_op_test.py +++ b/caffe2/python/ideep/sigmoid_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/ideep/softmax_op_test.py b/caffe2/python/ideep/softmax_op_test.py index 9043061514a0..b76d6509609b 100644 --- a/caffe2/python/ideep/softmax_op_test.py +++ b/caffe2/python/ideep/softmax_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/ideep/spatial_bn_op_test.py b/caffe2/python/ideep/spatial_bn_op_test.py index 25b83e2447fc..618a0e7fbfc3 100644 --- a/caffe2/python/ideep/spatial_bn_op_test.py +++ b/caffe2/python/ideep/spatial_bn_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from hypothesis import given, settings import hypothesis.strategies as st diff --git a/caffe2/python/ideep/test_ideep_net.py b/caffe2/python/ideep/test_ideep_net.py index b0483cf4c4b6..aa1c5bc260fa 100644 --- a/caffe2/python/ideep/test_ideep_net.py +++ b/caffe2/python/ideep/test_ideep_net.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace diff --git a/caffe2/python/ideep/transform_ideep_net.py b/caffe2/python/ideep/transform_ideep_net.py index 6345b76735a7..962d4051718b 100644 --- a/caffe2/python/ideep/transform_ideep_net.py +++ b/caffe2/python/ideep/transform_ideep_net.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import argparse import copy diff --git a/caffe2/python/ideep/transpose_op_test.py b/caffe2/python/ideep/transpose_op_test.py index b02085a3ba3b..8b324ed964ae 100644 --- a/caffe2/python/ideep/transpose_op_test.py +++ b/caffe2/python/ideep/transpose_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/ideep/weightedsum_op_test.py b/caffe2/python/ideep/weightedsum_op_test.py index 2a0b3ec3e7b0..b1e46fca4851 100644 --- a/caffe2/python/ideep/weightedsum_op_test.py +++ 
b/caffe2/python/ideep/weightedsum_op_test.py @@ -1,7 +1,7 @@ -from __future__ import unicode_literals -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + + import numpy as np import hypothesis.strategies as st diff --git a/caffe2/python/ideep_test_util.py b/caffe2/python/ideep_test_util.py index e131ee027c35..7129ed14ba74 100644 --- a/caffe2/python/ideep_test_util.py +++ b/caffe2/python/ideep_test_util.py @@ -6,10 +6,10 @@ operators. """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import hypothesis.strategies as st diff --git a/caffe2/python/layer_model_helper.py b/caffe2/python/layer_model_helper.py index 90e5a4d76b6d..7c3dda3b320c 100644 --- a/caffe2/python/layer_model_helper.py +++ b/caffe2/python/layer_model_helper.py @@ -1,9 +1,9 @@ # @package layer_model_helper # Module caffe2.python.layer_model_helper -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, model_helper, schema, scope, utils, muji from caffe2.python.modeling.parameter_info import ( diff --git a/caffe2/python/layer_model_instantiator.py b/caffe2/python/layer_model_instantiator.py index 9ceb1310bf30..9284b9b9e687 100644 --- a/caffe2/python/layer_model_instantiator.py +++ b/caffe2/python/layer_model_instantiator.py @@ -1,9 +1,9 @@ ## @package layer_model_instantiator # Module caffe2.python.layer_model_instantiator -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, schema from caffe2.python.layers.layers import InstantiationContext diff --git a/caffe2/python/layer_parameter_sharing_test.py b/caffe2/python/layer_parameter_sharing_test.py index 5d87dbd7522a..518412b9e90c 100644 --- a/caffe2/python/layer_parameter_sharing_test.py +++ b/caffe2/python/layer_parameter_sharing_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, scope from caffe2.python.modeling.parameter_sharing import ( diff --git a/caffe2/python/layer_test_util.py b/caffe2/python/layer_test_util.py index 2f2e23062e34..ae28e82b98cc 100644 --- a/caffe2/python/layer_test_util.py +++ b/caffe2/python/layer_test_util.py @@ -1,9 +1,9 @@ ## @package layer_test_util # Module caffe2.python.layer_test_util -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from collections import namedtuple diff --git a/caffe2/python/layers/__init__.py b/caffe2/python/layers/__init__.py index 2a09dc8419a6..487b7751fd08 100644 --- a/caffe2/python/layers/__init__.py +++ b/caffe2/python/layers/__init__.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from importlib import import_module import pkgutil diff --git a/caffe2/python/layers/adaptive_weight.py b/caffe2/python/layers/adaptive_weight.py index c081e8573038..146a0bdb1974 100644 --- a/caffe2/python/layers/adaptive_weight.py +++ b/caffe2/python/layers/adaptive_weight.py @@ -1,6 +1,6 @@ 
# @package adaptive_weight # Module caffe2.fb.python.layers.adaptive_weight -from __future__ import absolute_import, division, print_function, unicode_literals + import numpy as np from caffe2.python import core, schema diff --git a/caffe2/python/layers/add_bias.py b/caffe2/python/layers/add_bias.py index 0ffa46afb2b3..1a0fd8b295f3 100644 --- a/caffe2/python/layers/add_bias.py +++ b/caffe2/python/layers/add_bias.py @@ -1,9 +1,9 @@ ## @package add_bias # Module caffe2.python.layers.add_bias -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/layers/arc_cosine_feature_map.py b/caffe2/python/layers/arc_cosine_feature_map.py index 2409eca551a1..89c5014f5c5c 100644 --- a/caffe2/python/layers/arc_cosine_feature_map.py +++ b/caffe2/python/layers/arc_cosine_feature_map.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/layers/batch_huber_loss.py b/caffe2/python/layers/batch_huber_loss.py index 48b6ebcf8f58..0a5323625419 100644 --- a/caffe2/python/layers/batch_huber_loss.py +++ b/caffe2/python/layers/batch_huber_loss.py @@ -1,9 +1,9 @@ # @package batch_huber_loss # Module caffe2.python.layers.batch_huber_loss -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, schema from caffe2.python.layers.layers import ( diff --git a/caffe2/python/layers/batch_lr_loss.py b/caffe2/python/layers/batch_lr_loss.py index a560a3f654a9..46b0e4d42cdf 100644 --- a/caffe2/python/layers/batch_lr_loss.py +++ b/caffe2/python/layers/batch_lr_loss.py @@ -1,9 +1,9 @@ ## @package batch_lr_loss # Module caffe2.python.layers.batch_lr_loss -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, schema from caffe2.python.layers.layers import ( diff --git a/caffe2/python/layers/batch_mse_loss.py b/caffe2/python/layers/batch_mse_loss.py index 89da74f3c1e9..b0dd63ab09c8 100644 --- a/caffe2/python/layers/batch_mse_loss.py +++ b/caffe2/python/layers/batch_mse_loss.py @@ -1,9 +1,9 @@ ## @package batch_mse_loss # Module caffe2.python.layers.batch_mse_loss -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, schema from caffe2.python.layers.layers import ( diff --git a/caffe2/python/layers/batch_normalization.py b/caffe2/python/layers/batch_normalization.py index 9fe3ee51eb56..6395b09ff67f 100644 --- a/caffe2/python/layers/batch_normalization.py +++ b/caffe2/python/layers/batch_normalization.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/layers/batch_sigmoid_cross_entropy_loss.py b/caffe2/python/layers/batch_sigmoid_cross_entropy_loss.py 
index 9ef8cf563dbe..84e7d4873f50 100644 --- a/caffe2/python/layers/batch_sigmoid_cross_entropy_loss.py +++ b/caffe2/python/layers/batch_sigmoid_cross_entropy_loss.py @@ -1,9 +1,9 @@ ## @package batch_sigmoid_cross_entropy_loss # Module caffe2.python.layers.batch_sigmoid_cross_entropy_loss -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/layers/batch_softmax_loss.py b/caffe2/python/layers/batch_softmax_loss.py index d5f9413ef96a..30667a04c159 100644 --- a/caffe2/python/layers/batch_softmax_loss.py +++ b/caffe2/python/layers/batch_softmax_loss.py @@ -1,9 +1,9 @@ ## @package batch_softmax_loss # Module caffe2.python.layers.batch_softmax_loss -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, schema from caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/layers/blob_weighted_sum.py b/caffe2/python/layers/blob_weighted_sum.py index cf8ecfd99045..a37fab463581 100644 --- a/caffe2/python/layers/blob_weighted_sum.py +++ b/caffe2/python/layers/blob_weighted_sum.py @@ -1,9 +1,9 @@ ## @package BlobWeightedSum # Module caffe2.python.layers.blob_weighted_sum -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/layers/bpr_loss.py b/caffe2/python/layers/bpr_loss.py index 4e6a60fdaa57..389de8c241e8 100644 --- a/caffe2/python/layers/bpr_loss.py +++ b/caffe2/python/layers/bpr_loss.py @@ -1,9 +1,9 @@ ## @package bpr_loss # Module caffe2.python.layers.bpr_loss -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ( diff --git a/caffe2/python/layers/bucket_weighted.py b/caffe2/python/layers/bucket_weighted.py index 3c750e7b136f..2c200a922fdd 100644 --- a/caffe2/python/layers/bucket_weighted.py +++ b/caffe2/python/layers/bucket_weighted.py @@ -1,9 +1,9 @@ ## @package bucket_weighted # Module caffe2.python.layers.bucket_weighted -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import logging import numpy as np diff --git a/caffe2/python/layers/build_index.py b/caffe2/python/layers/build_index.py index b8c999bc256e..29c63f3d8948 100644 --- a/caffe2/python/layers/build_index.py +++ b/caffe2/python/layers/build_index.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/python/layers/concat.py b/caffe2/python/layers/concat.py index 062485757edc..fb1dc6ab6dbf 100644 --- a/caffe2/python/layers/concat.py +++ b/caffe2/python/layers/concat.py @@ -1,9 +1,9 @@ ## @package concat # Module caffe2.python.layers.concat -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from 
caffe2.python import schema from caffe2.python.layers.layers import ( diff --git a/caffe2/python/layers/constant_weight.py b/caffe2/python/layers/constant_weight.py index 06e9d9cd9b66..d160ed8206b3 100644 --- a/caffe2/python/layers/constant_weight.py +++ b/caffe2/python/layers/constant_weight.py @@ -1,9 +1,9 @@ # @package constant_weight # Module caffe2.fb.python.layers.constant_weight -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/layers/conv.py b/caffe2/python/layers/conv.py index bb22acf0cafa..e98bac7e2d80 100644 --- a/caffe2/python/layers/conv.py +++ b/caffe2/python/layers/conv.py @@ -1,9 +1,9 @@ ## @package conv # Module caffe2.python.layers.conv -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ( diff --git a/caffe2/python/layers/dropout.py b/caffe2/python/layers/dropout.py index a5d3f01a440e..4bc0cf2785b2 100644 --- a/caffe2/python/layers/dropout.py +++ b/caffe2/python/layers/dropout.py @@ -1,8 +1,8 @@ # Module caffe2.python.layers.dropout -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/layers/fc.py b/caffe2/python/layers/fc.py index a9eeceff2c21..9220f22165a3 100644 --- a/caffe2/python/layers/fc.py +++ b/caffe2/python/layers/fc.py @@ -1,9 +1,9 @@ ## @package fc # Module caffe2.python.layers.fc -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python.helpers.arg_scope import get_current_scope from caffe2.python import schema diff --git a/caffe2/python/layers/fc_with_bootstrap.py b/caffe2/python/layers/fc_with_bootstrap.py index 6a48f572ddba..b3c2eb346f96 100644 --- a/caffe2/python/layers/fc_with_bootstrap.py +++ b/caffe2/python/layers/fc_with_bootstrap.py @@ -1,6 +1,6 @@ ## @package fc_with_bootstrap # Module caffe2.python.layers.fc_with_bootstrap -from __future__ import absolute_import, division, print_function, unicode_literals + import math diff --git a/caffe2/python/layers/fc_without_bias.py b/caffe2/python/layers/fc_without_bias.py index e8923a8e5b9c..2899af618b79 100644 --- a/caffe2/python/layers/fc_without_bias.py +++ b/caffe2/python/layers/fc_without_bias.py @@ -1,9 +1,9 @@ ## @package fc_without_bias # Module caffe2.python.layers.fc_without_bias -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/layers/feature_sparse_to_dense.py b/caffe2/python/layers/feature_sparse_to_dense.py index 69fe91a48691..ca004d136ded 100644 --- a/caffe2/python/layers/feature_sparse_to_dense.py +++ b/caffe2/python/layers/feature_sparse_to_dense.py @@ -1,6 +1,6 @@ # @package sparse_to_dense # Module caffe2.python.layers.sparse_to_dense -from __future__ import absolute_import, division, print_function, unicode_literals + from collections 
import defaultdict diff --git a/caffe2/python/layers/functional.py b/caffe2/python/layers/functional.py index 53d5c050242f..c6d156fd68ce 100644 --- a/caffe2/python/layers/functional.py +++ b/caffe2/python/layers/functional.py @@ -1,9 +1,9 @@ # @package functional # Module caffe2.python.layers.functional -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, schema, scope, workspace from caffe2.python.layers.layers import ( diff --git a/caffe2/python/layers/gather_record.py b/caffe2/python/layers/gather_record.py index 1289c097902c..da468d5db90c 100644 --- a/caffe2/python/layers/gather_record.py +++ b/caffe2/python/layers/gather_record.py @@ -1,9 +1,9 @@ ## @package gather_record # Module caffe2.python.layers.gather_record -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, schema from caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/layers/homotopy_weight.py b/caffe2/python/layers/homotopy_weight.py index 63da1f04abf4..4c24223cbc8d 100644 --- a/caffe2/python/layers/homotopy_weight.py +++ b/caffe2/python/layers/homotopy_weight.py @@ -1,10 +1,10 @@ # @package homotopy_weight # Module caffe2.fb.python.layers.homotopy_weight -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, schema from caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/layers/label_smooth.py b/caffe2/python/layers/label_smooth.py index e2282e051611..7e4987270660 100644 --- a/caffe2/python/layers/label_smooth.py +++ b/caffe2/python/layers/label_smooth.py @@ -15,10 +15,10 @@ # @package label_smooth # Module caffe2.python.layers.label_smooth -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, schema from caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/layers/last_n_window_collector.py b/caffe2/python/layers/last_n_window_collector.py index fb93effbff2d..a16b731a2f78 100644 --- a/caffe2/python/layers/last_n_window_collector.py +++ b/caffe2/python/layers/last_n_window_collector.py @@ -1,9 +1,9 @@ ## @package last_n_window_collector # Module caffe2.python.layers.last_n_window_collector -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, schema from caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/layers/layer_normalization.py b/caffe2/python/layers/layer_normalization.py index 0dc6795994cb..580a03bfc5da 100644 --- a/caffe2/python/layers/layer_normalization.py +++ b/caffe2/python/layers/layer_normalization.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/layers/layers.py b/caffe2/python/layers/layers.py index 216d0b2e3286..abcdd1596220 100644 --- a/caffe2/python/layers/layers.py +++ 
b/caffe2/python/layers/layers.py @@ -1,6 +1,6 @@ ## @package layers # Module caffe2.python.layers.layers -from __future__ import absolute_import, division, print_function, unicode_literals + import logging from collections import namedtuple diff --git a/caffe2/python/layers/margin_rank_loss.py b/caffe2/python/layers/margin_rank_loss.py index 15267752caa3..6f97ade23ef4 100644 --- a/caffe2/python/layers/margin_rank_loss.py +++ b/caffe2/python/layers/margin_rank_loss.py @@ -1,9 +1,9 @@ ## @package random_neg_rank_loss # Module caffe2.python.layers.random_neg_rank_loss -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema, core from caffe2.python.layers.layers import ( diff --git a/caffe2/python/layers/merge_id_lists.py b/caffe2/python/layers/merge_id_lists.py index 117dd7904787..68c27b587567 100644 --- a/caffe2/python/layers/merge_id_lists.py +++ b/caffe2/python/layers/merge_id_lists.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ( diff --git a/caffe2/python/layers/pairwise_similarity.py b/caffe2/python/layers/pairwise_similarity.py index 30cb6ace2b81..5020e5432c2a 100644 --- a/caffe2/python/layers/pairwise_similarity.py +++ b/caffe2/python/layers/pairwise_similarity.py @@ -1,9 +1,9 @@ ## @package dot_product # Module caffe2.python.layers.dot_product -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ( diff --git a/caffe2/python/layers/position_weighted.py b/caffe2/python/layers/position_weighted.py index 19ddda2b6dcf..12e26bcd774e 100644 --- a/caffe2/python/layers/position_weighted.py +++ b/caffe2/python/layers/position_weighted.py @@ -1,9 +1,9 @@ ## @package position_weighted # Module caffe2.python.layers.position_weighted -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import logging import numpy as np diff --git a/caffe2/python/layers/random_fourier_features.py b/caffe2/python/layers/random_fourier_features.py index 6056da4ba7cf..bde05ab97147 100644 --- a/caffe2/python/layers/random_fourier_features.py +++ b/caffe2/python/layers/random_fourier_features.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/layers/reservoir_sampling.py b/caffe2/python/layers/reservoir_sampling.py index 3819a1971da4..21b9c44f2a79 100644 --- a/caffe2/python/layers/reservoir_sampling.py +++ b/caffe2/python/layers/reservoir_sampling.py @@ -1,9 +1,9 @@ ## @package reservoir_sampling # Module caffe2.python.layers.reservoir_sampling -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, schema from caffe2.python.layers.layers import ModelLayer diff --git a/caffe2/python/layers/sampling_train.py 
b/caffe2/python/layers/sampling_train.py index 1c617326da7f..034c897e2c2f 100644 --- a/caffe2/python/layers/sampling_train.py +++ b/caffe2/python/layers/sampling_train.py @@ -1,9 +1,9 @@ ## @package sampling_train # Module caffe2.python.layers.sampling_train -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ModelLayer, get_layer_class diff --git a/caffe2/python/layers/sampling_trainable_mixin.py b/caffe2/python/layers/sampling_trainable_mixin.py index 911fd8391e3f..403cc5a4a51c 100644 --- a/caffe2/python/layers/sampling_trainable_mixin.py +++ b/caffe2/python/layers/sampling_trainable_mixin.py @@ -1,9 +1,9 @@ ## @package sampling_trainable_mixin # Module caffe2.python.layers.sampling_trainable_mixin -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import abc import six diff --git a/caffe2/python/layers/select_record_by_context.py b/caffe2/python/layers/select_record_by_context.py index 65e44bece97c..49e42ca308d7 100644 --- a/caffe2/python/layers/select_record_by_context.py +++ b/caffe2/python/layers/select_record_by_context.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import logging diff --git a/caffe2/python/layers/semi_random_features.py b/caffe2/python/layers/semi_random_features.py index d7b96d956d08..58f30ac71f19 100644 --- a/caffe2/python/layers/semi_random_features.py +++ b/caffe2/python/layers/semi_random_features.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.arc_cosine_feature_map import ArcCosineFeatureMap diff --git a/caffe2/python/layers/sparse_dropout_with_replacement.py b/caffe2/python/layers/sparse_dropout_with_replacement.py index 8275d83d8734..3e03888e57dc 100644 --- a/caffe2/python/layers/sparse_dropout_with_replacement.py +++ b/caffe2/python/layers/sparse_dropout_with_replacement.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ( diff --git a/caffe2/python/layers/sparse_feature_hash.py b/caffe2/python/layers/sparse_feature_hash.py index 3927b199fbdf..c3ada99dc4a7 100644 --- a/caffe2/python/layers/sparse_feature_hash.py +++ b/caffe2/python/layers/sparse_feature_hash.py @@ -1,9 +1,9 @@ ## @package sparse_feature_hash # Module caffe2.python.layers.sparse_feature_hash -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema, core from caffe2.python.layers.layers import ( diff --git a/caffe2/python/layers/sparse_lookup.py b/caffe2/python/layers/sparse_lookup.py index 30cb60266c4d..dd1c42606063 100644 --- a/caffe2/python/layers/sparse_lookup.py +++ b/caffe2/python/layers/sparse_lookup.py @@ -1,9 +1,9 @@ ## @package sparse_lookup # Module caffe2.python.layers.sparse_lookup -from __future__ import absolute_import -from __future__ 
import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python.optimizer import FP16_ENGINES, Optimizer from caffe2.python.helpers.arg_scope import get_current_scope diff --git a/caffe2/python/layers/split.py b/caffe2/python/layers/split.py index a83881f5a091..58e569a272c7 100644 --- a/caffe2/python/layers/split.py +++ b/caffe2/python/layers/split.py @@ -1,9 +1,9 @@ ## @package split # Module caffe2.python.layers.split -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import schema from caffe2.python.layers.layers import ( diff --git a/caffe2/python/layers/tags.py b/caffe2/python/layers/tags.py index 28b7312dbcaa..5161ee2e1a96 100644 --- a/caffe2/python/layers/tags.py +++ b/caffe2/python/layers/tags.py @@ -1,9 +1,9 @@ ## @package tags # Module caffe2.python.layers.tags -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import six diff --git a/caffe2/python/layers/uniform_sampling.py b/caffe2/python/layers/uniform_sampling.py index 46ed29bbaa41..5581371d008d 100644 --- a/caffe2/python/layers/uniform_sampling.py +++ b/caffe2/python/layers/uniform_sampling.py @@ -1,9 +1,9 @@ ## @package uniform_sampling # Module caffe2.python.layers.uniform_sampling -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/python/layers_test.py b/caffe2/python/layers_test.py index 4d037a891ade..e084a011d357 100644 --- a/caffe2/python/layers_test.py +++ b/caffe2/python/layers_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import hypothesis.strategies as st import numpy as np diff --git a/caffe2/python/lazy_dyndep.py b/caffe2/python/lazy_dyndep.py index e1799838f4b2..e53d4fda350b 100644 --- a/caffe2/python/lazy_dyndep.py +++ b/caffe2/python/lazy_dyndep.py @@ -1,9 +1,9 @@ ## @package lazy_dyndep # Module caffe2.python.lazy_dyndep -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import os from caffe2.python import dyndep, lazy diff --git a/caffe2/python/lazy_dyndep_test.py b/caffe2/python/lazy_dyndep_test.py index 881215ac36e3..1441facd3a6f 100644 --- a/caffe2/python/lazy_dyndep_test.py +++ b/caffe2/python/lazy_dyndep_test.py @@ -1,9 +1,9 @@ #!/usr/bin/env python -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from hypothesis import given, settings import hypothesis.strategies as st diff --git a/caffe2/python/lengths_reducer_fused_8bit_rowwise_ops_test.py b/caffe2/python/lengths_reducer_fused_8bit_rowwise_ops_test.py index f08e9147d3ba..718b7fb3a987 100644 --- a/caffe2/python/lengths_reducer_fused_8bit_rowwise_ops_test.py +++ b/caffe2/python/lengths_reducer_fused_8bit_rowwise_ops_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import caffe2.python.hypothesis_test_util as hu import hypothesis.strategies as st diff --git 
a/caffe2/python/lengths_reducer_rowwise_8bit_ops_test.py b/caffe2/python/lengths_reducer_rowwise_8bit_ops_test.py index d73db5aaa36c..a38d442dd952 100644 --- a/caffe2/python/lengths_reducer_rowwise_8bit_ops_test.py +++ b/caffe2/python/lengths_reducer_rowwise_8bit_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/lstm_benchmark.py b/caffe2/python/lstm_benchmark.py index cfa53a81610c..29f819ec622e 100644 --- a/caffe2/python/lstm_benchmark.py +++ b/caffe2/python/lstm_benchmark.py @@ -1,9 +1,9 @@ ## @package lstm_benchmark # Module caffe2.python.lstm_benchmark -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python import workspace, core, utils, rnn_cell, model_helper diff --git a/caffe2/python/memonger.py b/caffe2/python/memonger.py index c299c817ace4..a728fc4e2157 100644 --- a/caffe2/python/memonger.py +++ b/caffe2/python/memonger.py @@ -1,9 +1,9 @@ ## @package memonger # Module caffe2.python.memonger -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import networkx as nx import collections diff --git a/caffe2/python/memonger_test.py b/caffe2/python/memonger_test.py index 7d5c52224b1c..8584e8d5e4cc 100644 --- a/caffe2/python/memonger_test.py +++ b/caffe2/python/memonger_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/python/mkl/mkl_LRN_op_test.py b/caffe2/python/mkl/mkl_LRN_op_test.py index 73df4820a5d1..2b084bea591b 100644 --- a/caffe2/python/mkl/mkl_LRN_op_test.py +++ b/caffe2/python/mkl/mkl_LRN_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/mkl/mkl_LRN_speed_test.py b/caffe2/python/mkl/mkl_LRN_speed_test.py index 35eae62d5be1..ae42902d9102 100644 --- a/caffe2/python/mkl/mkl_LRN_speed_test.py +++ b/caffe2/python/mkl/mkl_LRN_speed_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import numpy as np diff --git a/caffe2/python/mkl/mkl_concat_op_test.py b/caffe2/python/mkl/mkl_concat_op_test.py index a1a96ca755d9..8b01f8885b1c 100644 --- a/caffe2/python/mkl/mkl_concat_op_test.py +++ b/caffe2/python/mkl/mkl_concat_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/mkl/mkl_conv_op_test.py b/caffe2/python/mkl/mkl_conv_op_test.py index 38ceb680bb6d..f1fe7b062318 100644 --- a/caffe2/python/mkl/mkl_conv_op_test.py +++ b/caffe2/python/mkl/mkl_conv_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import 
division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/mkl/mkl_copy_op_test.py b/caffe2/python/mkl/mkl_copy_op_test.py index 633865cd5047..b2baeb9ef1af 100644 --- a/caffe2/python/mkl/mkl_copy_op_test.py +++ b/caffe2/python/mkl/mkl_copy_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/mkl/mkl_elementwise_add_op_test.py b/caffe2/python/mkl/mkl_elementwise_add_op_test.py index eab454ffe105..0709b5afd9f6 100644 --- a/caffe2/python/mkl/mkl_elementwise_add_op_test.py +++ b/caffe2/python/mkl/mkl_elementwise_add_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/mkl/mkl_elementwise_sum_op_test.py b/caffe2/python/mkl/mkl_elementwise_sum_op_test.py index 71e0754a0214..3adec4848e50 100644 --- a/caffe2/python/mkl/mkl_elementwise_sum_op_test.py +++ b/caffe2/python/mkl/mkl_elementwise_sum_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/mkl/mkl_fc_op_test.py b/caffe2/python/mkl/mkl_fc_op_test.py index 01e8c9b5a925..01786d55c337 100644 --- a/caffe2/python/mkl/mkl_fc_op_test.py +++ b/caffe2/python/mkl/mkl_fc_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/mkl/mkl_fc_speed_test.py b/caffe2/python/mkl/mkl_fc_speed_test.py index 7cabadfe1da0..85f5605e9676 100644 --- a/caffe2/python/mkl/mkl_fc_speed_test.py +++ b/caffe2/python/mkl/mkl_fc_speed_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import numpy as np diff --git a/caffe2/python/mkl/mkl_fill_op_test.py b/caffe2/python/mkl/mkl_fill_op_test.py index dbdf12c1aca4..26a9b7131b0b 100644 --- a/caffe2/python/mkl/mkl_fill_op_test.py +++ b/caffe2/python/mkl/mkl_fill_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/mkl/mkl_pool_op_test.py b/caffe2/python/mkl/mkl_pool_op_test.py index b733edaace1c..a56e9448317a 100644 --- a/caffe2/python/mkl/mkl_pool_op_test.py +++ b/caffe2/python/mkl/mkl_pool_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/mkl/mkl_pool_speed_test.py b/caffe2/python/mkl/mkl_pool_speed_test.py index a0fa8ca6ece8..b25e0f915cc7 100644 --- a/caffe2/python/mkl/mkl_pool_speed_test.py +++ 
b/caffe2/python/mkl/mkl_pool_speed_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import numpy as np diff --git a/caffe2/python/mkl/mkl_relu_op_test.py b/caffe2/python/mkl/mkl_relu_op_test.py index 90e365da554b..76ec33bcbe91 100644 --- a/caffe2/python/mkl/mkl_relu_op_test.py +++ b/caffe2/python/mkl/mkl_relu_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/mkl/mkl_sbn_op_test.py b/caffe2/python/mkl/mkl_sbn_op_test.py index 4a5fad2b7b68..2ac9080ce670 100644 --- a/caffe2/python/mkl/mkl_sbn_op_test.py +++ b/caffe2/python/mkl/mkl_sbn_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/mkl/mkl_sbn_speed_test.py b/caffe2/python/mkl/mkl_sbn_speed_test.py index d37bef32b9b7..3b3b71d1c997 100644 --- a/caffe2/python/mkl/mkl_sbn_speed_test.py +++ b/caffe2/python/mkl/mkl_sbn_speed_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import numpy as np diff --git a/caffe2/python/mkl/mkl_sigmoid_op_test.py b/caffe2/python/mkl/mkl_sigmoid_op_test.py index 654008c67b7d..abdb0983778d 100644 --- a/caffe2/python/mkl/mkl_sigmoid_op_test.py +++ b/caffe2/python/mkl/mkl_sigmoid_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/mkl/mkl_speed_test.py b/caffe2/python/mkl/mkl_speed_test.py index 4034705580d5..9a7310a484d1 100644 --- a/caffe2/python/mkl/mkl_speed_test.py +++ b/caffe2/python/mkl/mkl_speed_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import numpy as np diff --git a/caffe2/python/mkl/mkl_squeeze_op_test.py b/caffe2/python/mkl/mkl_squeeze_op_test.py index 1e4b5791b0b6..8af090f60d88 100644 --- a/caffe2/python/mkl/mkl_squeeze_op_test.py +++ b/caffe2/python/mkl/mkl_squeeze_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/mkl/rewrite_graph.py b/caffe2/python/mkl/rewrite_graph.py index c003e0e3b09b..3a88a3deeccc 100644 --- a/caffe2/python/mkl/rewrite_graph.py +++ b/caffe2/python/mkl/rewrite_graph.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import copy from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/mkl/rewrite_graph_test.py b/caffe2/python/mkl/rewrite_graph_test.py index 42e3269fc4d8..1ad209cdbdfd 100644 --- a/caffe2/python/mkl/rewrite_graph_test.py +++ 
b/caffe2/python/mkl/rewrite_graph_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import numpy as np diff --git a/caffe2/python/mkl_test_util.py b/caffe2/python/mkl_test_util.py index 5d8f66500190..88fb3cc800ec 100644 --- a/caffe2/python/mkl_test_util.py +++ b/caffe2/python/mkl_test_util.py @@ -6,10 +6,10 @@ operators. """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import hypothesis.strategies as st diff --git a/caffe2/python/model_helper.py b/caffe2/python/model_helper.py index a26bf844f2de..a5a4865c0ec1 100644 --- a/caffe2/python/model_helper.py +++ b/caffe2/python/model_helper.py @@ -1,9 +1,9 @@ ## @package model_helper # Module caffe2.python.model_helper -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, scope, workspace from caffe2.python.helpers.db_input import db_input diff --git a/caffe2/python/model_helper_test.py b/caffe2/python/model_helper_test.py index fcccddf401db..1423e4a97733 100644 --- a/caffe2/python/model_helper_test.py +++ b/caffe2/python/model_helper_test.py @@ -1,6 +1,6 @@ """unittest for ModelHelper class""" -from __future__ import absolute_import, division, print_function + import unittest diff --git a/caffe2/python/modeling/compute_histogram_for_blobs.py b/caffe2/python/modeling/compute_histogram_for_blobs.py index 3b5ea4b64cba..ea83f96f7019 100644 --- a/caffe2/python/modeling/compute_histogram_for_blobs.py +++ b/caffe2/python/modeling/compute_histogram_for_blobs.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, schema from caffe2.python.modeling.net_modifier import NetModifier diff --git a/caffe2/python/modeling/compute_histogram_for_blobs_test.py b/caffe2/python/modeling/compute_histogram_for_blobs_test.py index 6c3b59950898..4ce6bf11487a 100644 --- a/caffe2/python/modeling/compute_histogram_for_blobs_test.py +++ b/caffe2/python/modeling/compute_histogram_for_blobs_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest from caffe2.python import workspace, brew, model_helper diff --git a/caffe2/python/modeling/compute_norm_for_blobs.py b/caffe2/python/modeling/compute_norm_for_blobs.py index 41b7f88d24eb..24ed7a7482c7 100644 --- a/caffe2/python/modeling/compute_norm_for_blobs.py +++ b/caffe2/python/modeling/compute_norm_for_blobs.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, schema, muji from caffe2.python.modeling.net_modifier import NetModifier diff --git a/caffe2/python/modeling/compute_norm_for_blobs_test.py b/caffe2/python/modeling/compute_norm_for_blobs_test.py index 3fefce0c4420..1bf3dae0353f 100644 --- a/caffe2/python/modeling/compute_norm_for_blobs_test.py +++ b/caffe2/python/modeling/compute_norm_for_blobs_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from 
__future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest from caffe2.python import workspace, brew, model_helper diff --git a/caffe2/python/modeling/compute_statistics_for_blobs.py b/caffe2/python/modeling/compute_statistics_for_blobs.py index 9a3fbcc96954..588b4a827cb8 100644 --- a/caffe2/python/modeling/compute_statistics_for_blobs.py +++ b/caffe2/python/modeling/compute_statistics_for_blobs.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, schema from caffe2.python.modeling.net_modifier import NetModifier diff --git a/caffe2/python/modeling/compute_statistics_for_blobs_test.py b/caffe2/python/modeling/compute_statistics_for_blobs_test.py index e880f3edacb1..bf75a1f7d149 100644 --- a/caffe2/python/modeling/compute_statistics_for_blobs_test.py +++ b/caffe2/python/modeling/compute_statistics_for_blobs_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest from caffe2.python import workspace, brew, model_helper diff --git a/caffe2/python/modeling/get_entry_from_blobs.py b/caffe2/python/modeling/get_entry_from_blobs.py index 88daa226c887..061dfe33991b 100644 --- a/caffe2/python/modeling/get_entry_from_blobs.py +++ b/caffe2/python/modeling/get_entry_from_blobs.py @@ -13,10 +13,10 @@ # limitations under the License. ############################################################################## -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, schema from caffe2.python.modeling.net_modifier import NetModifier diff --git a/caffe2/python/modeling/get_entry_from_blobs_test.py b/caffe2/python/modeling/get_entry_from_blobs_test.py index 8f4fbb864be1..3ec146766f30 100644 --- a/caffe2/python/modeling/get_entry_from_blobs_test.py +++ b/caffe2/python/modeling/get_entry_from_blobs_test.py @@ -13,10 +13,10 @@ # limitations under the License. ############################################################################## -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest from caffe2.python import workspace, brew, model_helper diff --git a/caffe2/python/modeling/gradient_clipping.py b/caffe2/python/modeling/gradient_clipping.py index 1999ced9ba1b..b01bc2ba301f 100644 --- a/caffe2/python/modeling/gradient_clipping.py +++ b/caffe2/python/modeling/gradient_clipping.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/modeling/gradient_clipping_test.py b/caffe2/python/modeling/gradient_clipping_test.py index ca5c2ba8e22b..0b0e962cb727 100644 --- a/caffe2/python/modeling/gradient_clipping_test.py +++ b/caffe2/python/modeling/gradient_clipping_test.py @@ -13,10 +13,10 @@ # limitations under the License. 
############################################################################## -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest from caffe2.python import workspace, brew, model_helper diff --git a/caffe2/python/modeling/initializers.py b/caffe2/python/modeling/initializers.py index 2053d9e53976..b3e4b1a44dd7 100644 --- a/caffe2/python/modeling/initializers.py +++ b/caffe2/python/modeling/initializers.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python.core import DataType, BlobReference, ScopedBlobReference from caffe2.python.modeling.parameter_info import ParameterInfo diff --git a/caffe2/python/modeling/initializers_test.py b/caffe2/python/modeling/initializers_test.py index 0355d1871787..fad40c159b6e 100644 --- a/caffe2/python/modeling/initializers_test.py +++ b/caffe2/python/modeling/initializers_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest from caffe2.python import brew, model_helper, workspace diff --git a/caffe2/python/modeling/net_modifier.py b/caffe2/python/modeling/net_modifier.py index 0f0ac7535c88..e824c828e4bd 100644 --- a/caffe2/python/modeling/net_modifier.py +++ b/caffe2/python/modeling/net_modifier.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import abc import six diff --git a/caffe2/python/modeling/parameter_info.py b/caffe2/python/modeling/parameter_info.py index 589aa51a7b1c..195048cf91e8 100644 --- a/caffe2/python/modeling/parameter_info.py +++ b/caffe2/python/modeling/parameter_info.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core diff --git a/caffe2/python/modeling/parameter_sharing.py b/caffe2/python/modeling/parameter_sharing.py index 77e5cbd3f8bc..a0174500a413 100644 --- a/caffe2/python/modeling/parameter_sharing.py +++ b/caffe2/python/modeling/parameter_sharing.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import scope diff --git a/caffe2/python/modeling/parameter_sharing_test.py b/caffe2/python/modeling/parameter_sharing_test.py index f616fc1ea6ed..d37e40880c02 100644 --- a/caffe2/python/modeling/parameter_sharing_test.py +++ b/caffe2/python/modeling/parameter_sharing_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import brew, model_helper, scope from caffe2.python.modeling.parameter_sharing import ( diff --git a/caffe2/python/models/__sym_init__.py b/caffe2/python/models/__sym_init__.py index 79f045879ebc..fa10bff7246b 100644 --- a/caffe2/python/models/__sym_init__.py +++ b/caffe2/python/models/__sym_init__.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from 
__future__ import unicode_literals + + + + import os from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/models/download.py b/caffe2/python/models/download.py index 4b9a570de807..46a9b59f6627 100644 --- a/caffe2/python/models/download.py +++ b/caffe2/python/models/download.py @@ -1,9 +1,9 @@ ## @package download # Module caffe2.python.models.download -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import argparse import os import sys diff --git a/caffe2/python/models/imagenet_trainer_test_utils.py b/caffe2/python/models/imagenet_trainer_test_utils.py index 59107336ccd6..fec7708ea150 100644 --- a/caffe2/python/models/imagenet_trainer_test_utils.py +++ b/caffe2/python/models/imagenet_trainer_test_utils.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import time diff --git a/caffe2/python/models/resnet.py b/caffe2/python/models/resnet.py index 41ca087d9637..430d3d335e1e 100644 --- a/caffe2/python/models/resnet.py +++ b/caffe2/python/models/resnet.py @@ -1,9 +1,9 @@ ## @package resnet # Module caffe2.python.models.resnet -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import brew import logging diff --git a/caffe2/python/models/resnet_test.py b/caffe2/python/models/resnet_test.py index ce542e8da046..38d87cefff05 100644 --- a/caffe2/python/models/resnet_test.py +++ b/caffe2/python/models/resnet_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/python/models/seq2seq/beam_search.py b/caffe2/python/models/seq2seq/beam_search.py index 7b909697fb05..6fc9f8ece480 100644 --- a/caffe2/python/models/seq2seq/beam_search.py +++ b/caffe2/python/models/seq2seq/beam_search.py @@ -1,9 +1,9 @@ ## @package beam_search # Module caffe2.python.models.seq2seq.beam_search -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from collections import namedtuple from caffe2.python import core diff --git a/caffe2/python/models/seq2seq/seq2seq_beam_search_test.py b/caffe2/python/models/seq2seq/seq2seq_beam_search_test.py index 0ee1f6e35ba0..c10d2f1ab4ed 100644 --- a/caffe2/python/models/seq2seq/seq2seq_beam_search_test.py +++ b/caffe2/python/models/seq2seq/seq2seq_beam_search_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import os diff --git a/caffe2/python/models/seq2seq/seq2seq_model_helper.py b/caffe2/python/models/seq2seq/seq2seq_model_helper.py index b2a50c4bd58b..5adabb86fadf 100644 --- a/caffe2/python/models/seq2seq/seq2seq_model_helper.py +++ b/caffe2/python/models/seq2seq/seq2seq_model_helper.py @@ -1,9 +1,9 @@ ## @package seq2seq_model_helper # Module caffe2.python.models.seq2seq.seq2seq_model_helper -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import scope from caffe2.python.model_helper 
import ModelHelper diff --git a/caffe2/python/models/seq2seq/seq2seq_model_helper_test.py b/caffe2/python/models/seq2seq/seq2seq_model_helper_test.py index 8095440f2e5a..b70b74d39dc9 100644 --- a/caffe2/python/models/seq2seq/seq2seq_model_helper_test.py +++ b/caffe2/python/models/seq2seq/seq2seq_model_helper_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python.models.seq2seq import seq2seq_model_helper from caffe2.python import scope, test_util diff --git a/caffe2/python/models/seq2seq/seq2seq_util.py b/caffe2/python/models/seq2seq/seq2seq_util.py index d0702880c1ec..e1b4224ea4c8 100644 --- a/caffe2/python/models/seq2seq/seq2seq_util.py +++ b/caffe2/python/models/seq2seq/seq2seq_util.py @@ -2,10 +2,10 @@ # Module caffe2.python.examples.seq2seq_util """ A bunch of util functions to build Seq2Seq models with Caffe2.""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import collections from future.utils import viewitems diff --git a/caffe2/python/models/seq2seq/train.py b/caffe2/python/models/seq2seq/train.py index df68e3e30d7b..8080318da4d0 100644 --- a/caffe2/python/models/seq2seq/train.py +++ b/caffe2/python/models/seq2seq/train.py @@ -1,9 +1,9 @@ ## @package train # Module caffe2.python.models.seq2seq.train -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import argparse import collections diff --git a/caffe2/python/models/seq2seq/translate.py b/caffe2/python/models/seq2seq/translate.py index d2b6a4f6399f..7e77f623e553 100644 --- a/caffe2/python/models/seq2seq/translate.py +++ b/caffe2/python/models/seq2seq/translate.py @@ -1,9 +1,9 @@ ## @package translate # Module caffe2.python.models.seq2seq.translate -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from abc import ABCMeta, abstractmethod import argparse diff --git a/caffe2/python/models/shufflenet.py b/caffe2/python/models/shufflenet.py index c9075a4a1295..33a7f7a4b7c5 100644 --- a/caffe2/python/models/shufflenet.py +++ b/caffe2/python/models/shufflenet.py @@ -1,9 +1,9 @@ # Module caffe2.python.models.shufflenet -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import brew diff --git a/caffe2/python/models/shufflenet_test.py b/caffe2/python/models/shufflenet_test.py index 344c720b3eb6..6ccfd0a83354 100644 --- a/caffe2/python/models/shufflenet_test.py +++ b/caffe2/python/models/shufflenet_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/python/modifier_context.py b/caffe2/python/modifier_context.py index 008e651e41f7..b65d97587549 100644 --- a/caffe2/python/modifier_context.py +++ b/caffe2/python/modifier_context.py @@ -1,9 +1,9 @@ # @package modifier_context # Module caffe2.python.modifier_context -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + 
DEFAULT_MODIFIER = 'DEFAULT' diff --git a/caffe2/python/net_builder.py b/caffe2/python/net_builder.py index f1af8c3eb521..70dcdec11a58 100644 --- a/caffe2/python/net_builder.py +++ b/caffe2/python/net_builder.py @@ -1,9 +1,9 @@ ## @package net_builder # Module caffe2.python.net_builder -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, context from caffe2.python.task import Task, TaskGroup diff --git a/caffe2/python/net_builder_test.py b/caffe2/python/net_builder_test.py index 169419c5c17b..bef6caefac3d 100644 --- a/caffe2/python/net_builder_test.py +++ b/caffe2/python/net_builder_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import workspace from caffe2.python.core import Plan, to_execution_step, Net diff --git a/caffe2/python/net_drawer.py b/caffe2/python/net_drawer.py index 1fd0833a718f..b55699c1c095 100644 --- a/caffe2/python/net_drawer.py +++ b/caffe2/python/net_drawer.py @@ -1,9 +1,9 @@ ## @package net_drawer # Module caffe2.python.net_drawer -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import argparse import json import logging diff --git a/caffe2/python/net_printer.py b/caffe2/python/net_printer.py index 09cde6f76767..8e1d65c01ce7 100644 --- a/caffe2/python/net_printer.py +++ b/caffe2/python/net_printer.py @@ -1,9 +1,9 @@ ## @package net_printer # Module caffe2.python.net_printer -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto.caffe2_pb2 import OperatorDef, NetDef from caffe2.python.checkpoint import Job diff --git a/caffe2/python/net_printer_test.py b/caffe2/python/net_printer_test.py index bc086c3eee2a..e71a2b323dea 100644 --- a/caffe2/python/net_printer_test.py +++ b/caffe2/python/net_printer_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import net_printer from caffe2.python.checkpoint import Job diff --git a/caffe2/python/nomnigraph.py b/caffe2/python/nomnigraph.py index c2f1774c7b2b..2b83e0ec9358 100644 --- a/caffe2/python/nomnigraph.py +++ b/caffe2/python/nomnigraph.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import errno import os diff --git a/caffe2/python/nomnigraph_test.py b/caffe2/python/nomnigraph_test.py index 6ff47c6d4c9a..3d9adc696486 100644 --- a/caffe2/python/nomnigraph_test.py +++ b/caffe2/python/nomnigraph_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace, test_util from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/nomnigraph_transformations.py b/caffe2/python/nomnigraph_transformations.py index f4bc2c68bbb6..570c743df152 100644 --- a/caffe2/python/nomnigraph_transformations.py +++ b/caffe2/python/nomnigraph_transformations.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals 
+ from collections import defaultdict diff --git a/caffe2/python/nomnigraph_transformations_test.py b/caffe2/python/nomnigraph_transformations_test.py index 6c58691db277..adbfe1a4885a 100644 --- a/caffe2/python/nomnigraph_transformations_test.py +++ b/caffe2/python/nomnigraph_transformations_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python import test_util as tu diff --git a/caffe2/python/normalizer.py b/caffe2/python/normalizer.py index 1d452c6cbe60..2ca147328c78 100644 --- a/caffe2/python/normalizer.py +++ b/caffe2/python/normalizer.py @@ -1,6 +1,6 @@ # @package optimizer # Module caffe2.python.normalizer -from __future__ import absolute_import, division, print_function, unicode_literals + class Normalizer(object): diff --git a/caffe2/python/normalizer_context.py b/caffe2/python/normalizer_context.py index 57c1052103dc..a85b993b4502 100644 --- a/caffe2/python/normalizer_context.py +++ b/caffe2/python/normalizer_context.py @@ -1,9 +1,9 @@ # @package regularizer_context # Module caffe2.python.normalizer_context -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import context from caffe2.python.modifier_context import ( diff --git a/caffe2/python/normalizer_test.py b/caffe2/python/normalizer_test.py index 1f4cb4896778..f0ce5099ea75 100644 --- a/caffe2/python/normalizer_test.py +++ b/caffe2/python/normalizer_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python.normalizer_context import UseNormalizer, NormalizerContext from caffe2.python.normalizer import BatchNormalizer diff --git a/caffe2/python/numa_benchmark.py b/caffe2/python/numa_benchmark.py index 21c1cb158da1..a840c6932123 100644 --- a/caffe2/python/numa_benchmark.py +++ b/caffe2/python/numa_benchmark.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import core, workspace from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/numa_test.py b/caffe2/python/numa_test.py index 692f515abe87..aba6e420ed55 100644 --- a/caffe2/python/numa_test.py +++ b/caffe2/python/numa_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import core, workspace from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/observer_test.py b/caffe2/python/observer_test.py index 684514d17268..cc3ca1718a5c 100644 --- a/caffe2/python/observer_test.py +++ b/caffe2/python/observer_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import unittest diff --git a/caffe2/python/onnx/backend.py b/caffe2/python/onnx/backend.py index 9fe7b23bb7ae..d0f768e42eeb 100644 --- a/caffe2/python/onnx/backend.py +++ b/caffe2/python/onnx/backend.py @@ -5,10 +5,10 @@ To run this, you will need to have Caffe2 installed as well. 
""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import os import collections diff --git a/caffe2/python/onnx/backend_cpp_rep.py b/caffe2/python/onnx/backend_cpp_rep.py index 27135b35763d..4a75068cfd03 100644 --- a/caffe2/python/onnx/backend_cpp_rep.py +++ b/caffe2/python/onnx/backend_cpp_rep.py @@ -1,10 +1,10 @@ ## @package onnx # Module caffe2.python.onnx.backend_rep_cpp -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from onnx.backend.base import BackendRep, namedtupledict diff --git a/caffe2/python/onnx/backend_rep.py b/caffe2/python/onnx/backend_rep.py index 13feea3ac8c9..ab97fd562dc1 100644 --- a/caffe2/python/onnx/backend_rep.py +++ b/caffe2/python/onnx/backend_rep.py @@ -1,9 +1,9 @@ # @package onnx # Module caffe2.python.onnx.backend_rep -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/onnx/bin/conversion.py b/caffe2/python/onnx/bin/conversion.py index a30ebdfc3f54..126eef8a8470 100644 --- a/caffe2/python/onnx/bin/conversion.py +++ b/caffe2/python/onnx/bin/conversion.py @@ -1,9 +1,9 @@ ## @package onnx # Module caffe2.python.onnx.bin.conversion -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + import json diff --git a/caffe2/python/onnx/error.py b/caffe2/python/onnx/error.py index da72af2cc9b1..1bac8290464d 100644 --- a/caffe2/python/onnx/error.py +++ b/caffe2/python/onnx/error.py @@ -1,8 +1,8 @@ ## @package onnx # Module caffe2.python.onnx.error -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + class BaseException(Exception): pass class Unsupported(BaseException): pass diff --git a/caffe2/python/onnx/frontend.py b/caffe2/python/onnx/frontend.py index 0fc1c0328093..ee3c30949ff7 100644 --- a/caffe2/python/onnx/frontend.py +++ b/caffe2/python/onnx/frontend.py @@ -6,10 +6,10 @@ To run this, you will need to have Caffe2 installed as well. 
""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import itertools import logging diff --git a/caffe2/python/onnx/helper.py b/caffe2/python/onnx/helper.py index e1d56e1a6766..7f8f1a6d346a 100644 --- a/caffe2/python/onnx/helper.py +++ b/caffe2/python/onnx/helper.py @@ -1,9 +1,9 @@ ## @package onnx # Module caffe2.python.onnx.helper -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from onnx.backend.base import namedtupledict diff --git a/caffe2/python/onnx/onnxifi.py b/caffe2/python/onnx/onnxifi.py index 6bbd35cd434c..a04e7e4554b9 100644 --- a/caffe2/python/onnx/onnxifi.py +++ b/caffe2/python/onnx/onnxifi.py @@ -5,10 +5,10 @@ ONNXIFI a Caffe2 net """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace diff --git a/caffe2/python/onnx/test_onnxifi.py b/caffe2/python/onnx/test_onnxifi.py index a859b572bae6..7eafccaec9e4 100644 --- a/caffe2/python/onnx/test_onnxifi.py +++ b/caffe2/python/onnx/test_onnxifi.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import json import numpy as np diff --git a/caffe2/python/onnx/tests/__init__.py b/caffe2/python/onnx/tests/__init__.py index e0a02b9d5d83..fd40910d9e70 100644 --- a/caffe2/python/onnx/tests/__init__.py +++ b/caffe2/python/onnx/tests/__init__.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + diff --git a/caffe2/python/onnx/tests/c2_ref_test.py b/caffe2/python/onnx/tests/c2_ref_test.py index d909cf828042..d253b06658a3 100644 --- a/caffe2/python/onnx/tests/c2_ref_test.py +++ b/caffe2/python/onnx/tests/c2_ref_test.py @@ -1,10 +1,10 @@ # @package onnx # Module caffe2.python.onnx.tests.c2_ref_test -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import json import os diff --git a/caffe2/python/onnx/tests/conversion_test.py b/caffe2/python/onnx/tests/conversion_test.py index 8fa128acd62b..86cdddcd1692 100644 --- a/caffe2/python/onnx/tests/conversion_test.py +++ b/caffe2/python/onnx/tests/conversion_test.py @@ -1,9 +1,9 @@ ## @package onnx # Module caffe2.python.onnx.tests.conversion_test -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + import json import six diff --git a/caffe2/python/onnx/tests/helper_test.py b/caffe2/python/onnx/tests/helper_test.py index e3682780cb04..9000ad94fd9b 100644 --- a/caffe2/python/onnx/tests/helper_test.py +++ b/caffe2/python/onnx/tests/helper_test.py @@ -1,10 +1,10 @@ ## @package onnx # Module caffe2.python.onnx.tests.helper_test -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest diff --git a/caffe2/python/onnx/tests/onnx_backend_test.py b/caffe2/python/onnx/tests/onnx_backend_test.py index ad7885fcda74..e4de0a19c07a 100644 
--- a/caffe2/python/onnx/tests/onnx_backend_test.py +++ b/caffe2/python/onnx/tests/onnx_backend_test.py @@ -1,10 +1,10 @@ # @package onnx # Module caffe2.python.onnx.tests.onnx_backend_test -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import os diff --git a/caffe2/python/onnx/tests/ssa_test.py b/caffe2/python/onnx/tests/ssa_test.py index 34f849400e30..d34d4a0e5287 100644 --- a/caffe2/python/onnx/tests/ssa_test.py +++ b/caffe2/python/onnx/tests/ssa_test.py @@ -1,10 +1,10 @@ ## @package onnx # Module caffe2.python.onnx.tests.ssa_test -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import copy import onnx diff --git a/caffe2/python/onnx/tests/test_utils.py b/caffe2/python/onnx/tests/test_utils.py index 1fec938c8e88..d224daf05ba3 100644 --- a/caffe2/python/onnx/tests/test_utils.py +++ b/caffe2/python/onnx/tests/test_utils.py @@ -1,10 +1,10 @@ ## @package onnx # Module caffe2.python.onnx.tests.test_utils -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import os import unittest diff --git a/caffe2/python/onnx/workspace.py b/caffe2/python/onnx/workspace.py index a311ec37dfdc..f03e3609fe8b 100644 --- a/caffe2/python/onnx/workspace.py +++ b/caffe2/python/onnx/workspace.py @@ -1,10 +1,10 @@ ## @package onnx # Module caffe2.python.onnx.workspace -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import uuid diff --git a/caffe2/python/operator_fp_exceptions_test.py b/caffe2/python/operator_fp_exceptions_test.py index 6e08f920a422..3a1ebcd4ec67 100644 --- a/caffe2/python/operator_fp_exceptions_test.py +++ b/caffe2/python/operator_fp_exceptions_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import core, workspace from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/operator_test/activation_ops_test.py b/caffe2/python/operator_test/activation_ops_test.py index 6a7a5ca18ef3..132bee879f6d 100644 --- a/caffe2/python/operator_test/activation_ops_test.py +++ b/caffe2/python/operator_test/activation_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/python/operator_test/adadelta_test.py b/caffe2/python/operator_test/adadelta_test.py index 4cb9a54ec664..265d783e6336 100644 --- a/caffe2/python/operator_test/adadelta_test.py +++ b/caffe2/python/operator_test/adadelta_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import functools diff --git a/caffe2/python/operator_test/adagrad_test.py b/caffe2/python/operator_test/adagrad_test.py index 5ed2d0287e63..55e2f570cf07 100644 --- a/caffe2/python/operator_test/adagrad_test.py +++ b/caffe2/python/operator_test/adagrad_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import functools diff --git 
a/caffe2/python/operator_test/adagrad_test_helper.py b/caffe2/python/operator_test/adagrad_test_helper.py index 891361e3a879..0fe4aa21f5f9 100644 --- a/caffe2/python/operator_test/adagrad_test_helper.py +++ b/caffe2/python/operator_test/adagrad_test_helper.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + from functools import partial diff --git a/caffe2/python/operator_test/adam_test.py b/caffe2/python/operator_test/adam_test.py index 0d188abc52be..2fb13c149922 100644 --- a/caffe2/python/operator_test/adam_test.py +++ b/caffe2/python/operator_test/adam_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import functools diff --git a/caffe2/python/operator_test/affine_channel_op_test.py b/caffe2/python/operator_test/affine_channel_op_test.py index 7e37216b82c1..76b09fdd5cd6 100644 --- a/caffe2/python/operator_test/affine_channel_op_test.py +++ b/caffe2/python/operator_test/affine_channel_op_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/apmeter_test.py b/caffe2/python/operator_test/apmeter_test.py index b7a50ab98e87..1ca26bf64f31 100644 --- a/caffe2/python/operator_test/apmeter_test.py +++ b/caffe2/python/operator_test/apmeter_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given diff --git a/caffe2/python/operator_test/arg_ops_test.py b/caffe2/python/operator_test/arg_ops_test.py index ce800636e6e6..330d17ed6999 100644 --- a/caffe2/python/operator_test/arg_ops_test.py +++ b/caffe2/python/operator_test/arg_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import hypothesis.strategies as st import numpy as np diff --git a/caffe2/python/operator_test/assert_test.py b/caffe2/python/operator_test/assert_test.py index e3474c0da7a4..2bbca5ab7376 100644 --- a/caffe2/python/operator_test/assert_test.py +++ b/caffe2/python/operator_test/assert_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + import numpy as np from hypothesis import given, settings diff --git a/caffe2/python/operator_test/atomic_ops_test.py b/caffe2/python/operator_test/atomic_ops_test.py index 753e76f15319..88e38df52da5 100644 --- a/caffe2/python/operator_test/atomic_ops_test.py +++ b/caffe2/python/operator_test/atomic_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python.test_util import TestCase diff --git a/caffe2/python/operator_test/basic_rnn_test.py b/caffe2/python/operator_test/basic_rnn_test.py index 516c066c6ed8..e863289d488c 100644 --- a/caffe2/python/operator_test/basic_rnn_test.py +++ b/caffe2/python/operator_test/basic_rnn_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import 
division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import workspace, core, rnn_cell from caffe2.python.model_helper import ModelHelper diff --git a/caffe2/python/operator_test/batch_box_cox_test.py b/caffe2/python/operator_test/batch_box_cox_test.py index 19186220159c..c9306ce1ab07 100644 --- a/caffe2/python/operator_test/batch_box_cox_test.py +++ b/caffe2/python/operator_test/batch_box_cox_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given, settings diff --git a/caffe2/python/operator_test/batch_bucketize_op_test.py b/caffe2/python/operator_test/batch_bucketize_op_test.py index fb13b0c08933..82def0572686 100644 --- a/caffe2/python/operator_test/batch_bucketize_op_test.py +++ b/caffe2/python/operator_test/batch_bucketize_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/python/operator_test/batch_moments_op_test.py b/caffe2/python/operator_test/batch_moments_op_test.py index c3ee8750225b..12dd72a4160a 100644 --- a/caffe2/python/operator_test/batch_moments_op_test.py +++ b/caffe2/python/operator_test/batch_moments_op_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/batch_sparse_to_dense_op_test.py b/caffe2/python/operator_test/batch_sparse_to_dense_op_test.py index ef59ed23888f..adfc735c66fd 100644 --- a/caffe2/python/operator_test/batch_sparse_to_dense_op_test.py +++ b/caffe2/python/operator_test/batch_sparse_to_dense_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/bbox_transform_test.py b/caffe2/python/operator_test/bbox_transform_test.py index f1ee07c0d1e3..d2584f18af40 100644 --- a/caffe2/python/operator_test/bbox_transform_test.py +++ b/caffe2/python/operator_test/bbox_transform_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given, settings diff --git a/caffe2/python/operator_test/bisect_percentile_op_test.py b/caffe2/python/operator_test/bisect_percentile_op_test.py index 77faeaeeb608..147a41282505 100644 --- a/caffe2/python/operator_test/bisect_percentile_op_test.py +++ b/caffe2/python/operator_test/bisect_percentile_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/blobs_queue_db_test.py b/caffe2/python/operator_test/blobs_queue_db_test.py index 6e4c25c77c78..6cf8170b34f8 100644 --- a/caffe2/python/operator_test/blobs_queue_db_test.py +++ b/caffe2/python/operator_test/blobs_queue_db_test.py @@ 
-1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import numpy as np diff --git a/caffe2/python/operator_test/boolean_mask_test.py b/caffe2/python/operator_test/boolean_mask_test.py index 9ccaeaf9e7a7..05b8212242e4 100644 --- a/caffe2/python/operator_test/boolean_mask_test.py +++ b/caffe2/python/operator_test/boolean_mask_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.proto import caffe2_pb2 from caffe2.python import core diff --git a/caffe2/python/operator_test/boolean_unmask_test.py b/caffe2/python/operator_test/boolean_unmask_test.py index e3bc9f248d3a..8cba2aecf1a4 100644 --- a/caffe2/python/operator_test/boolean_unmask_test.py +++ b/caffe2/python/operator_test/boolean_unmask_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/box_with_nms_limit_op_test.py b/caffe2/python/operator_test/box_with_nms_limit_op_test.py index bfbe9b7396fa..3131316feefd 100644 --- a/caffe2/python/operator_test/box_with_nms_limit_op_test.py +++ b/caffe2/python/operator_test/box_with_nms_limit_op_test.py @@ -1,7 +1,7 @@ -from __future__ import division -from __future__ import absolute_import -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/bucketize_op_test.py b/caffe2/python/operator_test/bucketize_op_test.py index d1cd6ada7f55..bf9af112a5b0 100644 --- a/caffe2/python/operator_test/bucketize_op_test.py +++ b/caffe2/python/operator_test/bucketize_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, dyndep from hypothesis import given import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/cast_op_test.py b/caffe2/python/operator_test/cast_op_test.py index f7ffb5b45b47..bf2a210086e6 100644 --- a/caffe2/python/operator_test/cast_op_test.py +++ b/caffe2/python/operator_test/cast_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace diff --git a/caffe2/python/operator_test/ceil_op_test.py b/caffe2/python/operator_test/ceil_op_test.py index 4e30c915ce2a..e8ee47702445 100644 --- a/caffe2/python/operator_test/ceil_op_test.py +++ b/caffe2/python/operator_test/ceil_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/channel_backprop_stats_op_test.py b/caffe2/python/operator_test/channel_backprop_stats_op_test.py index 7d614047f48d..7adc5ce24fb7 100644 --- a/caffe2/python/operator_test/channel_backprop_stats_op_test.py +++ 
b/caffe2/python/operator_test/channel_backprop_stats_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/channel_shuffle_test.py b/caffe2/python/operator_test/channel_shuffle_test.py index d420484bac6b..b821e7b6a43c 100644 --- a/caffe2/python/operator_test/channel_shuffle_test.py +++ b/caffe2/python/operator_test/channel_shuffle_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial diff --git a/caffe2/python/operator_test/channel_stats_op_test.py b/caffe2/python/operator_test/channel_stats_op_test.py index cbef433ae0d3..72eedc479dd6 100644 --- a/caffe2/python/operator_test/channel_stats_op_test.py +++ b/caffe2/python/operator_test/channel_stats_op_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/checkpoint_test.py b/caffe2/python/operator_test/checkpoint_test.py index 7449ab61f32d..3042e5989764 100644 --- a/caffe2/python/operator_test/checkpoint_test.py +++ b/caffe2/python/operator_test/checkpoint_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace, test_util import os diff --git a/caffe2/python/operator_test/clip_op_test.py b/caffe2/python/operator_test/clip_op_test.py index c2d9809c8d80..3304121aab08 100644 --- a/caffe2/python/operator_test/clip_op_test.py +++ b/caffe2/python/operator_test/clip_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/python/operator_test/clip_tensor_op_test.py b/caffe2/python/operator_test/clip_tensor_op_test.py index ee5bd8f73eb3..efc86815bc49 100644 --- a/caffe2/python/operator_test/clip_tensor_op_test.py +++ b/caffe2/python/operator_test/clip_tensor_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/collect_and_distribute_fpn_rpn_proposals_op_test.py b/caffe2/python/operator_test/collect_and_distribute_fpn_rpn_proposals_op_test.py index b5d726d449fc..28e6cd3b3df6 100644 --- a/caffe2/python/operator_test/collect_and_distribute_fpn_rpn_proposals_op_test.py +++ b/caffe2/python/operator_test/collect_and_distribute_fpn_rpn_proposals_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import unittest diff --git a/caffe2/python/operator_test/concat_split_op_test.py b/caffe2/python/operator_test/concat_split_op_test.py index bbe0e8eda1c1..1927b4eac78f 100644 --- 
a/caffe2/python/operator_test/concat_split_op_test.py +++ b/caffe2/python/operator_test/concat_split_op_test.py @@ -1,7 +1,7 @@ -from __future__ import unicode_literals -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + + from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace diff --git a/caffe2/python/operator_test/conditional_test.py b/caffe2/python/operator_test/conditional_test.py index 88d8fd8b7a27..2e214f089a45 100644 --- a/caffe2/python/operator_test/conditional_test.py +++ b/caffe2/python/operator_test/conditional_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/conftest.py b/caffe2/python/operator_test/conftest.py index ccd78eea4aa3..a240e98fc51e 100644 --- a/caffe2/python/operator_test/conftest.py +++ b/caffe2/python/operator_test/conftest.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import caffe2.python.serialized_test.serialized_test_util as serial diff --git a/caffe2/python/operator_test/conv_test.py b/caffe2/python/operator_test/conv_test.py index 3e24e05191ac..ae54cd37a91d 100644 --- a/caffe2/python/operator_test/conv_test.py +++ b/caffe2/python/operator_test/conv_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function + import collections import functools diff --git a/caffe2/python/operator_test/conv_transpose_test.py b/caffe2/python/operator_test/conv_transpose_test.py index 6bed93226f5b..4fcb6361d0a6 100644 --- a/caffe2/python/operator_test/conv_transpose_test.py +++ b/caffe2/python/operator_test/conv_transpose_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + import numpy as np from hypothesis import assume, given, settings diff --git a/caffe2/python/operator_test/copy_ops_test.py b/caffe2/python/operator_test/copy_ops_test.py index 4efec570e812..2b8b756cdf61 100644 --- a/caffe2/python/operator_test/copy_ops_test.py +++ b/caffe2/python/operator_test/copy_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/python/operator_test/copy_rows_to_tensor_op_test.py b/caffe2/python/operator_test/copy_rows_to_tensor_op_test.py index 9024ee3edfd1..8e914259bb78 100644 --- a/caffe2/python/operator_test/copy_rows_to_tensor_op_test.py +++ b/caffe2/python/operator_test/copy_rows_to_tensor_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import logging diff --git a/caffe2/python/operator_test/cosine_embedding_criterion_op_test.py b/caffe2/python/operator_test/cosine_embedding_criterion_op_test.py index 1124df94e67a..04bfbbe6f4f6 100644 --- a/caffe2/python/operator_test/cosine_embedding_criterion_op_test.py +++ b/caffe2/python/operator_test/cosine_embedding_criterion_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from hypothesis import given import 
hypothesis.strategies as st diff --git a/caffe2/python/operator_test/counter_ops_test.py b/caffe2/python/operator_test/counter_ops_test.py index 3ebe26415622..d57ff31508c6 100644 --- a/caffe2/python/operator_test/counter_ops_test.py +++ b/caffe2/python/operator_test/counter_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python.test_util import TestCase diff --git a/caffe2/python/operator_test/crf_test.py b/caffe2/python/operator_test/crf_test.py index d9eb89fc3352..b75e7b7b1a10 100644 --- a/caffe2/python/operator_test/crf_test.py +++ b/caffe2/python/operator_test/crf_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import workspace, crf, brew from caffe2.python.model_helper import ModelHelper import numpy as np diff --git a/caffe2/python/operator_test/cross_entropy_ops_test.py b/caffe2/python/operator_test/cross_entropy_ops_test.py index 25dc6791fa12..d1852e7dd9e8 100644 --- a/caffe2/python/operator_test/cross_entropy_ops_test.py +++ b/caffe2/python/operator_test/cross_entropy_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/ctc_beam_search_decoder_op_test.py b/caffe2/python/operator_test/ctc_beam_search_decoder_op_test.py index 21ca68fe007a..1dda7166e65a 100644 --- a/caffe2/python/operator_test/ctc_beam_search_decoder_op_test.py +++ b/caffe2/python/operator_test/ctc_beam_search_decoder_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from caffe2.python.test_util import caffe2_flaky diff --git a/caffe2/python/operator_test/ctc_greedy_decoder_op_test.py b/caffe2/python/operator_test/ctc_greedy_decoder_op_test.py index 0fd38a82b403..8bc7eb47d488 100644 --- a/caffe2/python/operator_test/ctc_greedy_decoder_op_test.py +++ b/caffe2/python/operator_test/ctc_greedy_decoder_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given, settings diff --git a/caffe2/python/operator_test/cudnn_recurrent_test.py b/caffe2/python/operator_test/cudnn_recurrent_test.py index 5de901026eb6..db1b826cfe41 100644 --- a/caffe2/python/operator_test/cudnn_recurrent_test.py +++ b/caffe2/python/operator_test/cudnn_recurrent_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import model_helper, workspace, core, rnn_cell from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/operator_test/data_couple_op_test.py b/caffe2/python/operator_test/data_couple_op_test.py index 32cf21e81bbf..d840207159b2 100644 --- a/caffe2/python/operator_test/data_couple_op_test.py +++ 
b/caffe2/python/operator_test/data_couple_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python.test_util import TestCase diff --git a/caffe2/python/operator_test/dataset_ops_test.py b/caffe2/python/operator_test/dataset_ops_test.py index 138ac90e68c8..96d93dc5effb 100644 --- a/caffe2/python/operator_test/dataset_ops_test.py +++ b/caffe2/python/operator_test/dataset_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np from caffe2.python import core, workspace, dataset from caffe2.python.dataset import Const diff --git a/caffe2/python/operator_test/deform_conv_test.py b/caffe2/python/operator_test/deform_conv_test.py index 31e407499063..f6ad0e38e73c 100644 --- a/caffe2/python/operator_test/deform_conv_test.py +++ b/caffe2/python/operator_test/deform_conv_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function + import os import unittest diff --git a/caffe2/python/operator_test/dense_vector_to_id_list_op_test.py b/caffe2/python/operator_test/dense_vector_to_id_list_op_test.py index aea30d890416..8b6f42417fd4 100644 --- a/caffe2/python/operator_test/dense_vector_to_id_list_op_test.py +++ b/caffe2/python/operator_test/dense_vector_to_id_list_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given diff --git a/caffe2/python/operator_test/depthwise_3x3_conv_test.py b/caffe2/python/operator_test/depthwise_3x3_conv_test.py index af431f1f07d4..2d6d6429f833 100644 --- a/caffe2/python/operator_test/depthwise_3x3_conv_test.py +++ b/caffe2/python/operator_test/depthwise_3x3_conv_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/detectron_keypoints.py b/caffe2/python/operator_test/detectron_keypoints.py index 2f34349beae4..1abff0675993 100644 --- a/caffe2/python/operator_test/detectron_keypoints.py +++ b/caffe2/python/operator_test/detectron_keypoints.py @@ -1,7 +1,7 @@ -from __future__ import division -from __future__ import absolute_import -from __future__ import print_function -from __future__ import unicode_literals + + + + try: import cv2 diff --git a/caffe2/python/operator_test/distance_op_test.py b/caffe2/python/operator_test/distance_op_test.py index 753b94d20f1f..e948fdae9673 100644 --- a/caffe2/python/operator_test/distance_op_test.py +++ b/caffe2/python/operator_test/distance_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/dropout_op_test.py b/caffe2/python/operator_test/dropout_op_test.py index c8c46127e4d9..84c2f7e35f56 100644 --- a/caffe2/python/operator_test/dropout_op_test.py +++ b/caffe2/python/operator_test/dropout_op_test.py 
@@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from hypothesis import assume, given, settings import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/duplicate_operands_test.py b/caffe2/python/operator_test/duplicate_operands_test.py index 385e69fded4c..179b42dbabc8 100644 --- a/caffe2/python/operator_test/duplicate_operands_test.py +++ b/caffe2/python/operator_test/duplicate_operands_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/python/operator_test/elementwise_linear_op_test.py b/caffe2/python/operator_test/elementwise_linear_op_test.py index 8c7df5f33625..ac0dc3dd0975 100644 --- a/caffe2/python/operator_test/elementwise_linear_op_test.py +++ b/caffe2/python/operator_test/elementwise_linear_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given diff --git a/caffe2/python/operator_test/elementwise_logical_ops_test.py b/caffe2/python/operator_test/elementwise_logical_ops_test.py index e35b4a483c6d..3195d969dee5 100644 --- a/caffe2/python/operator_test/elementwise_logical_ops_test.py +++ b/caffe2/python/operator_test/elementwise_logical_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/elementwise_op_broadcast_test.py b/caffe2/python/operator_test/elementwise_op_broadcast_test.py index ef9c1b9c8cf3..605c1d741271 100644 --- a/caffe2/python/operator_test/elementwise_op_broadcast_test.py +++ b/caffe2/python/operator_test/elementwise_op_broadcast_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest diff --git a/caffe2/python/operator_test/elementwise_ops_test.py b/caffe2/python/operator_test/elementwise_ops_test.py index ca2b847f088c..ed7a09eb0857 100644 --- a/caffe2/python/operator_test/elementwise_ops_test.py +++ b/caffe2/python/operator_test/elementwise_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from hypothesis import given, assume, settings diff --git a/caffe2/python/operator_test/emptysample_ops_test.py b/caffe2/python/operator_test/emptysample_ops_test.py index a04e9d0e161d..0f728b723163 100644 --- a/caffe2/python/operator_test/emptysample_ops_test.py +++ b/caffe2/python/operator_test/emptysample_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python.test_util import TestCase diff --git a/caffe2/python/operator_test/enforce_finite_op_test.py b/caffe2/python/operator_test/enforce_finite_op_test.py index 
c8c12e240946..b843bfdc95b9 100644 --- a/caffe2/python/operator_test/enforce_finite_op_test.py +++ b/caffe2/python/operator_test/enforce_finite_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from hypothesis import given, settings import numpy as np diff --git a/caffe2/python/operator_test/ensure_clipped_test.py b/caffe2/python/operator_test/ensure_clipped_test.py index 8d3c638e1ba1..a89718745b1c 100644 --- a/caffe2/python/operator_test/ensure_clipped_test.py +++ b/caffe2/python/operator_test/ensure_clipped_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import caffe2.python.hypothesis_test_util as hu import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/ensure_cpu_output_op_test.py b/caffe2/python/operator_test/ensure_cpu_output_op_test.py index 509c28a5a8bb..4812ee3042e0 100644 --- a/caffe2/python/operator_test/ensure_cpu_output_op_test.py +++ b/caffe2/python/operator_test/ensure_cpu_output_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from hypothesis import given import numpy as np diff --git a/caffe2/python/operator_test/erf_op_test.py b/caffe2/python/operator_test/erf_op_test.py index 5761c8409bd3..64714db4315c 100644 --- a/caffe2/python/operator_test/erf_op_test.py +++ b/caffe2/python/operator_test/erf_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import math diff --git a/caffe2/python/operator_test/expand_op_test.py b/caffe2/python/operator_test/expand_op_test.py index 4be96208fbba..0d198b1aff14 100644 --- a/caffe2/python/operator_test/expand_op_test.py +++ b/caffe2/python/operator_test/expand_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from hypothesis import given, settings diff --git a/caffe2/python/operator_test/fc_operator_test.py b/caffe2/python/operator_test/fc_operator_test.py index c08596f8717d..1e8b5522053d 100644 --- a/caffe2/python/operator_test/fc_operator_test.py +++ b/caffe2/python/operator_test/fc_operator_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python import core diff --git a/caffe2/python/operator_test/feature_maps_ops_test.py b/caffe2/python/operator_test/feature_maps_ops_test.py index 1d64b19b993f..19fa329c9389 100644 --- a/caffe2/python/operator_test/feature_maps_ops_test.py +++ b/caffe2/python/operator_test/feature_maps_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace, dyndep from caffe2.python.test_util import TestCase import numpy as np diff --git a/caffe2/python/operator_test/filler_ops_test.py b/caffe2/python/operator_test/filler_ops_test.py index 4a2d9419d7bc..e080dde3eb5f 100644 --- 
a/caffe2/python/operator_test/filler_ops_test.py +++ b/caffe2/python/operator_test/filler_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace diff --git a/caffe2/python/operator_test/find_op_test.py b/caffe2/python/operator_test/find_op_test.py index c6d2856c3514..fc25913d8744 100644 --- a/caffe2/python/operator_test/find_op_test.py +++ b/caffe2/python/operator_test/find_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/flatten_op_test.py b/caffe2/python/operator_test/flatten_op_test.py index 19d204e0bded..2e0340c68779 100644 --- a/caffe2/python/operator_test/flatten_op_test.py +++ b/caffe2/python/operator_test/flatten_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from hypothesis import given import numpy as np diff --git a/caffe2/python/operator_test/flexible_top_k_test.py b/caffe2/python/operator_test/flexible_top_k_test.py index 9542ecd30691..3e0e5722b0ce 100644 --- a/caffe2/python/operator_test/flexible_top_k_test.py +++ b/caffe2/python/operator_test/flexible_top_k_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/floor_op_test.py b/caffe2/python/operator_test/floor_op_test.py index 5877cb6da4e8..8c0974bb8579 100644 --- a/caffe2/python/operator_test/floor_op_test.py +++ b/caffe2/python/operator_test/floor_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/fused_nbit_rowwise_conversion_ops_test.py b/caffe2/python/operator_test/fused_nbit_rowwise_conversion_ops_test.py index ecabe7d29ef0..12d0b0265afb 100644 --- a/caffe2/python/operator_test/fused_nbit_rowwise_conversion_ops_test.py +++ b/caffe2/python/operator_test/fused_nbit_rowwise_conversion_ops_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import math import struct diff --git a/caffe2/python/operator_test/fused_nbit_rowwise_test_helper.py b/caffe2/python/operator_test/fused_nbit_rowwise_test_helper.py index 09225385191a..e9af40a128a6 100644 --- a/caffe2/python/operator_test/fused_nbit_rowwise_test_helper.py +++ b/caffe2/python/operator_test/fused_nbit_rowwise_test_helper.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import numpy as np diff --git a/caffe2/python/operator_test/gather_ops_test.py b/caffe2/python/operator_test/gather_ops_test.py index 967131de38d8..fc23be13fdae 100644 --- a/caffe2/python/operator_test/gather_ops_test.py +++ b/caffe2/python/operator_test/gather_ops_test.py @@ -1,7 +1,7 @@ -from 
__future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np from caffe2.python import core, workspace diff --git a/caffe2/python/operator_test/gather_ranges_op_test.py b/caffe2/python/operator_test/gather_ranges_op_test.py index 19d538c60556..c0d73af33601 100644 --- a/caffe2/python/operator_test/gather_ranges_op_test.py +++ b/caffe2/python/operator_test/gather_ranges_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial diff --git a/caffe2/python/operator_test/given_tensor_byte_string_to_uint8_fill_op_test.py b/caffe2/python/operator_test/given_tensor_byte_string_to_uint8_fill_op_test.py index 3b1b4bf86515..7dea8f308783 100644 --- a/caffe2/python/operator_test/given_tensor_byte_string_to_uint8_fill_op_test.py +++ b/caffe2/python/operator_test/given_tensor_byte_string_to_uint8_fill_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given diff --git a/caffe2/python/operator_test/given_tensor_fill_op_test.py b/caffe2/python/operator_test/given_tensor_fill_op_test.py index bcd277cf258b..3d929ce5c0ee 100644 --- a/caffe2/python/operator_test/given_tensor_fill_op_test.py +++ b/caffe2/python/operator_test/given_tensor_fill_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given diff --git a/caffe2/python/operator_test/glu_op_test.py b/caffe2/python/operator_test/glu_op_test.py index f70c0739ded8..f38df09ec9fb 100644 --- a/caffe2/python/operator_test/glu_op_test.py +++ b/caffe2/python/operator_test/glu_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/group_conv_test.py b/caffe2/python/operator_test/group_conv_test.py index 1d46888e791a..62aba236d5ba 100644 --- a/caffe2/python/operator_test/group_conv_test.py +++ b/caffe2/python/operator_test/group_conv_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + import numpy as np from hypothesis import assume, given, settings diff --git a/caffe2/python/operator_test/group_norm_op_test.py b/caffe2/python/operator_test/group_norm_op_test.py index d17998c32986..14300beed3f9 100644 --- a/caffe2/python/operator_test/group_norm_op_test.py +++ b/caffe2/python/operator_test/group_norm_op_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/gru_test.py b/caffe2/python/operator_test/gru_test.py index 99da7a3f5626..99444f39ac26 100644 --- a/caffe2/python/operator_test/gru_test.py +++ b/caffe2/python/operator_test/gru_test.py @@ -1,7 +1,7 @@ -from __future__ 
import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import workspace, core, scope, gru_cell from caffe2.python.model_helper import ModelHelper diff --git a/caffe2/python/operator_test/heatmap_max_keypoint_op_test.py b/caffe2/python/operator_test/heatmap_max_keypoint_op_test.py index ae8c1dc22799..e683a04d7998 100644 --- a/caffe2/python/operator_test/heatmap_max_keypoint_op_test.py +++ b/caffe2/python/operator_test/heatmap_max_keypoint_op_test.py @@ -1,7 +1,7 @@ -from __future__ import division -from __future__ import absolute_import -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import torch diff --git a/caffe2/python/operator_test/hsm_test.py b/caffe2/python/operator_test/hsm_test.py index f2321adc8e01..245bca210ad9 100644 --- a/caffe2/python/operator_test/hsm_test.py +++ b/caffe2/python/operator_test/hsm_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from hypothesis import given, settings import numpy as np import unittest diff --git a/caffe2/python/operator_test/hyperbolic_ops_test.py b/caffe2/python/operator_test/hyperbolic_ops_test.py index 9fdf0cabb0bd..90a8197e7ccf 100644 --- a/caffe2/python/operator_test/hyperbolic_ops_test.py +++ b/caffe2/python/operator_test/hyperbolic_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given diff --git a/caffe2/python/operator_test/im2col_col2im_test.py b/caffe2/python/operator_test/im2col_col2im_test.py index 98e9d61b5bd0..760228382bc6 100644 --- a/caffe2/python/operator_test/im2col_col2im_test.py +++ b/caffe2/python/operator_test/im2col_col2im_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import assume, given, settings diff --git a/caffe2/python/operator_test/image_input_op_test.py b/caffe2/python/operator_test/image_input_op_test.py index 79acc60739f1..0de1f0ad048b 100644 --- a/caffe2/python/operator_test/image_input_op_test.py +++ b/caffe2/python/operator_test/image_input_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest try: diff --git a/caffe2/python/operator_test/index_hash_ops_test.py b/caffe2/python/operator_test/index_hash_ops_test.py index f7c6d0cdc14a..1eb7ffa20691 100644 --- a/caffe2/python/operator_test/index_hash_ops_test.py +++ b/caffe2/python/operator_test/index_hash_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/index_ops_test.py b/caffe2/python/operator_test/index_ops_test.py index 642f340fad80..cf021f59362b 100644 --- a/caffe2/python/operator_test/index_ops_test.py +++ b/caffe2/python/operator_test/index_ops_test.py @@ -1,7 +1,7 @@ 
-from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python.test_util import TestCase import numpy as np diff --git a/caffe2/python/operator_test/instance_norm_test.py b/caffe2/python/operator_test/instance_norm_test.py index e57b8a8e11d8..fb4f3c935ba8 100644 --- a/caffe2/python/operator_test/instance_norm_test.py +++ b/caffe2/python/operator_test/instance_norm_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + import numpy as np from hypothesis import given, assume, settings diff --git a/caffe2/python/operator_test/integral_image_ops_test.py b/caffe2/python/operator_test/integral_image_ops_test.py index 212f807addcf..79d79ae6de21 100644 --- a/caffe2/python/operator_test/integral_image_ops_test.py +++ b/caffe2/python/operator_test/integral_image_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/jsd_ops_test.py b/caffe2/python/operator_test/jsd_ops_test.py index 51faa14b9029..6ed2db2e88c2 100644 --- a/caffe2/python/operator_test/jsd_ops_test.py +++ b/caffe2/python/operator_test/jsd_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given diff --git a/caffe2/python/operator_test/key_split_ops_test.py b/caffe2/python/operator_test/key_split_ops_test.py index be38ee38926f..18fddff58d17 100644 --- a/caffe2/python/operator_test/key_split_ops_test.py +++ b/caffe2/python/operator_test/key_split_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/lars_test.py b/caffe2/python/operator_test/lars_test.py index e2f02b29d26f..6f976520e06b 100644 --- a/caffe2/python/operator_test/lars_test.py +++ b/caffe2/python/operator_test/lars_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given diff --git a/caffe2/python/operator_test/layer_norm_op_test.py b/caffe2/python/operator_test/layer_norm_op_test.py index 89ba4b2017bd..62e94afe9e7d 100644 --- a/caffe2/python/operator_test/layer_norm_op_test.py +++ b/caffe2/python/operator_test/layer_norm_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import brew, core, workspace from caffe2.python.model_helper import ModelHelper @@ -373,6 +373,34 @@ def test_layer_norm_brew_wrapper(self, X, gc, dc): self.ws.create_net(model.param_init_net).run() self.ws.create_net(model.net).run() + @given(N=st.integers(1, 10), elementwise_affine=st.booleans(), **hu.gcs) + @settings(deadline=None) + def test_layer_norm_with_empty_batch(self, N, 
elementwise_affine, gc, dc): + X = np.random.randn(0, N).astype(np.float32) + gamma = np.random.rand(N).astype(np.float32) + beta = np.random.rand(N).astype(np.float32) + + op = core.CreateOperator( + "LayerNorm", + ["X", "gamma", "beta"] if elementwise_affine else ["X"], + ["Y", "mean", "sigma"], + elementwise_affine=elementwise_affine, + ) + + def ref(X, gamma=None, beta=None): + Y = np.zeros_like(X) + axis = 1 + mean = np.zeros(X.shape[:axis] + (1,), dtype=X.dtype) + sigma = np.zeros(X.shape[:axis] + (1,), dtype=X.dtype) + return Y, mean, sigma + + + inputs = [X, gamma, beta] if elementwise_affine else [X] + self.assertReferenceChecks(gc, op, inputs, ref) + self.assertDeviceChecks(dc, op, inputs, [0, 1]) + for i in range(len(inputs)): + self.assertGradientChecks(gc, op, inputs, i, [0]) + if __name__ == "__main__": unittest.main() diff --git a/caffe2/python/operator_test/leaky_relu_test.py b/caffe2/python/operator_test/leaky_relu_test.py index 2eaa782eeefd..9a888cac7901 100644 --- a/caffe2/python/operator_test/leaky_relu_test.py +++ b/caffe2/python/operator_test/leaky_relu_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + import numpy as np from hypothesis import given, assume diff --git a/caffe2/python/operator_test/learning_rate_adaption_op_test.py b/caffe2/python/operator_test/learning_rate_adaption_op_test.py index 3a5d44663771..1891171b80d8 100644 --- a/caffe2/python/operator_test/learning_rate_adaption_op_test.py +++ b/caffe2/python/operator_test/learning_rate_adaption_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/learning_rate_op_test.py b/caffe2/python/operator_test/learning_rate_op_test.py index 1a1f9eb8c842..8d17c0c7ef08 100644 --- a/caffe2/python/operator_test/learning_rate_op_test.py +++ b/caffe2/python/operator_test/learning_rate_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu @@ -50,7 +50,7 @@ def ref(iter): def test_hill_learning_rate_op(self, gc, dc): iter = np.random.randint(low=1, high=1e5, size=1) - num_iter = int(np.random.randint(low=1e2, high=1e3, size=1)) + num_iter = int(np.random.randint(low=1e2, high=1e8, size=1)) start_multiplier = 1e-4 gamma = 1.0 power = 0.5 diff --git a/caffe2/python/operator_test/length_split_op_test.py b/caffe2/python/operator_test/length_split_op_test.py index fa3ac0826230..28d7134ac5e8 100644 --- a/caffe2/python/operator_test/length_split_op_test.py +++ b/caffe2/python/operator_test/length_split_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given, settings diff --git a/caffe2/python/operator_test/lengths_pad_op_test.py b/caffe2/python/operator_test/lengths_pad_op_test.py index d9cd2b244604..626ec0542b7d 100644 --- a/caffe2/python/operator_test/lengths_pad_op_test.py +++ b/caffe2/python/operator_test/lengths_pad_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from 
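The `test_layer_norm_with_empty_batch` case added above in `layer_norm_op_test.py` asserts that the Caffe2 LayerNorm op tolerates a zero-row input for forward, device, and gradient checks. As a hedged aside (not part of the diff), the same expectation can be sanity-checked from the PyTorch side; the shapes and the use of `torch.nn.LayerNorm` below are illustrative assumptions only.

```python
# Illustrative sketch: mirrors what the new Caffe2 test asserts, namely that a
# zero-row batch flows through layer norm and its gradient without errors.
import torch

x = torch.randn(0, 5, requires_grad=True)           # empty batch, 5 features
layer_norm = torch.nn.LayerNorm(5, elementwise_affine=True)

y = layer_norm(x)
assert y.shape == (0, 5)                             # output is empty but well-formed

y.sum().backward()                                   # backward is a no-op, not an error
assert x.grad.shape == (0, 5)
```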
__future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given diff --git a/caffe2/python/operator_test/lengths_reducer_fused_nbit_rowwise_ops_test.py b/caffe2/python/operator_test/lengths_reducer_fused_nbit_rowwise_ops_test.py index 88c99c3da337..fc4e89e2545b 100644 --- a/caffe2/python/operator_test/lengths_reducer_fused_nbit_rowwise_ops_test.py +++ b/caffe2/python/operator_test/lengths_reducer_fused_nbit_rowwise_ops_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import caffe2.python.hypothesis_test_util as hu import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/lengths_tile_op_test.py b/caffe2/python/operator_test/lengths_tile_op_test.py index 4a9a6b0ff1a9..e0a5f9609588 100644 --- a/caffe2/python/operator_test/lengths_tile_op_test.py +++ b/caffe2/python/operator_test/lengths_tile_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given diff --git a/caffe2/python/operator_test/lengths_top_k_ops_test.py b/caffe2/python/operator_test/lengths_top_k_ops_test.py index 8bc27c31144f..b8b082a02125 100644 --- a/caffe2/python/operator_test/lengths_top_k_ops_test.py +++ b/caffe2/python/operator_test/lengths_top_k_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given diff --git a/caffe2/python/operator_test/listwise_l2r_operator_test.py b/caffe2/python/operator_test/listwise_l2r_operator_test.py index 8f4f680de109..c08f1180a920 100644 --- a/caffe2/python/operator_test/listwise_l2r_operator_test.py +++ b/caffe2/python/operator_test/listwise_l2r_operator_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import caffe2.python.hypothesis_test_util as hu import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/load_save_test.py b/caffe2/python/operator_test/load_save_test.py index a5e28479cf10..845bafee4702 100644 --- a/caffe2/python/operator_test/load_save_test.py +++ b/caffe2/python/operator_test/load_save_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import errno import hypothesis.strategies as st from hypothesis import given, assume, settings diff --git a/caffe2/python/operator_test/locally_connected_op_test.py b/caffe2/python/operator_test/locally_connected_op_test.py index cfd49b8a7eb8..6eb3181ea9ad 100644 --- a/caffe2/python/operator_test/locally_connected_op_test.py +++ b/caffe2/python/operator_test/locally_connected_op_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + import numpy as np from hypothesis import given, settings, assume diff --git a/caffe2/python/operator_test/loss_ops_test.py b/caffe2/python/operator_test/loss_ops_test.py index e57bdb7a1d41..24cb65ac96f8 100644 --- a/caffe2/python/operator_test/loss_ops_test.py +++ b/caffe2/python/operator_test/loss_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import 
absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given diff --git a/caffe2/python/operator_test/lpnorm_op_test.py b/caffe2/python/operator_test/lpnorm_op_test.py index 1fcacc4f26f8..3a58cbe6d960 100644 --- a/caffe2/python/operator_test/lpnorm_op_test.py +++ b/caffe2/python/operator_test/lpnorm_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np from caffe2.python import core, workspace diff --git a/caffe2/python/operator_test/map_ops_test.py b/caffe2/python/operator_test/map_ops_test.py index add86a3a467e..dcc8b295f7c3 100644 --- a/caffe2/python/operator_test/map_ops_test.py +++ b/caffe2/python/operator_test/map_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import itertools import numpy as np diff --git a/caffe2/python/operator_test/margin_ranking_criterion_op_test.py b/caffe2/python/operator_test/margin_ranking_criterion_op_test.py index 354aed27aaf4..e28dd1ce28f8 100644 --- a/caffe2/python/operator_test/margin_ranking_criterion_op_test.py +++ b/caffe2/python/operator_test/margin_ranking_criterion_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/math_ops_test.py b/caffe2/python/operator_test/math_ops_test.py index e18025ffb92d..4849b83648f8 100644 --- a/caffe2/python/operator_test/math_ops_test.py +++ b/caffe2/python/operator_test/math_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given, settings diff --git a/caffe2/python/operator_test/matmul_op_test.py b/caffe2/python/operator_test/matmul_op_test.py index fababb13c54a..b8cef19b24df 100644 --- a/caffe2/python/operator_test/matmul_op_test.py +++ b/caffe2/python/operator_test/matmul_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import inspect diff --git a/caffe2/python/operator_test/mean_op_test.py b/caffe2/python/operator_test/mean_op_test.py index 77c6b82625b1..5830089f8e9b 100644 --- a/caffe2/python/operator_test/mean_op_test.py +++ b/caffe2/python/operator_test/mean_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/merge_id_lists_op_test.py b/caffe2/python/operator_test/merge_id_lists_op_test.py index 9f3302c6e75a..36b765557505 100644 --- a/caffe2/python/operator_test/merge_id_lists_op_test.py +++ b/caffe2/python/operator_test/merge_id_lists_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import 
print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/mkl_conv_op_test.py b/caffe2/python/operator_test/mkl_conv_op_test.py index b72848b9a422..595debf977fe 100644 --- a/caffe2/python/operator_test/mkl_conv_op_test.py +++ b/caffe2/python/operator_test/mkl_conv_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/mkl_packed_fc_op_test.py b/caffe2/python/operator_test/mkl_packed_fc_op_test.py index 59546d3891e9..2f889d693444 100644 --- a/caffe2/python/operator_test/mkl_packed_fc_op_test.py +++ b/caffe2/python/operator_test/mkl_packed_fc_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/mod_op_test.py b/caffe2/python/operator_test/mod_op_test.py index 92a318f3f10f..914bffd2067c 100644 --- a/caffe2/python/operator_test/mod_op_test.py +++ b/caffe2/python/operator_test/mod_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy diff --git a/caffe2/python/operator_test/moments_op_test.py b/caffe2/python/operator_test/moments_op_test.py index ae9d9158f506..3b270df254ce 100644 --- a/caffe2/python/operator_test/moments_op_test.py +++ b/caffe2/python/operator_test/moments_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given diff --git a/caffe2/python/operator_test/momentum_sgd_test.py b/caffe2/python/operator_test/momentum_sgd_test.py index a37e27141bd0..58f16e87a21c 100644 --- a/caffe2/python/operator_test/momentum_sgd_test.py +++ b/caffe2/python/operator_test/momentum_sgd_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace diff --git a/caffe2/python/operator_test/mpi_test.py b/caffe2/python/operator_test/mpi_test.py index 0885289c7c1a..bb111a125fc0 100644 --- a/caffe2/python/operator_test/mpi_test.py +++ b/caffe2/python/operator_test/mpi_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from hypothesis import given import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/mul_gradient_benchmark.py b/caffe2/python/operator_test/mul_gradient_benchmark.py index 721676239409..2e11aefcb497 100644 --- a/caffe2/python/operator_test/mul_gradient_benchmark.py +++ b/caffe2/python/operator_test/mul_gradient_benchmark.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import argparse import numpy as np diff --git 
a/caffe2/python/operator_test/negate_gradient_op_test.py b/caffe2/python/operator_test/negate_gradient_op_test.py index 14ca954d363f..137be1eece34 100644 --- a/caffe2/python/operator_test/negate_gradient_op_test.py +++ b/caffe2/python/operator_test/negate_gradient_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import workspace, core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/ngram_ops_test.py b/caffe2/python/operator_test/ngram_ops_test.py index 70aad5cab814..3f4e57fa230b 100644 --- a/caffe2/python/operator_test/ngram_ops_test.py +++ b/caffe2/python/operator_test/ngram_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/normalize_op_test.py b/caffe2/python/operator_test/normalize_op_test.py index 46f88a1de079..7a35e0bafa31 100644 --- a/caffe2/python/operator_test/normalize_op_test.py +++ b/caffe2/python/operator_test/normalize_op_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + import functools diff --git a/caffe2/python/operator_test/numpy_tile_op_test.py b/caffe2/python/operator_test/numpy_tile_op_test.py index 398b0d4b93ab..a202581f808c 100644 --- a/caffe2/python/operator_test/numpy_tile_op_test.py +++ b/caffe2/python/operator_test/numpy_tile_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/python/operator_test/one_hot_ops_test.py b/caffe2/python/operator_test/one_hot_ops_test.py index d0b7a08ee706..593d5b5aa58c 100644 --- a/caffe2/python/operator_test/one_hot_ops_test.py +++ b/caffe2/python/operator_test/one_hot_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/operator_test/onnx_while_test.py b/caffe2/python/operator_test/onnx_while_test.py index 811e38e34af7..4cff53b87d6e 100644 --- a/caffe2/python/operator_test/onnx_while_test.py +++ b/caffe2/python/operator_test/onnx_while_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace diff --git a/caffe2/python/operator_test/order_switch_test.py b/caffe2/python/operator_test/order_switch_test.py index 3777fdd7695d..7b3f40a27c97 100644 --- a/caffe2/python/operator_test/order_switch_test.py +++ b/caffe2/python/operator_test/order_switch_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import caffe2.python.hypothesis_test_util as hu import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/pack_ops_test.py b/caffe2/python/operator_test/pack_ops_test.py index 84f3f46a6dc1..698fbb76df88 100644 --- a/caffe2/python/operator_test/pack_ops_test.py +++ 
b/caffe2/python/operator_test/pack_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/pack_rnn_sequence_op_test.py b/caffe2/python/operator_test/pack_rnn_sequence_op_test.py index 6bf2315ca0c5..9a76e6b847a5 100644 --- a/caffe2/python/operator_test/pack_rnn_sequence_op_test.py +++ b/caffe2/python/operator_test/pack_rnn_sequence_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given diff --git a/caffe2/python/operator_test/pad_test.py b/caffe2/python/operator_test/pad_test.py index 43cd10c23188..6d4e6bbdcd08 100644 --- a/caffe2/python/operator_test/pad_test.py +++ b/caffe2/python/operator_test/pad_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/partition_ops_test.py b/caffe2/python/operator_test/partition_ops_test.py index a5a7db12b1ef..b600c302d83b 100644 --- a/caffe2/python/operator_test/partition_ops_test.py +++ b/caffe2/python/operator_test/partition_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np from caffe2.python import core, workspace from caffe2.python.test_util import TestCase, rand_array diff --git a/caffe2/python/operator_test/percentile_op_test.py b/caffe2/python/operator_test/percentile_op_test.py index 54c42bf63917..d81b0a963185 100644 --- a/caffe2/python/operator_test/percentile_op_test.py +++ b/caffe2/python/operator_test/percentile_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace, dyndep import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/piecewise_linear_transform_test.py b/caffe2/python/operator_test/piecewise_linear_transform_test.py index 463380306ce4..d7c4e0df4416 100644 --- a/caffe2/python/operator_test/piecewise_linear_transform_test.py +++ b/caffe2/python/operator_test/piecewise_linear_transform_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/pooling_test.py b/caffe2/python/operator_test/pooling_test.py index 743cee5cef3c..7ef98249bd79 100644 --- a/caffe2/python/operator_test/pooling_test.py +++ b/caffe2/python/operator_test/pooling_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + import numpy as np from hypothesis import assume, given, settings diff --git a/caffe2/python/operator_test/prepend_dim_test.py b/caffe2/python/operator_test/prepend_dim_test.py index 6cf8e7a81b5e..d794ba2162b9 
100644 --- a/caffe2/python/operator_test/prepend_dim_test.py +++ b/caffe2/python/operator_test/prepend_dim_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np from caffe2.python import core, workspace diff --git a/caffe2/python/operator_test/python_op_test.py b/caffe2/python/operator_test/python_op_test.py index 7467c8c3900c..b071070151d1 100644 --- a/caffe2/python/operator_test/python_op_test.py +++ b/caffe2/python/operator_test/python_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python.core import CreatePythonOperator import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/quantile_test.py b/caffe2/python/operator_test/quantile_test.py index 6a4250d06183..39f3728d8e81 100644 --- a/caffe2/python/operator_test/quantile_test.py +++ b/caffe2/python/operator_test/quantile_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function + import unittest diff --git a/caffe2/python/operator_test/rand_quantization_op_speed_test.py b/caffe2/python/operator_test/rand_quantization_op_speed_test.py index ce0e84028541..1c56faff645f 100644 --- a/caffe2/python/operator_test/rand_quantization_op_speed_test.py +++ b/caffe2/python/operator_test/rand_quantization_op_speed_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import time diff --git a/caffe2/python/operator_test/rand_quantization_op_test.py b/caffe2/python/operator_test/rand_quantization_op_test.py index 811a20505a3c..e244f77149e1 100644 --- a/caffe2/python/operator_test/rand_quantization_op_test.py +++ b/caffe2/python/operator_test/rand_quantization_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import struct diff --git a/caffe2/python/operator_test/rank_loss_operator_test.py b/caffe2/python/operator_test/rank_loss_operator_test.py index 94220d76762d..2d52da293127 100644 --- a/caffe2/python/operator_test/rank_loss_operator_test.py +++ b/caffe2/python/operator_test/rank_loss_operator_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from hypothesis import given diff --git a/caffe2/python/operator_test/rebatching_queue_test.py b/caffe2/python/operator_test/rebatching_queue_test.py index 930fad30d663..53d3fd4f4ecc 100644 --- a/caffe2/python/operator_test/rebatching_queue_test.py +++ b/caffe2/python/operator_test/rebatching_queue_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python.test_util import TestCase diff --git a/caffe2/python/operator_test/record_queue_test.py b/caffe2/python/operator_test/record_queue_test.py index d32b3e794ab4..00e47ed1cb68 100644 --- a/caffe2/python/operator_test/record_queue_test.py +++ b/caffe2/python/operator_test/record_queue_test.py @@ 
-1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python.dataset import Dataset from caffe2.python.schema import ( diff --git a/caffe2/python/operator_test/recurrent_net_executor_test.py b/caffe2/python/operator_test/recurrent_net_executor_test.py index 24bd0122f4fb..5d9b83604423 100644 --- a/caffe2/python/operator_test/recurrent_net_executor_test.py +++ b/caffe2/python/operator_test/recurrent_net_executor_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python import model_helper, workspace, core, rnn_cell, test_util diff --git a/caffe2/python/operator_test/recurrent_network_test.py b/caffe2/python/operator_test/recurrent_network_test.py index 7cf79edfafed..13650e6cad4e 100644 --- a/caffe2/python/operator_test/recurrent_network_test.py +++ b/caffe2/python/operator_test/recurrent_network_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import recurrent, workspace from caffe2.python.model_helper import ModelHelper diff --git a/caffe2/python/operator_test/reduce_ops_test.py b/caffe2/python/operator_test/reduce_ops_test.py index ffb5e8a02667..727631befe89 100644 --- a/caffe2/python/operator_test/reduce_ops_test.py +++ b/caffe2/python/operator_test/reduce_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from hypothesis import given, settings diff --git a/caffe2/python/operator_test/reduction_ops_test.py b/caffe2/python/operator_test/reduction_ops_test.py index 018024900281..7d4287df6609 100644 --- a/caffe2/python/operator_test/reduction_ops_test.py +++ b/caffe2/python/operator_test/reduction_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace diff --git a/caffe2/python/operator_test/reshape_ops_test.py b/caffe2/python/operator_test/reshape_ops_test.py index 9c57ed4f3090..a42f00bbf82f 100644 --- a/caffe2/python/operator_test/reshape_ops_test.py +++ b/caffe2/python/operator_test/reshape_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import six from numpy.testing import assert_array_equal diff --git a/caffe2/python/operator_test/resize_op_test.py b/caffe2/python/operator_test/resize_op_test.py index 893e09cf6443..cd90656f607d 100644 --- a/caffe2/python/operator_test/resize_op_test.py +++ b/caffe2/python/operator_test/resize_op_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + import numpy as np import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/rmac_regions_op_test.py b/caffe2/python/operator_test/rmac_regions_op_test.py index 
856832c34b99..084d7402df5f 100644 --- a/caffe2/python/operator_test/rmac_regions_op_test.py +++ b/caffe2/python/operator_test/rmac_regions_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given, settings diff --git a/caffe2/python/operator_test/rms_norm_op_test.py b/caffe2/python/operator_test/rms_norm_op_test.py index f5a35701877c..797b3c9a01c3 100644 --- a/caffe2/python/operator_test/rms_norm_op_test.py +++ b/caffe2/python/operator_test/rms_norm_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + from caffe2.python import core from hypothesis import given, settings diff --git a/caffe2/python/operator_test/rnn_cell_test.py b/caffe2/python/operator_test/rnn_cell_test.py index 64cd7bf48913..8fe037ccb70c 100644 --- a/caffe2/python/operator_test/rnn_cell_test.py +++ b/caffe2/python/operator_test/rnn_cell_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import ( core, gradient_checker, rnn_cell, workspace, scope, utils diff --git a/caffe2/python/operator_test/roi_align_rotated_op_test.py b/caffe2/python/operator_test/roi_align_rotated_op_test.py index 0487d962e6fb..c74157a039b0 100644 --- a/caffe2/python/operator_test/roi_align_rotated_op_test.py +++ b/caffe2/python/operator_test/roi_align_rotated_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace diff --git a/caffe2/python/operator_test/rowwise_counter_test.py b/caffe2/python/operator_test/rowwise_counter_test.py index a00dd24b3f2c..a9dacc5a6d86 100644 --- a/caffe2/python/operator_test/rowwise_counter_test.py +++ b/caffe2/python/operator_test/rowwise_counter_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function + import unittest diff --git a/caffe2/python/operator_test/scale_op_test.py b/caffe2/python/operator_test/scale_op_test.py index 14e17dc2c5d5..b5507e2013fa 100644 --- a/caffe2/python/operator_test/scale_op_test.py +++ b/caffe2/python/operator_test/scale_op_test.py @@ -1,7 +1,7 @@ -from __future__ import division -from __future__ import absolute_import -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace diff --git a/caffe2/python/operator_test/segment_ops_test.py b/caffe2/python/operator_test/segment_ops_test.py index 01c415eac953..f991a7dde211 100644 --- a/caffe2/python/operator_test/segment_ops_test.py +++ b/caffe2/python/operator_test/segment_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from functools import partial from hypothesis import given, settings diff --git a/caffe2/python/operator_test/selu_op_test.py b/caffe2/python/operator_test/selu_op_test.py index fc903f159a4e..4dd2fa1848bf 100644 --- a/caffe2/python/operator_test/selu_op_test.py +++ b/caffe2/python/operator_test/selu_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ 
import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given, settings diff --git a/caffe2/python/operator_test/sequence_ops_test.py b/caffe2/python/operator_test/sequence_ops_test.py index 720bf9f02030..4609473f91f0 100644 --- a/caffe2/python/operator_test/sequence_ops_test.py +++ b/caffe2/python/operator_test/sequence_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from functools import partial diff --git a/caffe2/python/operator_test/shape_inference_test.py b/caffe2/python/operator_test/shape_inference_test.py index aca6ff38a517..702effc226d6 100644 --- a/caffe2/python/operator_test/shape_inference_test.py +++ b/caffe2/python/operator_test/shape_inference_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import unittest diff --git a/caffe2/python/operator_test/sinusoid_position_encoding_op_test.py b/caffe2/python/operator_test/sinusoid_position_encoding_op_test.py index a925783c206e..6e8cae62dbff 100644 --- a/caffe2/python/operator_test/sinusoid_position_encoding_op_test.py +++ b/caffe2/python/operator_test/sinusoid_position_encoding_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given, settings diff --git a/caffe2/python/operator_test/softmax_ops_test.py b/caffe2/python/operator_test/softmax_ops_test.py index f0f6c22cd10b..3ae26de6b513 100644 --- a/caffe2/python/operator_test/softmax_ops_test.py +++ b/caffe2/python/operator_test/softmax_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from hypothesis import given, settings diff --git a/caffe2/python/operator_test/softplus_op_test.py b/caffe2/python/operator_test/softplus_op_test.py index ac28a1a9a51e..dd183b774f92 100644 --- a/caffe2/python/operator_test/softplus_op_test.py +++ b/caffe2/python/operator_test/softplus_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given, settings diff --git a/caffe2/python/operator_test/sparse_dropout_with_replacement_op_test.py b/caffe2/python/operator_test/sparse_dropout_with_replacement_op_test.py index 14d637f50f41..2ba21bb6d44f 100644 --- a/caffe2/python/operator_test/sparse_dropout_with_replacement_op_test.py +++ b/caffe2/python/operator_test/sparse_dropout_with_replacement_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given diff --git a/caffe2/python/operator_test/sparse_gradient_checker_test.py b/caffe2/python/operator_test/sparse_gradient_checker_test.py index 9bdae01d1318..f1f85b1f9bec 100644 --- 
a/caffe2/python/operator_test/sparse_gradient_checker_test.py +++ b/caffe2/python/operator_test/sparse_gradient_checker_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np from scipy.sparse import coo_matrix diff --git a/caffe2/python/operator_test/sparse_lengths_sum_benchmark.py b/caffe2/python/operator_test/sparse_lengths_sum_benchmark.py index 74690c8a2c56..fb958492cfa9 100644 --- a/caffe2/python/operator_test/sparse_lengths_sum_benchmark.py +++ b/caffe2/python/operator_test/sparse_lengths_sum_benchmark.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import argparse import datetime diff --git a/caffe2/python/operator_test/sparse_lp_regularizer_test.py b/caffe2/python/operator_test/sparse_lp_regularizer_test.py index b0d0b4b5c9b3..7ea32bd69a29 100644 --- a/caffe2/python/operator_test/sparse_lp_regularizer_test.py +++ b/caffe2/python/operator_test/sparse_lp_regularizer_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import hypothesis from hypothesis import given, settings, HealthCheck diff --git a/caffe2/python/operator_test/sparse_normalize_test.py b/caffe2/python/operator_test/sparse_normalize_test.py index bd8dbd5f7b53..ecc4ae0c8d22 100644 --- a/caffe2/python/operator_test/sparse_normalize_test.py +++ b/caffe2/python/operator_test/sparse_normalize_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import hypothesis from hypothesis import given, settings, HealthCheck diff --git a/caffe2/python/operator_test/sparse_ops_test.py b/caffe2/python/operator_test/sparse_ops_test.py index 1cf243ed05c4..089174007b18 100644 --- a/caffe2/python/operator_test/sparse_ops_test.py +++ b/caffe2/python/operator_test/sparse_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from caffe2.python.test_util import rand_array diff --git a/caffe2/python/operator_test/sparse_to_dense_mask_op_test.py b/caffe2/python/operator_test/sparse_to_dense_mask_op_test.py index 03deb62d8513..41ec8808bb6a 100644 --- a/caffe2/python/operator_test/sparse_to_dense_mask_op_test.py +++ b/caffe2/python/operator_test/sparse_to_dense_mask_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given, settings diff --git a/caffe2/python/operator_test/spatial_bn_op_test.py b/caffe2/python/operator_test/spatial_bn_op_test.py index 1186161e5f46..35f7bd2a5e29 100644 --- a/caffe2/python/operator_test/spatial_bn_op_test.py +++ b/caffe2/python/operator_test/spatial_bn_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python import brew, core, utils, workspace diff --git a/caffe2/python/operator_test/specialized_segment_ops_test.py 
b/caffe2/python/operator_test/specialized_segment_ops_test.py index fe768e193c88..4f1842ac4664 100644 --- a/caffe2/python/operator_test/specialized_segment_ops_test.py +++ b/caffe2/python/operator_test/specialized_segment_ops_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import unittest diff --git a/caffe2/python/operator_test/square_root_divide_op_test.py b/caffe2/python/operator_test/square_root_divide_op_test.py index 172c6cbafa16..5bd6cb1d08f8 100644 --- a/caffe2/python/operator_test/square_root_divide_op_test.py +++ b/caffe2/python/operator_test/square_root_divide_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from functools import partial diff --git a/caffe2/python/operator_test/stats_ops_test.py b/caffe2/python/operator_test/stats_ops_test.py index edc36facb236..6114dfed3b10 100644 --- a/caffe2/python/operator_test/stats_ops_test.py +++ b/caffe2/python/operator_test/stats_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python.test_util import TestCase diff --git a/caffe2/python/operator_test/stats_put_ops_test.py b/caffe2/python/operator_test/stats_put_ops_test.py index 0a42d5d23728..12a9e6826fd1 100644 --- a/caffe2/python/operator_test/stats_put_ops_test.py +++ b/caffe2/python/operator_test/stats_put_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python.test_util import TestCase diff --git a/caffe2/python/operator_test/storm_test.py b/caffe2/python/operator_test/storm_test.py index 2ae402a8a290..c97f631d2160 100644 --- a/caffe2/python/operator_test/storm_test.py +++ b/caffe2/python/operator_test/storm_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import functools diff --git a/caffe2/python/operator_test/string_ops_test.py b/caffe2/python/operator_test/string_ops_test.py index 969e8c7e11e5..eedb57be1d6c 100644 --- a/caffe2/python/operator_test/string_ops_test.py +++ b/caffe2/python/operator_test/string_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given, settings diff --git a/caffe2/python/operator_test/text_file_reader_test.py b/caffe2/python/operator_test/text_file_reader_test.py index 41ba814af6ab..8889ddb9f53c 100644 --- a/caffe2/python/operator_test/text_file_reader_test.py +++ b/caffe2/python/operator_test/text_file_reader_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python.text_file_reader import TextFileReader from caffe2.python.test_util import TestCase diff --git a/caffe2/python/operator_test/thresholded_relu_op_test.py 
b/caffe2/python/operator_test/thresholded_relu_op_test.py index 9c103c85c03c..0cd5c0f77895 100644 --- a/caffe2/python/operator_test/thresholded_relu_op_test.py +++ b/caffe2/python/operator_test/thresholded_relu_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given, settings diff --git a/caffe2/python/operator_test/tile_op_test.py b/caffe2/python/operator_test/tile_op_test.py index 51471f797b34..d39dfeee0ad7 100644 --- a/caffe2/python/operator_test/tile_op_test.py +++ b/caffe2/python/operator_test/tile_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/python/operator_test/top_k_test.py b/caffe2/python/operator_test/top_k_test.py index 85cf902812ee..fa628456c3a4 100644 --- a/caffe2/python/operator_test/top_k_test.py +++ b/caffe2/python/operator_test/top_k_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import hypothesis.strategies as st import numpy as np diff --git a/caffe2/python/operator_test/torch_integration_test.py b/caffe2/python/operator_test/torch_integration_test.py index a1ddbaa9509e..82ebd2d65f49 100644 --- a/caffe2/python/operator_test/torch_integration_test.py +++ b/caffe2/python/operator_test/torch_integration_test.py @@ -1,12 +1,12 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + + +import struct +import unittest import caffe2.python.hypothesis_test_util as hu import hypothesis.strategies as st import numpy as np -import struct import torch -import unittest - from caffe2.python import core, workspace from hypothesis import given, settings from scipy.stats import norm @@ -77,7 +77,7 @@ def create_bbox_transform_inputs(roi_counts, num_classes, rotated): def bytes_to_floats(byte_matrix): floats = np.empty([np.shape(byte_matrix)[0], 1], dtype=np.float32) for i, byte_values in enumerate(byte_matrix): - floats[i], = struct.unpack('f', bytearray(byte_values)) + (floats[i],) = struct.unpack("f", bytearray(byte_values)) return floats @@ -85,7 +85,7 @@ def floats_to_bytes(floats): byte_matrix = np.empty([np.shape(floats)[0], 4], dtype=np.uint8) for i, value in enumerate(floats): assert isinstance(value, np.float32), (value, floats) - as_bytes = struct.pack('f', value) + as_bytes = struct.pack("f", value) # In Python3 bytes will be a list of int, in Python2 a list of string if isinstance(as_bytes[0], int): byte_matrix[i] = list(as_bytes) @@ -268,6 +268,69 @@ def box_with_nms_limit_ref(): for o, o_ref in zip(outputs, output_refs): torch.testing.assert_allclose(o, o_ref) + @given( + dim_1=st.integers(min_value=10, max_value=10), + dim_2=st.integers(min_value=3, max_value=3), + dim_3=st.integers(min_value=2, max_value=2), + ) + def test_sparse_to_dense_mask(self, dim_1, dim_2, dim_3): + indices = np.array([i + 1 for i in range(dim_1)]).astype(np.int32) + values = np.random.rand(dim_1, dim_2, dim_3).astype(np.float32) + default_value = np.zeros((dim_2, dim_3)).astype(np.float32) + mask = [2, 4, 9] + + def sparse_to_dense_mask_ref(return_presence_mask=False): + ref_op = core.CreateOperator( + "SparseToDenseMask", + ["indices", "values", "default_value"], 
+ ["output", "presence_mask"], + mask=mask, + return_presence_mask=return_presence_mask, + ) + workspace.FeedBlob("indices", indices) + workspace.FeedBlob("values", values) + workspace.FeedBlob("default_value", default_value) + workspace.RunOperatorOnce(ref_op) + + if return_presence_mask: + return ( + workspace.FetchBlob("output"), + workspace.FetchBlob("presence_mask"), + ) + + return workspace.FetchBlob("output") + + # Testing return_presence_mask = False + output = sparse_to_dense_mask_ref() + output = torch.tensor(output) + + a, _ = torch.ops._caffe2.SparseToDenseMask( + torch.tensor(indices), + torch.tensor(values), + torch.tensor(default_value), + None, + mask=mask, + ) + + torch.testing.assert_allclose(output, a) + + # Testing return_presence_mask = True + output, presence_mask = sparse_to_dense_mask_ref(return_presence_mask=True) + output = torch.tensor(output) + presence_mask = torch.tensor(presence_mask) + + a, b = torch.ops._caffe2.SparseToDenseMask( + torch.tensor(indices), + torch.tensor(values), + torch.tensor(default_value), + None, + mask=mask, + return_presence_mask=True, + ) + + torch.testing.assert_allclose(output, a) + torch.testing.assert_allclose(presence_mask, b) + @given( A=st.integers(min_value=4, max_value=4), H=st.integers(min_value=10, max_value=10), @@ -380,7 +443,7 @@ def inference_lstm_ref(): return ( workspace.FetchBlob("output"), workspace.FetchBlob("hidden"), - workspace.FetchBlob("cell") + workspace.FetchBlob("cell"), ) output, hidden, cell = inference_lstm_ref() @@ -526,7 +589,7 @@ def rand_rotated_roi(): np.random.rand() * H, np.random.rand() * W, np.random.rand() * H, - np.random.rand() * 360 - 180 + np.random.rand() * 360 - 180, ] ).astype(np.float32) @@ -613,18 +676,19 @@ def test_collect_and_distribute_fpn_rpn_proposals_op(self, roi_counts): for x, y in zip(fpn_outputs, all_outputs[1:]): torch.testing.assert_allclose(x, y) - @given(X=hu.tensor(), - fast_gelu=st.booleans()) + @given(X=hu.tensor(), fast_gelu=st.booleans()) def _test_gelu_op(self, X, fast_gelu, device): def _gelu_ref(_X): - return (_X * norm.cdf(_X).astype(np.float32), ) - expected_output, = _gelu_ref(X) + return (_X * norm.cdf(_X).astype(np.float32),) + + (expected_output,) = _gelu_ref(X) actual_output = torch.ops._caffe2.Gelu(torch.tensor(X), fast_gelu) rtol = 1e-3 if fast_gelu else 1e-4 atol = 1e-5 torch.testing.assert_allclose( - expected_output, actual_output.cpu(), rtol=rtol, atol=atol) + expected_output, actual_output.cpu(), rtol=rtol, atol=atol + ) def test_gelu_op(self): self._test_gelu_op(device="cpu") @@ -633,13 +697,11 @@ def test_gelu_op(self): def test_gelu_op_cuda(self): self._test_gelu_op(device="cuda") - - @given(inputs=hu.lengths_tensor( - dtype=np.float32, - min_value=1, - max_value=5, - allow_empty=True, - )) + @given( + inputs=hu.lengths_tensor( + dtype=np.float32, min_value=1, max_value=5, allow_empty=True + ) + ) def _test_lengths_op(self, inputs, ref_op_name, torch_op, device): data, lengths = inputs @@ -652,7 +714,8 @@ def _lengths_ref(X, Y): expected_output = _lengths_ref(data, lengths) actual_output = torch_op( - torch.tensor(data), torch.tensor(lengths, dtype=torch.int32)) + torch.tensor(data), torch.tensor(lengths, dtype=torch.int32) + ) torch.testing.assert_allclose(expected_output, actual_output.cpu()) @@ -691,8 +754,12 @@ def _test_resize_nearest_op(self, device): def _resize_nearest_ref(X): ref_op = core.CreateOperator( - "ResizeNearest", ["X"], ["Y"], - width_scale=2.0, height_scale=1.5, order="NCHW", + "ResizeNearest", + ["X"], + ["Y"], + 
width_scale=2.0, + height_scale=1.5, + order="NCHW", ) workspace.FeedBlob("X", X) workspace.RunOperatorOnce(ref_op) @@ -701,7 +768,9 @@ def _resize_nearest_ref(X): expected_output = _resize_nearest_ref(data) actual_output = torch.ops._caffe2.ResizeNearest( torch.tensor(data).to(device), - order="NCHW", width_scale=2.0, height_scale=1.5, + order="NCHW", + width_scale=2.0, + height_scale=1.5, ) torch.testing.assert_allclose(expected_output, actual_output.cpu()) @@ -716,9 +785,7 @@ def test_resize_nearest_op_cuda(self): @given(input_data=hu.tensor(min_dim=2, max_dim=2)) def test_Fused8BitRowwiseQuantizedToFloat(self, input_data): QuantizeOp = core.CreateOperator( - "FloatToFused8BitRowwiseQuantized", - ["input_data"], - ["quantized_data"], + "FloatToFused8BitRowwiseQuantized", ["input_data"], ["quantized_data"] ) workspace.FeedBlob("input_data", input_data) @@ -741,16 +808,15 @@ def test_piecewise_linear_op(self, binary_input): num_dims = 3 data = np.random.rand(1024, num_dims).astype(np.float32) slopes = np.zeros(4 * num_dims).astype(np.float32) - bounds = np.sort(np.random.rand(5, num_dims).astype(np.float32), axis=0).flatten('F') + bounds = np.sort( + np.random.rand(5, num_dims).astype(np.float32), axis=0 + ).flatten("F") intercepts = np.random.rand(4 * num_dims).astype(np.float32) def _piecewise_linear_ref(X): ref_op = core.CreateOperator( "PiecewiseLinearTransform", - ["data", - "bounds", - "slopes", - "intercepts"], + ["data", "bounds", "slopes", "intercepts"], ["calibrated"], binary=binary_input, ) @@ -763,7 +829,12 @@ def _piecewise_linear_ref(X): expected_output = _piecewise_linear_ref(data) actual_output = torch.ops._caffe2.PiecewiseLinearTransform( - torch.tensor(data), bounds.tolist(), slopes.tolist(), intercepts.tolist(), binary_input) + torch.tensor(data), + bounds.tolist(), + slopes.tolist(), + intercepts.tolist(), + binary_input, + ) torch.testing.assert_allclose(torch.tensor(expected_output), actual_output) @@ -790,9 +861,7 @@ def test_index_hash_op(self): data = np.random.randint(low=0, high=1000, size=(4, 4, 4)) def _index_hash_ref(X): - ref_op = core.CreateOperator( - "IndexHash", ["X"], ["Y"], seed=0, modulo=100 - ) + ref_op = core.CreateOperator("IndexHash", ["X"], ["Y"], seed=0, modulo=100) workspace.FeedBlob("X", X) workspace.RunOperatorOnce(ref_op) return workspace.FetchBlob("Y") @@ -817,33 +886,32 @@ def _bucketize_ref(X): return workspace.FetchBlob("Y") expected_output = _bucketize_ref(data) - actual_output = torch.ops._caffe2.Bucketize( - torch.tensor(data), boundaries - ) + actual_output = torch.ops._caffe2.Bucketize(torch.tensor(data), boundaries) torch.testing.assert_allclose(expected_output, actual_output.cpu()) - @given(X=hu.tensor(), - eps=st.floats(min_value=1e-4, max_value=1e-2), - ) + @given(X=hu.tensor(), eps=st.floats(min_value=1e-4, max_value=1e-2)) def test_logit(self, X, eps): def ref(X, eps): - ref_op = core.CreateOperator('Logit', ["X"], ["Y"], eps=eps) + ref_op = core.CreateOperator("Logit", ["X"], ["Y"], eps=eps) workspace.FeedBlob("X", X) workspace.RunOperatorOnce(ref_op) return workspace.FetchBlob("Y") + expected_output = ref(X, eps) - actual_output = torch.ops._caffe2.Logit( - torch.tensor(X), eps - ) + actual_output = torch.ops._caffe2.Logit(torch.tensor(X), eps) torch.testing.assert_allclose(expected_output, actual_output.cpu()) def test_percentile(self): - original_values = np.array([[3., 5., 3], [5., 1., 6.]]).astype(np.float32) - value_to_pct = np.array([[3, 0.2], [5, 0.5], [1, 0.3], [3, 0.6]]).astype(np.float32) + original_values = 
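The Logit binding covered just above is simple enough to restate. As a hedged reminder (not part of the change), the op is the eps-clamped inverse sigmoid, which the following NumPy sketch reproduces; the helper name is illustrative.

```python
import numpy as np

def logit_sketch(x, eps=1e-4):
    # clamp into (eps, 1 - eps) so the log never sees 0 or 1
    x = np.clip(x, eps, 1.0 - eps)
    return np.log(x / (1.0 - x))

print(logit_sketch(np.array([0.0, 0.5, 1.0], dtype=np.float32)))
# approximately [-9.21, 0.0, 9.21]; the endpoint values are set by eps
```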
np.array([[3.0, 5.0, 3], [5.0, 1.0, 6.0]]).astype(np.float32) + value_to_pct = np.array([[3, 0.2], [5, 0.5], [1, 0.3], [3, 0.6]]).astype( + np.float32 + ) lengths = np.array([2, 1, 1]).astype(np.int32) def _percentile_ref(original_values, value_to_pct, lengths): - ref_op = core.CreateOperator('Percentile', ["original_values", "value_to_pct", "lengths"], ["Y"]) + ref_op = core.CreateOperator( + "Percentile", ["original_values", "value_to_pct", "lengths"], ["Y"] + ) workspace.FeedBlob("original_values", original_values) workspace.FeedBlob("value_to_pct", value_to_pct) workspace.FeedBlob("lengths", lengths) @@ -852,7 +920,9 @@ def _percentile_ref(original_values, value_to_pct, lengths): expected_output = _percentile_ref(original_values, value_to_pct, lengths) actual_output = torch.ops._caffe2.Percentile( - torch.tensor(original_values), torch.Tensor(value_to_pct), torch.Tensor(lengths).int() + torch.tensor(original_values), + torch.Tensor(value_to_pct), + torch.Tensor(lengths).int(), ) torch.testing.assert_allclose(expected_output, actual_output.cpu()) @@ -862,7 +932,9 @@ def test_batch_bucket_one_hot_op(self): boundaries = np.array([0.1, 2.5, 1, 3.1, 4.5]).astype(np.float32) def _batch_bucket_one_hot_ref(data, lengths, boundaries): - ref_op = core.CreateOperator('BatchBucketOneHot', ["data", "lengths", "boundaries"], ["Y"]) + ref_op = core.CreateOperator( + "BatchBucketOneHot", ["data", "lengths", "boundaries"], ["Y"] + ) workspace.FeedBlob("data", data) workspace.FeedBlob("lengths", lengths) workspace.FeedBlob("boundaries", boundaries) @@ -875,31 +947,89 @@ def _batch_bucket_one_hot_ref(data, lengths, boundaries): ) torch.testing.assert_allclose(expected_output, actual_output.cpu()) + def test_gather_ranges_to_dense_op(self): + data = np.array([1, 2, 3, 4, 5, 6, 7, 8]) + ranges = np.array([[[2, 4]], [[0, 0]]]) + key = np.array([0, 1, 3, 2, 1, 0, 1, 0]) + lengths = np.array([4]) + min_observation = 2 + max_mismatched_ratio = 0.5 + max_empty_ratio = 1.0 + + outputs_name = ["X_{}".format(i) for i in range(len(lengths))] + ref_op = core.CreateOperator( + "GatherRangesToDense", + ["data", "ranges", "key"], + outputs_name, + lengths=lengths, + min_observation=min_observation, + max_mismatched_ratio=max_mismatched_ratio, + max_empty_ratio=max_empty_ratio, + ) + workspace.FeedBlob("data", data) + workspace.FeedBlob("ranges", ranges) + workspace.FeedBlob("key", key) + workspace.RunOperatorOnce(ref_op) + ref_outputs = [] + for output_name in outputs_name: + ref_outputs.append(workspace.FetchBlob(output_name)) + + outputs = torch.ops._caffe2.GatherRangesToDense( + torch.from_numpy(data), + torch.from_numpy(ranges), + torch.from_numpy(key), + lengths=lengths, + min_observation=min_observation, + max_mismatched_ratio=max_mismatched_ratio, + max_empty_ratio=max_empty_ratio, + ) + + self.assertEqual(len(ref_outputs), len(outputs)) + for i in range(0, len(ref_outputs)): + np.testing.assert_array_almost_equal(ref_outputs[i], outputs[i].numpy()) + @given(lengths_0=st.integers(1, 10), lengths_1=st.integers(1, 10)) @settings(deadline=1000) def test_merge_id_lists(self, lengths_0, lengths_1): def _merge_id_lists(lengths, values): ref_op = core.CreateOperator( - 'MergeIdLists', + "MergeIdLists", ["lengths_0", "values_0", "lengths_1", "values_1"], - ["merged_lengths", "merged_values"] + ["merged_lengths", "merged_values"], ) workspace.FeedBlob("lengths_0", lengths[0]) workspace.FeedBlob("values_0", values[0]) workspace.FeedBlob("lengths_1", lengths[1]) workspace.FeedBlob("values_1", values[1]) 
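`test_gather_ranges_to_dense_op` above passes the `key` blob and the observation/mismatch/empty thresholds straight through to the operator. As a rough sketch of the core gather only (ignoring the key, which additionally reorders each gathered range, and the thresholds, which govern validation), the behaviour for the test's `data`/`ranges` looks roughly like this; the helper name and the keyless simplification are assumptions.

```python
import numpy as np

def gather_ranges_to_dense_sketch(data, ranges, lengths):
    """ranges has shape (batch, num_features, 2) holding (start, length);
    feature f yields rows of length lengths[f], zero-filled when empty."""
    outputs = []
    for f, expected_len in enumerate(lengths):
        rows = []
        for b in range(ranges.shape[0]):
            start, length = ranges[b, f]
            if length == 0:
                rows.append(np.zeros(expected_len, dtype=data.dtype))
            else:
                assert length == expected_len, "mismatched range"
                rows.append(data[start:start + length])
        outputs.append(np.stack(rows))
    return outputs

data = np.array([1, 2, 3, 4, 5, 6, 7, 8])
ranges = np.array([[[2, 4]], [[0, 0]]])
(x0,) = gather_ranges_to_dense_sketch(data, ranges, lengths=[4])
# x0 -> [[3, 4, 5, 6],
#        [0, 0, 0, 0]]
```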
workspace.RunOperatorOnce(ref_op) - return workspace.FetchBlob("merged_lengths"), workspace.FetchBlob("merged_values") + return ( + workspace.FetchBlob("merged_lengths"), + workspace.FetchBlob("merged_values"), + ) - lengths = [np.array([lengths_0]).astype(np.int32), np.array([lengths_1]).astype(np.int32)] + lengths = [ + np.array([lengths_0]).astype(np.int32), + np.array([lengths_1]).astype(np.int32), + ] values = [ - np.random.choice(np.arange(0, 10), size=lengths_0, replace=False).astype(np.int32), - np.random.choice(np.arange(10, 20), size=lengths_1, replace=False).astype(np.int32) + np.random.choice(np.arange(0, 10), size=lengths_0, replace=False).astype( + np.int32 + ), + np.random.choice(np.arange(10, 20), size=lengths_1, replace=False).astype( + np.int32 + ), ] - expected_merged_lengths, expected_merged_values = _merge_id_lists(lengths, values) + expected_merged_lengths, expected_merged_values = _merge_id_lists( + lengths, values + ) output_merged_lengths, output_merged_values = torch.ops._caffe2.MergeIdLists( - [torch.tensor(lengths[0]), torch.tensor(values[0]), torch.tensor(lengths[1]), torch.tensor(values[1])] + [ + torch.tensor(lengths[0]), + torch.tensor(values[0]), + torch.tensor(lengths[1]), + torch.tensor(values[1]), + ] ) torch.testing.assert_allclose(expected_merged_lengths, output_merged_lengths) torch.testing.assert_allclose(expected_merged_values, output_merged_values) @@ -962,18 +1092,11 @@ def test_learning_rate(self): def test_pack_segments(self): s = torch.rand(3, 3, 3) lengths = torch.tensor([2, 1]) - packed_tensor, _ = torch.ops._caffe2.PackSegments( - lengths, - s, - ) + packed_tensor, _ = torch.ops._caffe2.PackSegments(lengths, s) self.assertEqual(packed_tensor.numpy().shape, (2, 2, 3, 3)) - unpacked_tensor = torch.ops._caffe2.UnpackSegments( - lengths, - packed_tensor, - ) + unpacked_tensor = torch.ops._caffe2.UnpackSegments(lengths, packed_tensor) torch.testing.assert_allclose(s, unpacked_tensor) - -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/caffe2/python/operator_test/transpose_op_test.py b/caffe2/python/operator_test/transpose_op_test.py index e4b739a741ac..4ccec250e22b 100644 --- a/caffe2/python/operator_test/transpose_op_test.py +++ b/caffe2/python/operator_test/transpose_op_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import core, workspace from hypothesis import given, settings diff --git a/caffe2/python/operator_test/trigonometric_op_test.py b/caffe2/python/operator_test/trigonometric_op_test.py index 5d57940dc33e..04b98857c301 100644 --- a/caffe2/python/operator_test/trigonometric_op_test.py +++ b/caffe2/python/operator_test/trigonometric_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given, settings diff --git a/caffe2/python/operator_test/unique_ops_test.py b/caffe2/python/operator_test/unique_ops_test.py index 016554321983..b49f4765539e 100644 --- a/caffe2/python/operator_test/unique_ops_test.py +++ b/caffe2/python/operator_test/unique_ops_test.py @@ -13,10 +13,10 @@ # limitations under the License. 
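The PackSegments/UnpackSegments round trip exercised in torch_integration_test.py above can be summarized with a small NumPy sketch; this is only an illustration of the expected shapes (lengths [2, 1] packing a (3, 3, 3) input into a zero-padded (2, 2, 3, 3) output), not the operator's actual implementation:

    import numpy as np

    def pack_segments_ref(lengths, data):
        # Pack a (sum(lengths), ...) tensor into a zero-padded
        # (len(lengths), max(lengths), ...) tensor.
        max_len = int(max(lengths))
        out = np.zeros((len(lengths), max_len) + data.shape[1:], dtype=data.dtype)
        offset = 0
        for i, n in enumerate(lengths):
            out[i, :n] = data[offset:offset + n]
            offset += n
        return out

    packed = pack_segments_ref([2, 1], np.random.rand(3, 3, 3).astype(np.float32))
    assert packed.shape == (2, 2, 3, 3)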
############################################################################## -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from hypothesis import given, settings import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/unique_uniform_fill_op_test.py b/caffe2/python/operator_test/unique_uniform_fill_op_test.py index f858e8fa06bd..1026745db724 100644 --- a/caffe2/python/operator_test/unique_uniform_fill_op_test.py +++ b/caffe2/python/operator_test/unique_uniform_fill_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from hypothesis import given import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/upsample_op_test.py b/caffe2/python/operator_test/upsample_op_test.py index a56d1edebe68..61b01644bcf5 100644 --- a/caffe2/python/operator_test/upsample_op_test.py +++ b/caffe2/python/operator_test/upsample_op_test.py @@ -13,9 +13,9 @@ # limitations under the License. ############################################################################## -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/utility_ops_test.py b/caffe2/python/operator_test/utility_ops_test.py index 2814d7a02775..241d1e4c1b56 100644 --- a/caffe2/python/operator_test/utility_ops_test.py +++ b/caffe2/python/operator_test/utility_ops_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from hypothesis import assume, given, settings diff --git a/caffe2/python/operator_test/video_input_op_test.py b/caffe2/python/operator_test/video_input_op_test.py index c06183c0f1bb..f21f219bd90e 100644 --- a/caffe2/python/operator_test/video_input_op_test.py +++ b/caffe2/python/operator_test/video_input_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import os import shutil diff --git a/caffe2/python/operator_test/weight_scale_test.py b/caffe2/python/operator_test/weight_scale_test.py index 9988ebc309d2..5cdc11eb4d11 100644 --- a/caffe2/python/operator_test/weight_scale_test.py +++ b/caffe2/python/operator_test/weight_scale_test.py @@ -13,10 +13,10 @@ # limitations under the License. 
############################################################################## -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/weighted_multi_sample_test.py b/caffe2/python/operator_test/weighted_multi_sample_test.py index 8b0966590594..830a9f9849c7 100644 --- a/caffe2/python/operator_test/weighted_multi_sample_test.py +++ b/caffe2/python/operator_test/weighted_multi_sample_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/python/operator_test/weighted_sample_test.py b/caffe2/python/operator_test/weighted_sample_test.py index 24326d6337c4..032e9e9d755e 100644 --- a/caffe2/python/operator_test/weighted_sample_test.py +++ b/caffe2/python/operator_test/weighted_sample_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np diff --git a/caffe2/python/operator_test/weighted_sum_test.py b/caffe2/python/operator_test/weighted_sum_test.py index 4940bc69a052..2c7dffe92672 100644 --- a/caffe2/python/operator_test/weighted_sum_test.py +++ b/caffe2/python/operator_test/weighted_sum_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from hypothesis import given, settings diff --git a/caffe2/python/operator_test/wngrad_test.py b/caffe2/python/operator_test/wngrad_test.py index 2a48bed86358..48fe0f94731e 100644 --- a/caffe2/python/operator_test/wngrad_test.py +++ b/caffe2/python/operator_test/wngrad_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import functools diff --git a/caffe2/python/optimizer.py b/caffe2/python/optimizer.py index 21a61a93d00c..9a2f9f541420 100644 --- a/caffe2/python/optimizer.py +++ b/caffe2/python/optimizer.py @@ -1,6 +1,6 @@ # @package optimizer # Module caffe2.python.optimizer -from __future__ import absolute_import, division, print_function, unicode_literals + import copy import logging diff --git a/caffe2/python/optimizer_context.py b/caffe2/python/optimizer_context.py index 483f08dc5aff..d1593f440383 100644 --- a/caffe2/python/optimizer_context.py +++ b/caffe2/python/optimizer_context.py @@ -1,9 +1,9 @@ ## @package optimizer_context # Module caffe2.python.optimizer_context -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import context from caffe2.python.modifier_context import ( diff --git a/caffe2/python/optimizer_test.py b/caffe2/python/optimizer_test.py index a45571f19683..90f0932d23f6 100644 --- a/caffe2/python/optimizer_test.py +++ b/caffe2/python/optimizer_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.proto import caffe2_pb2 import caffe2.python.optimizer as optimizer from 
caffe2.python.optimizer import ( diff --git a/caffe2/python/optimizer_test_util.py b/caffe2/python/optimizer_test_util.py index f7df35bfee70..02276b08c176 100644 --- a/caffe2/python/optimizer_test_util.py +++ b/caffe2/python/optimizer_test_util.py @@ -1,9 +1,9 @@ ## @package optimizer_test_util # Module caffe2.python.optimizer_test_util -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import numpy as np diff --git a/caffe2/python/parallel_workers.py b/caffe2/python/parallel_workers.py index 224dbf66b6ce..4ee446610bdb 100644 --- a/caffe2/python/parallel_workers.py +++ b/caffe2/python/parallel_workers.py @@ -1,9 +1,9 @@ # @package parallel_workers # Module caffe2.python.parallel_workers -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + ''' diff --git a/caffe2/python/parallel_workers_test.py b/caffe2/python/parallel_workers_test.py index a3367e6ee351..a9a7c6a078d7 100644 --- a/caffe2/python/parallel_workers_test.py +++ b/caffe2/python/parallel_workers_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest diff --git a/caffe2/python/parallelize_bmuf_distributed_test.py b/caffe2/python/parallelize_bmuf_distributed_test.py index b3647a2007f5..c38a4ccc34d7 100644 --- a/caffe2/python/parallelize_bmuf_distributed_test.py +++ b/caffe2/python/parallelize_bmuf_distributed_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from multiprocessing import Process, Manager diff --git a/caffe2/python/pipeline.py b/caffe2/python/pipeline.py index 5b30da4387f3..4625d0b0458c 100644 --- a/caffe2/python/pipeline.py +++ b/caffe2/python/pipeline.py @@ -1,9 +1,9 @@ ## @package pipeline # Module caffe2.python.pipeline -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, queue_util from caffe2.python.dataio import Reader, Writer diff --git a/caffe2/python/pipeline_test.py b/caffe2/python/pipeline_test.py index 5f57355b25d3..fe00933ac4e1 100644 --- a/caffe2/python/pipeline_test.py +++ b/caffe2/python/pipeline_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python.schema import ( Struct, FetchRecord, NewRecord, FeedRecord, InitEmptyRecord) diff --git a/caffe2/python/predictor/mobile_exporter.py b/caffe2/python/predictor/mobile_exporter.py index 7eea50464504..e0fa90bffb6e 100644 --- a/caffe2/python/predictor/mobile_exporter.py +++ b/caffe2/python/predictor/mobile_exporter.py @@ -1,10 +1,10 @@ ## @package mobile_exporter # Module caffe2.python.mobile_exporter -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, utils from caffe2.proto import caffe2_pb2 import numpy as np diff --git a/caffe2/python/predictor/mobile_exporter_test.py b/caffe2/python/predictor/mobile_exporter_test.py index 1c4cf77ea051..0269ec229888 100644 --- 
a/caffe2/python/predictor/mobile_exporter_test.py +++ b/caffe2/python/predictor/mobile_exporter_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python.test_util import TestCase from caffe2.python import workspace, brew from caffe2.python.model_helper import ModelHelper diff --git a/caffe2/python/predictor/predictor_exporter.py b/caffe2/python/predictor/predictor_exporter.py index e9759862fcb5..c8c68f9f30a0 100644 --- a/caffe2/python/predictor/predictor_exporter.py +++ b/caffe2/python/predictor/predictor_exporter.py @@ -1,9 +1,9 @@ ## @package predictor_exporter # Module caffe2.python.predictor.predictor_exporter -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.proto import metanet_pb2 diff --git a/caffe2/python/predictor/predictor_exporter_test.py b/caffe2/python/predictor/predictor_exporter_test.py index 9c8b16c30705..2a0685fb955c 100644 --- a/caffe2/python/predictor/predictor_exporter_test.py +++ b/caffe2/python/predictor/predictor_exporter_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import tempfile import unittest diff --git a/caffe2/python/predictor/predictor_py_utils.py b/caffe2/python/predictor/predictor_py_utils.py index 1af5923952dc..cc831454a08c 100644 --- a/caffe2/python/predictor/predictor_py_utils.py +++ b/caffe2/python/predictor/predictor_py_utils.py @@ -1,9 +1,9 @@ ## @package predictor_py_utils # Module caffe2.python.predictor.predictor_py_utils -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, scope diff --git a/caffe2/python/predictor/predictor_test.py b/caffe2/python/predictor/predictor_test.py index 26c4cae63b57..64c88006686c 100644 --- a/caffe2/python/predictor/predictor_test.py +++ b/caffe2/python/predictor/predictor_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest import numpy as np diff --git a/caffe2/python/predictor/serde.py b/caffe2/python/predictor/serde.py index af48b2920a87..2b8f1544803d 100644 --- a/caffe2/python/predictor/serde.py +++ b/caffe2/python/predictor/serde.py @@ -1,9 +1,9 @@ ## @package serde # Module caffe2.python.predictor.serde -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + def serialize_protobuf_struct(protobuf_struct): diff --git a/caffe2/python/predictor_constants.py b/caffe2/python/predictor_constants.py index c1e1dedb8b09..eda0c66974f4 100644 --- a/caffe2/python/predictor_constants.py +++ b/caffe2/python/predictor_constants.py @@ -1,9 +1,9 @@ ## @package predictor_constants # Module caffe2.python.predictor_constants -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import caffe2.proto.predictor_consts_pb2 as predictor_consts predictor_constants = predictor_consts.PredictorConsts() diff 
--git a/caffe2/python/python_op_test.py b/caffe2/python/python_op_test.py index 5a8cfe4a9b46..893671b96f45 100644 --- a/caffe2/python/python_op_test.py +++ b/caffe2/python/python_op_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python.core import CreatePythonOperator import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/queue_util.py b/caffe2/python/queue_util.py index 62265758c2f2..c9a91fc27d17 100644 --- a/caffe2/python/queue_util.py +++ b/caffe2/python/queue_util.py @@ -1,9 +1,9 @@ ## @package queue_util # Module caffe2.python.queue_util -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, dataio from caffe2.python.task import TaskGroup diff --git a/caffe2/python/record_queue.py b/caffe2/python/record_queue.py index d5f129a2f902..1170c2bf3a82 100644 --- a/caffe2/python/record_queue.py +++ b/caffe2/python/record_queue.py @@ -3,10 +3,10 @@ """ Implementation of a queue wrapper. """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from caffe2.python.dataio import Reader, Writer diff --git a/caffe2/python/recurrent.py b/caffe2/python/recurrent.py index e5b48894efbc..d4762f08c683 100644 --- a/caffe2/python/recurrent.py +++ b/caffe2/python/recurrent.py @@ -1,9 +1,9 @@ ## @package recurrent # Module caffe2.python.recurrent -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from future.utils import viewitems, viewkeys diff --git a/caffe2/python/regularizer.py b/caffe2/python/regularizer.py index e994de8b0c44..4042149ca80c 100644 --- a/caffe2/python/regularizer.py +++ b/caffe2/python/regularizer.py @@ -1,6 +1,6 @@ # @package optimizer # Module caffe2.python.regularizer -from __future__ import absolute_import, division, print_function, unicode_literals + from caffe2.python import core, utils import numpy as np diff --git a/caffe2/python/regularizer_context.py b/caffe2/python/regularizer_context.py index 6935fdcb47c0..5d79e138b6b7 100644 --- a/caffe2/python/regularizer_context.py +++ b/caffe2/python/regularizer_context.py @@ -1,9 +1,9 @@ # @package regularizer_context # Module caffe2.python.regularizer_context -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import context from caffe2.python.modifier_context import ( diff --git a/caffe2/python/regularizer_test.py b/caffe2/python/regularizer_test.py index 2018040433b4..685feaf93ed2 100644 --- a/caffe2/python/regularizer_test.py +++ b/caffe2/python/regularizer_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function + import caffe2.python.hypothesis_test_util as hu import hypothesis.strategies as st diff --git a/caffe2/python/rnn/__init__.py b/caffe2/python/rnn/__init__.py index a37eb20fda26..3f2ff2d6cc8f 100644 --- a/caffe2/python/rnn/__init__.py +++ b/caffe2/python/rnn/__init__.py @@ -1,5 +1,5 @@ -from __future__ import absolute_import -from 
__future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + diff --git a/caffe2/python/rnn/lstm_comparison.py b/caffe2/python/rnn/lstm_comparison.py index c3bf9b30cea7..dee96413dbe5 100644 --- a/caffe2/python/rnn/lstm_comparison.py +++ b/caffe2/python/rnn/lstm_comparison.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python import workspace, core, lstm_benchmark, utils from copy import copy diff --git a/caffe2/python/rnn/rnn_cell_test_util.py b/caffe2/python/rnn/rnn_cell_test_util.py index 1533c1e3d418..95728d682bfa 100644 --- a/caffe2/python/rnn/rnn_cell_test_util.py +++ b/caffe2/python/rnn/rnn_cell_test_util.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import workspace, scope from caffe2.python.model_helper import ModelHelper diff --git a/caffe2/python/rnn_cell.py b/caffe2/python/rnn_cell.py index 8192b34dc12e..e16bfaaf491e 100644 --- a/caffe2/python/rnn_cell.py +++ b/caffe2/python/rnn_cell.py @@ -1,9 +1,9 @@ ## @package rnn_cell # Module caffe2.python.rnn_cell -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import functools import inspect diff --git a/caffe2/python/schema.py b/caffe2/python/schema.py index 50fe136a5a12..fb7cadf42847 100644 --- a/caffe2/python/schema.py +++ b/caffe2/python/schema.py @@ -13,10 +13,10 @@ walkthrough on how to use schema to store and iterate through a structured in-memory dataset. 
""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import logging import numpy as np diff --git a/caffe2/python/schema_test.py b/caffe2/python/schema_test.py index 28bf5c64a428..dca19a127ef2 100644 --- a/caffe2/python/schema_test.py +++ b/caffe2/python/schema_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, schema import numpy as np diff --git a/caffe2/python/scope.py b/caffe2/python/scope.py index be05aa468d10..11fddc7b0f62 100644 --- a/caffe2/python/scope.py +++ b/caffe2/python/scope.py @@ -1,9 +1,9 @@ ## @package scope # Module caffe2.python.scope -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import contextlib import threading diff --git a/caffe2/python/scope_test.py b/caffe2/python/scope_test.py index b24fc6851428..9bd69eb32902 100644 --- a/caffe2/python/scope_test.py +++ b/caffe2/python/scope_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import scope, core, workspace from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/serialized_test/coverage.py b/caffe2/python/serialized_test/coverage.py index 7ba93f66af6b..2014847242c4 100644 --- a/caffe2/python/serialized_test/coverage.py +++ b/caffe2/python/serialized_test/coverage.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace diff --git a/caffe2/python/serialized_test/serialized_test_util.py b/caffe2/python/serialized_test/serialized_test_util.py index 30810d9d8283..621adca9454e 100644 --- a/caffe2/python/serialized_test/serialized_test_util.py +++ b/caffe2/python/serialized_test/serialized_test_util.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import argparse from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/session.py b/caffe2/python/session.py index 9059e1eabc94..de3b09931a30 100644 --- a/caffe2/python/session.py +++ b/caffe2/python/session.py @@ -1,9 +1,9 @@ ## @package session # Module caffe2.python.session -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace diff --git a/caffe2/python/session_test.py b/caffe2/python/session_test.py index ae5e50d23ec7..fa505c296820 100644 --- a/caffe2/python/session_test.py +++ b/caffe2/python/session_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python.schema import ( Struct, FetchRecord, NewRecord, FeedRecord, InitEmptyRecord) diff --git a/caffe2/python/sparse_to_dense_mask_test.py b/caffe2/python/sparse_to_dense_mask_test.py index 375068ef537e..e62c7e6d41dc 100644 --- 
a/caffe2/python/sparse_to_dense_mask_test.py +++ b/caffe2/python/sparse_to_dense_mask_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python.test_util import TestCase diff --git a/caffe2/python/sparse_to_dense_test.py b/caffe2/python/sparse_to_dense_test.py index 5e6d10823e5f..dc43d2c03394 100644 --- a/caffe2/python/sparse_to_dense_test.py +++ b/caffe2/python/sparse_to_dense_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, workspace from caffe2.python.test_util import TestCase diff --git a/caffe2/python/task.py b/caffe2/python/task.py index 9dcb211274b3..853433d5c38e 100644 --- a/caffe2/python/task.py +++ b/caffe2/python/task.py @@ -1,9 +1,5 @@ ## @package task # Module caffe2.python.task -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals from caffe2.python import core, context from caffe2.python.schema import Field, from_blob_list @@ -354,7 +350,9 @@ def workspace_type(self): def __repr__(self): return "TaskGroup(tasks={}, workspace_type={}, remote_nets={})".format( - self.tasks(), self.workspace_type(), self.remote_nets()) + self._tasks + self._tasks_to_add, + self.workspace_type(), + self.remote_nets()) class TaskOutput(object): diff --git a/caffe2/python/task_test.py b/caffe2/python/task_test.py index f1c51bc5b442..31adb41a0ac9 100644 --- a/caffe2/python/task_test.py +++ b/caffe2/python/task_test.py @@ -1,8 +1,3 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - import unittest from caffe2.python import task @@ -22,3 +17,8 @@ def testRepr(self): ] for obj, want in cases: self.assertEqual(obj.__repr__(), want) + + def testEffectlessRepr(self): + task_group = task.TaskGroup() + _repr = task_group.__repr__() + self.assertFalse(task_group._already_used) diff --git a/caffe2/python/test/blob_deallocation_test.py b/caffe2/python/test/blob_deallocation_test.py index 66d6835c4814..37886618ef45 100644 --- a/caffe2/python/test/blob_deallocation_test.py +++ b/caffe2/python/test/blob_deallocation_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import core, workspace import unittest diff --git a/caffe2/python/test/do_op_test.py b/caffe2/python/test/do_op_test.py index 72e9f83c9540..fcc6918d5350 100644 --- a/caffe2/python/test/do_op_test.py +++ b/caffe2/python/test/do_op_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import core, workspace from caffe2.python.test_util import TestCase diff --git a/caffe2/python/test/executor_test.py b/caffe2/python/test/executor_test.py index 84df86fb05b0..b4db64005f62 100644 --- a/caffe2/python/test/executor_test.py +++ b/caffe2/python/test/executor_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import core, workspace from caffe2.python.test.executor_test_util import ( diff 
--git a/caffe2/python/test/executor_test_util.py b/caffe2/python/test/executor_test_util.py index bf93c49d8cdc..ba10247eaa2e 100644 --- a/caffe2/python/test/executor_test_util.py +++ b/caffe2/python/test/executor_test_util.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + from caffe2.python import ( diff --git a/caffe2/python/test/fakefp16_transform_test.py b/caffe2/python/test/fakefp16_transform_test.py index d58d12ad60de..f98342eba54a 100644 --- a/caffe2/python/test/fakefp16_transform_test.py +++ b/caffe2/python/test/fakefp16_transform_test.py @@ -1,6 +1,6 @@ -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + import unittest from caffe2.python.fakefp16_transform_lib import fakeFp16FuseOps diff --git a/caffe2/python/test/gpu_context_test.py b/caffe2/python/test/gpu_context_test.py index 741f39d6dc8a..9ee8a308cc2e 100644 --- a/caffe2/python/test/gpu_context_test.py +++ b/caffe2/python/test/gpu_context_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import unittest diff --git a/caffe2/python/test/python_protobuf_test.py b/caffe2/python/test/python_protobuf_test.py index 817f5e21a563..7790e0f6d8f5 100644 --- a/caffe2/python/test/python_protobuf_test.py +++ b/caffe2/python/test/python_protobuf_test.py @@ -1,6 +1,6 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + # make sure we use cpp implementation of protobuf import os diff --git a/caffe2/python/test_util.py b/caffe2/python/test_util.py index a2cf3aced07c..94ac41524065 100644 --- a/caffe2/python/test_util.py +++ b/caffe2/python/test_util.py @@ -1,9 +1,9 @@ ## @package test_util # Module caffe2.python.test_util -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np from caffe2.python import core, workspace diff --git a/caffe2/python/text_file_reader.py b/caffe2/python/text_file_reader.py index 52a1b274f086..48f69f90c7b4 100644 --- a/caffe2/python/text_file_reader.py +++ b/caffe2/python/text_file_reader.py @@ -1,9 +1,9 @@ ## @package text_file_reader # Module caffe2.python.text_file_reader -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core from caffe2.python.dataio import Reader from caffe2.python.schema import Scalar, Struct, data_type_for_dtype diff --git a/caffe2/python/timeout_guard.py b/caffe2/python/timeout_guard.py index 07226c128ffe..2314a3ad9c24 100644 --- a/caffe2/python/timeout_guard.py +++ b/caffe2/python/timeout_guard.py @@ -1,9 +1,9 @@ ## @package timeout_guard # Module caffe2.python.timeout_guard -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import contextlib import threading diff --git a/caffe2/python/transformations.py b/caffe2/python/transformations.py index ed0a32788de8..fc1bad34b201 100644 --- a/caffe2/python/transformations.py +++ b/caffe2/python/transformations.py @@ -13,10 +13,10 @@ # limitations under the License. 
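The TaskGroup.__repr__ change above (guarded by the new testEffectlessRepr) is about keeping repr side-effect free; a minimal sketch of the idea, using hypothetical simplified names rather than the real task.py internals, is:

    class TaskGroupSketch(object):
        def __init__(self):
            self._tasks = []
            self._tasks_to_add = []
            self._already_used = False

        def tasks(self):
            # The real tasks() finalizes the group, which is why calling it
            # from __repr__ had an unwanted side effect.
            self._already_used = True
            return self._tasks + self._tasks_to_add

        def __repr__(self):
            # Side-effect free: read the private lists directly.
            return "TaskGroup(tasks={})".format(self._tasks + self._tasks_to_add)

    g = TaskGroupSketch()
    repr(g)
    assert not g._already_used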
############################################################################## -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import caffe2.python._import_c_extension as C diff --git a/caffe2/python/transformations_test.py b/caffe2/python/transformations_test.py index 363ceb19619d..14b97e4939ef 100644 --- a/caffe2/python/transformations_test.py +++ b/caffe2/python/transformations_test.py @@ -13,10 +13,10 @@ # limitations under the License. ############################################################################## -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from hypothesis import given import hypothesis.strategies as st diff --git a/caffe2/python/trt/test_trt.py b/caffe2/python/trt/test_trt.py index e95cb4bd46e3..39d37ca9fa0a 100644 --- a/caffe2/python/trt/test_trt.py +++ b/caffe2/python/trt/test_trt.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace diff --git a/caffe2/python/trt/transform.py b/caffe2/python/trt/transform.py index ce45ae3cb86d..0936941aac03 100644 --- a/caffe2/python/trt/transform.py +++ b/caffe2/python/trt/transform.py @@ -6,10 +6,10 @@ Note that ONNX-TRT enforce an NCHW input! """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python.onnx.helper import c2_native_run_net, c2_native_run_op diff --git a/caffe2/python/tt_core.py b/caffe2/python/tt_core.py index a2011da16b15..314718b76c9d 100644 --- a/caffe2/python/tt_core.py +++ b/caffe2/python/tt_core.py @@ -1,8 +1,8 @@ ## @package tt_core # Module caffe2.python.tt_core -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + + + import numpy as np diff --git a/caffe2/python/tt_core_test.py b/caffe2/python/tt_core_test.py index aec5764e66e5..0cee3b254720 100644 --- a/caffe2/python/tt_core_test.py +++ b/caffe2/python/tt_core_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import unittest diff --git a/caffe2/python/utils.py b/caffe2/python/utils.py index 9cf30d9c06b3..947dd9bf296d 100644 --- a/caffe2/python/utils.py +++ b/caffe2/python/utils.py @@ -1,9 +1,9 @@ # @package utils # Module caffe2.python.utils -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.proto import caffe2_pb2 from caffe2.python.compatibility import container_abcs diff --git a/caffe2/python/utils_test.py b/caffe2/python/utils_test.py index 3921f3d67ca7..ef809bfd8154 100644 --- a/caffe2/python/utils_test.py +++ b/caffe2/python/utils_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + from caffe2.python import core, utils, test_util diff --git a/caffe2/python/workspace.py b/caffe2/python/workspace.py index 
f76fcf75a33a..99983e84f097 100644 --- a/caffe2/python/workspace.py +++ b/caffe2/python/workspace.py @@ -1,9 +1,9 @@ ## @package workspace # Module caffe2.python.workspace -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import collections import contextlib from google.protobuf.message import Message diff --git a/caffe2/python/workspace_test.py b/caffe2/python/workspace_test.py index 7e64220f480e..86dbcf5d70ba 100644 --- a/caffe2/python/workspace_test.py +++ b/caffe2/python/workspace_test.py @@ -1,7 +1,7 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals + + + + import numpy as np import os diff --git a/caffe2/quantization/server/batch_matmul_dnnlowp_op_test.py b/caffe2/quantization/server/batch_matmul_dnnlowp_op_test.py index 08f658ba9608..4f4bad64980c 100644 --- a/caffe2/quantization/server/batch_matmul_dnnlowp_op_test.py +++ b/caffe2/quantization/server/batch_matmul_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections from itertools import product diff --git a/caffe2/quantization/server/batch_permutation_dnnlowp_op_test.py b/caffe2/quantization/server/batch_permutation_dnnlowp_op_test.py index 27a07ece62be..1d3fd2cc369d 100644 --- a/caffe2/quantization/server/batch_permutation_dnnlowp_op_test.py +++ b/caffe2/quantization/server/batch_permutation_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import caffe2.python.hypothesis_test_util as hu import hypothesis.strategies as st diff --git a/caffe2/quantization/server/channel_shuffle_dnnlowp_op_test.py b/caffe2/quantization/server/channel_shuffle_dnnlowp_op_test.py index 82dd1772d5da..24a2269cc850 100644 --- a/caffe2/quantization/server/channel_shuffle_dnnlowp_op_test.py +++ b/caffe2/quantization/server/channel_shuffle_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import caffe2.python.hypothesis_test_util as hu import hypothesis.strategies as st diff --git a/caffe2/quantization/server/compute_equalization_scale.cc b/caffe2/quantization/server/compute_equalization_scale.cc new file mode 100644 index 000000000000..6e2f73ebd840 --- /dev/null +++ b/caffe2/quantization/server/compute_equalization_scale.cc @@ -0,0 +1,96 @@ +// Copyright 2004-present Facebook. All Rights Reserved. 
+#include "caffe2/quantization/server/compute_equalization_scale.h" +#include + +namespace caffe2 { +using namespace std; + +bool ComputeEqualizationScaleOp::RunOnDevice() { + // Generate equalization scale based on the input data (last N samples of + // the activations) and the weight + const auto& X = Input(0); + const auto& W = Input(1); + CAFFE_ENFORCE_EQ(X.dim(), 2); + CAFFE_ENFORCE_EQ(W.dim(), 2); + + const int64_t M = X.size_to_dim(1); + const int64_t N = W.size_to_dim(1); + const int64_t K = W.size_from_dim(1); + auto* S = Output(0, K, at::dtype()); + auto* S_INV = Output(1, K, at::dtype()); + const float* X_data = X.template data(); + const float* W_data = W.template data(); + float* S_data = S->template mutable_data(); + float* S_INV_data = S_INV->template mutable_data(); + + float WcolMax, XcolMax; + for (int64_t j = 0; j < K; j++) { + WcolMax = std::abs(W_data[j]); + XcolMax = std::abs(X_data[j]); + int64_t idx; + for (int64_t i = 0; i < N; i++) { + idx = i * K + j; + WcolMax = std::max(WcolMax, std::abs(W_data[idx])); + } + for (int64_t i = 0; i < M; i++) { + idx = i * K + j; + XcolMax = std::max(XcolMax, std::abs(X_data[idx])); + } + if (WcolMax == 0 || XcolMax == 0) { + S_data[j] = 1; + S_INV_data[j] = 1; + } else { + S_data[j] = std::sqrt(WcolMax / XcolMax); + S_INV_data[j] = 1 / S_data[j]; + } + } + return true; +} + +REGISTER_CPU_OPERATOR(ComputeEqualizationScale, ComputeEqualizationScaleOp); +OPERATOR_SCHEMA(ComputeEqualizationScale) + .NumInputs(2) + .NumOutputs(2) + .SetDoc(R"DOC( +Given a weight matrix W and input matrix X, the output S is the equalization parameter +vector computed from W and X, and S_INV = 1 / S + +S is computed by: +S[j] = max(abs(W[][j])) == 0 || max(abs(X[][j])) == 0 ? 1 : + sqrt(max(abs(W[][j])) / max(abs(X[][j]))), + +)DOC") + .TensorInferenceFunction([](const OperatorDef& /* def */, + const vector& in) { + vector out(2); + + if (in[0].unknown_shape() || in[1].unknown_shape()) { + out[0].set_unknown_shape(true); + out[1].set_unknown_shape(true); + return out; + } + const int64_t K = size_from_dim_(1, GetDimsVector(in[1])); + vector s_shape(2); + s_shape[0] = 1; + s_shape[1] = K; + out[0] = CreateTensorShape(s_shape, TensorProto_DataType_FLOAT); + out[1] = CreateTensorShape(s_shape, TensorProto_DataType_FLOAT); + return out; + }) + .Input( + 0, + "X", + "The input data, or last N samples of the output activations.") + .Input(1, "W", "The weight that we want to equalize with the input.") + .Output( + 0, + "S", + "Scale computed that will be multiplied to the columns of input.") + .Output( + 1, + "S_INV", + "Scale inverse that will be multiplied to the columns of weight.") + .SetDoc( + R"DOC(Operator to compute equalization scale given the input data and weight)DOC"); + +} // namespace caffe2 diff --git a/caffe2/quantization/server/compute_equalization_scale.h b/caffe2/quantization/server/compute_equalization_scale.h new file mode 100644 index 000000000000..a9facf8e1206 --- /dev/null +++ b/caffe2/quantization/server/compute_equalization_scale.h @@ -0,0 +1,18 @@ +// Copyright 2004-present Facebook. All Rights Reserved. 
+ +#pragma once +#include "caffe2/quantization/server/caffe2_dnnlowp_utils.h" +#include "caffe2/quantization/server/dnnlowp.h" + +namespace caffe2 { + +class ComputeEqualizationScaleOp final : public Operator { + public: + ComputeEqualizationScaleOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws) {} + + bool RunOnDevice() override; + +}; // class ComputeEqualizationScaleOp + +} // namespace caffe2 diff --git a/caffe2/quantization/server/compute_equalization_scale_test.py b/caffe2/quantization/server/compute_equalization_scale_test.py new file mode 100644 index 000000000000..74d34c5502d3 --- /dev/null +++ b/caffe2/quantization/server/compute_equalization_scale_test.py @@ -0,0 +1,89 @@ +# Copyright (c) 2016-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## + +from __future__ import absolute_import, division, print_function, unicode_literals + +import caffe2.python.hypothesis_test_util as hu +import hypothesis.strategies as st +import numpy as np +from caffe2.python import core, workspace +from hypothesis import given, settings + + +class TestComputeEqualizationScaleOp(hu.HypothesisTestCase): + @settings(max_examples=10) + @given( + m=st.integers(1, 50), + n=st.integers(1, 50), + k=st.integers(1, 50), + rnd_seed=st.integers(1, 5), + **hu.gcs_cpu_only + ) + def test_compute_equalization_scale(self, m, n, k, rnd_seed, gc, dc): + np.random.seed(rnd_seed) + W = np.random.rand(n, k).astype(np.float32) - 0.5 + X = np.random.rand(m, k).astype(np.float32) - 0.5 + + def ref_compute_equalization_scale(X, W): + S = np.ones([X.shape[1]]) + S_INV = np.ones([X.shape[1]]) + for j in range(W.shape[1]): + WcolMax = np.absolute(W[:, j]).max() + XcolMax = np.absolute(X[:, j]).max() + if WcolMax and XcolMax: + S[j] = np.sqrt(WcolMax / XcolMax) + S_INV[j] = 1 / S[j] + return S, S_INV + + net = core.Net("test") + + ComputeEqualizationScaleOp = core.CreateOperator( + "ComputeEqualizationScale", ["X", "W"], ["S", "S_INV"] + ) + net.Proto().op.extend([ComputeEqualizationScaleOp]) + + self.ws.create_blob("X").feed(X, device_option=gc) + self.ws.create_blob("W").feed(W, device_option=gc) + self.ws.run(net) + + S = self.ws.blobs["S"].fetch() + S_INV = self.ws.blobs["S_INV"].fetch() + S_ref, S_INV_ref = ref_compute_equalization_scale(X, W) + np.testing.assert_allclose(S, S_ref, atol=1e-3, rtol=1e-3) + np.testing.assert_allclose(S_INV, S_INV_ref, atol=1e-3, rtol=1e-3) + + def test_compute_equalization_scale_shape_inference(self): + X = np.array([[1, 2], [2, 4], [6, 7]]).astype(np.float32) + W = np.array([[2, 3], [5, 4], [8, 2]]).astype(np.float32) + ComputeEqualizationScaleOp = core.CreateOperator( + "ComputeEqualizationScale", ["X", "W"], ["S", "S_INV"] + ) + workspace.FeedBlob("X", X) + workspace.FeedBlob("W", W) + + net = core.Net("test_shape_inference") + net.Proto().op.extend([ComputeEqualizationScaleOp]) + shapes, types = workspace.InferShapesAndTypes( + [net], + blob_dimensions={"X": 
X.shape, "W": W.shape}, + blob_types={"X": core.DataType.FLOAT, "W": core.DataType.FLOAT}, + ) + assert ( + "S" in shapes and "S" in types and "S_INV" in shapes and "S_INV" in types + ), "Failed to infer the shape or type of output" + self.assertEqual(shapes["S"], [1, 2]) + self.assertEqual(shapes["S_INV"], [1, 2]) + self.assertEqual(types["S"], core.DataType.FLOAT) + self.assertEqual(types["S_INV"], core.DataType.FLOAT) diff --git a/caffe2/quantization/server/concat_dnnlowp_op_test.py b/caffe2/quantization/server/concat_dnnlowp_op_test.py index 777c523aff87..fc7e897993d4 100644 --- a/caffe2/quantization/server/concat_dnnlowp_op_test.py +++ b/caffe2/quantization/server/concat_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/conv_depthwise_dnnlowp_op_test.py b/caffe2/quantization/server/conv_depthwise_dnnlowp_op_test.py index 70bcf53f44d4..a605ea3fc49e 100644 --- a/caffe2/quantization/server/conv_depthwise_dnnlowp_op_test.py +++ b/caffe2/quantization/server/conv_depthwise_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/conv_dnnlowp_acc16_op_test.py b/caffe2/quantization/server/conv_dnnlowp_acc16_op_test.py index ae2f49cfe20c..68c14b69f058 100644 --- a/caffe2/quantization/server/conv_dnnlowp_acc16_op_test.py +++ b/caffe2/quantization/server/conv_dnnlowp_acc16_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/conv_dnnlowp_op_test.py b/caffe2/quantization/server/conv_dnnlowp_op_test.py index 682a4d787aba..11cd12a4d5bc 100644 --- a/caffe2/quantization/server/conv_dnnlowp_op_test.py +++ b/caffe2/quantization/server/conv_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/conv_groupwise_dnnlowp_acc16_op_test.py b/caffe2/quantization/server/conv_groupwise_dnnlowp_acc16_op_test.py index 9ed9106db0be..715b6f8c01a8 100644 --- a/caffe2/quantization/server/conv_groupwise_dnnlowp_acc16_op_test.py +++ b/caffe2/quantization/server/conv_groupwise_dnnlowp_acc16_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/conv_groupwise_dnnlowp_op_test.py b/caffe2/quantization/server/conv_groupwise_dnnlowp_op_test.py index 773253743c6d..99e914c294b9 100644 --- a/caffe2/quantization/server/conv_groupwise_dnnlowp_op_test.py +++ b/caffe2/quantization/server/conv_groupwise_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/dequantize_dnnlowp_op_test.py b/caffe2/quantization/server/dequantize_dnnlowp_op_test.py index 399ae4363831..5694a553e744 100644 --- a/caffe2/quantization/server/dequantize_dnnlowp_op_test.py +++ b/caffe2/quantization/server/dequantize_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/dnnlowp_test_utils.py b/caffe2/quantization/server/dnnlowp_test_utils.py index 1a41664cb2d1..0d56ea6ac127 100644 --- 
a/caffe2/quantization/server/dnnlowp_test_utils.py +++ b/caffe2/quantization/server/dnnlowp_test_utils.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/elementwise_add_dnnlowp_op_test.py b/caffe2/quantization/server/elementwise_add_dnnlowp_op_test.py index 1cf65f37858a..75bd2f8e4d44 100644 --- a/caffe2/quantization/server/elementwise_add_dnnlowp_op_test.py +++ b/caffe2/quantization/server/elementwise_add_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/elementwise_linear_dnnlowp_op_test.py b/caffe2/quantization/server/elementwise_linear_dnnlowp_op_test.py index 3f199f981331..af1cd0f80684 100644 --- a/caffe2/quantization/server/elementwise_linear_dnnlowp_op_test.py +++ b/caffe2/quantization/server/elementwise_linear_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/elementwise_mul_dnnlowp_op_test.py b/caffe2/quantization/server/elementwise_mul_dnnlowp_op_test.py index b9104f598d08..e31b9d179071 100644 --- a/caffe2/quantization/server/elementwise_mul_dnnlowp_op_test.py +++ b/caffe2/quantization/server/elementwise_mul_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/elementwise_sum_dnnlowp_op_test.py b/caffe2/quantization/server/elementwise_sum_dnnlowp_op_test.py index 9b3caf41ecc5..faf526b8c48d 100644 --- a/caffe2/quantization/server/elementwise_sum_dnnlowp_op_test.py +++ b/caffe2/quantization/server/elementwise_sum_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/elementwise_sum_relu_op.cc b/caffe2/quantization/server/elementwise_sum_relu_op.cc index df4b726c7306..dbb14c0c5ce8 100644 --- a/caffe2/quantization/server/elementwise_sum_relu_op.cc +++ b/caffe2/quantization/server/elementwise_sum_relu_op.cc @@ -42,11 +42,13 @@ class SumReluOp : public SumOp { bool RunOnDevice() override { if (Input(0).template IsType()) { return DoRunWithType(); + } else if (Input(0).template IsType()) { + return DoRunWithType(); } else if (Input(0).template IsType()) { return DoRunWithType(); } else { CAFFE_THROW( - "Sum operator only supports 32-bit float and ints, but", + "Sum operator only supports 32-bit float, 64-bit double and ints, but", " input was of type ", Input(0).dtype().name()); } diff --git a/caffe2/quantization/server/fully_connected_dnnlowp_acc16_op_test.py b/caffe2/quantization/server/fully_connected_dnnlowp_acc16_op_test.py index 68059421cfac..5d77eceb8e04 100644 --- a/caffe2/quantization/server/fully_connected_dnnlowp_acc16_op_test.py +++ b/caffe2/quantization/server/fully_connected_dnnlowp_acc16_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/fully_connected_dnnlowp_op_test.py b/caffe2/quantization/server/fully_connected_dnnlowp_op_test.py index b8c4a3e22812..f1939e198b84 100644 --- a/caffe2/quantization/server/fully_connected_dnnlowp_op_test.py +++ b/caffe2/quantization/server/fully_connected_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ 
import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/fully_connected_fp16_test.py b/caffe2/quantization/server/fully_connected_fp16_test.py index 710207f7caeb..be1e2c8a1ab5 100644 --- a/caffe2/quantization/server/fully_connected_fp16_test.py +++ b/caffe2/quantization/server/fully_connected_fp16_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/fully_connected_rowwise_dnnlowp_op_test.py b/caffe2/quantization/server/fully_connected_rowwise_dnnlowp_op_test.py index a4ba681867ff..284ae56d743e 100644 --- a/caffe2/quantization/server/fully_connected_rowwise_dnnlowp_op_test.py +++ b/caffe2/quantization/server/fully_connected_rowwise_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/gather_dnnlowp_op_test.py b/caffe2/quantization/server/gather_dnnlowp_op_test.py index c1f495260722..c2c7f35a66d4 100644 --- a/caffe2/quantization/server/gather_dnnlowp_op_test.py +++ b/caffe2/quantization/server/gather_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/group_norm_dnnlowp_op_test.py b/caffe2/quantization/server/group_norm_dnnlowp_op_test.py index 93a4163c86bb..30051d95b59c 100644 --- a/caffe2/quantization/server/group_norm_dnnlowp_op_test.py +++ b/caffe2/quantization/server/group_norm_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/int8_gen_quant_params_test.py b/caffe2/quantization/server/int8_gen_quant_params_test.py index f2c7fd81dabb..d208d6f9b575 100644 --- a/caffe2/quantization/server/int8_gen_quant_params_test.py +++ b/caffe2/quantization/server/int8_gen_quant_params_test.py @@ -13,7 +13,7 @@ # limitations under the License. ############################################################################## -from __future__ import absolute_import, division, print_function, unicode_literals + import caffe2.python.hypothesis_test_util as hu import hypothesis.strategies as st diff --git a/caffe2/quantization/server/int8_quant_scheme_blob_fill_test.py b/caffe2/quantization/server/int8_quant_scheme_blob_fill_test.py index f34081aeba24..70f9b0c2f1fa 100644 --- a/caffe2/quantization/server/int8_quant_scheme_blob_fill_test.py +++ b/caffe2/quantization/server/int8_quant_scheme_blob_fill_test.py @@ -13,7 +13,7 @@ # limitations under the License. 
############################################################################## -from __future__ import absolute_import, division, print_function, unicode_literals + import caffe2.python.hypothesis_test_util as hu from caffe2.python import core, workspace diff --git a/caffe2/quantization/server/lstm_unit_dnnlowp_op_test.py b/caffe2/quantization/server/lstm_unit_dnnlowp_op_test.py index 9cd22bd2c491..bcf06ce0274e 100644 --- a/caffe2/quantization/server/lstm_unit_dnnlowp_op_test.py +++ b/caffe2/quantization/server/lstm_unit_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/observer_test.py b/caffe2/quantization/server/observer_test.py index 4299c146b2da..5c2b28e5e6fb 100644 --- a/caffe2/quantization/server/observer_test.py +++ b/caffe2/quantization/server/observer_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import numpy as np from caffe2.python import core, workspace diff --git a/caffe2/quantization/server/pool_dnnlowp_op_test.py b/caffe2/quantization/server/pool_dnnlowp_op_test.py index d581fbef00cd..fedc87ee732a 100644 --- a/caffe2/quantization/server/pool_dnnlowp_op_test.py +++ b/caffe2/quantization/server/pool_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/quantize_dnnlowp_op_test.py b/caffe2/quantization/server/quantize_dnnlowp_op_test.py index caaf456fb84e..e61a28b4b930 100644 --- a/caffe2/quantization/server/quantize_dnnlowp_op_test.py +++ b/caffe2/quantization/server/quantize_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import caffe2.python.hypothesis_test_util as hu import hypothesis.strategies as st diff --git a/caffe2/quantization/server/relu_dnnlowp_op_test.py b/caffe2/quantization/server/relu_dnnlowp_op_test.py index 5e85b4e43ed6..68b5aed049f1 100644 --- a/caffe2/quantization/server/relu_dnnlowp_op_test.py +++ b/caffe2/quantization/server/relu_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/resize_nearest_3d_dnnlowp_op_test.py b/caffe2/quantization/server/resize_nearest_3d_dnnlowp_op_test.py index 47ae47b81106..67017ee0afcc 100644 --- a/caffe2/quantization/server/resize_nearest_3d_dnnlowp_op_test.py +++ b/caffe2/quantization/server/resize_nearest_3d_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import caffe2.python.hypothesis_test_util as hu import hypothesis.strategies as st diff --git a/caffe2/quantization/server/resize_nearest_dnnlowp_op_test.py b/caffe2/quantization/server/resize_nearest_dnnlowp_op_test.py index 6af92a5d2fe5..b12b3908aafa 100644 --- a/caffe2/quantization/server/resize_nearest_dnnlowp_op_test.py +++ b/caffe2/quantization/server/resize_nearest_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import caffe2.python.hypothesis_test_util as hu import hypothesis.strategies as st diff --git a/caffe2/quantization/server/sigmoid_dnnlowp_op_test.py b/caffe2/quantization/server/sigmoid_dnnlowp_op_test.py index 28ff4a0a750b..836745dcf543 100644 --- 
a/caffe2/quantization/server/sigmoid_dnnlowp_op_test.py +++ b/caffe2/quantization/server/sigmoid_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/spatial_batch_norm_dnnlowp_op_test.py b/caffe2/quantization/server/spatial_batch_norm_dnnlowp_op_test.py index b1d34c19d3ae..d7253b1675f4 100644 --- a/caffe2/quantization/server/spatial_batch_norm_dnnlowp_op_test.py +++ b/caffe2/quantization/server/spatial_batch_norm_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/tanh_dnnlowp_op_test.py b/caffe2/quantization/server/tanh_dnnlowp_op_test.py index e0af7af62bba..f73befd25e26 100644 --- a/caffe2/quantization/server/tanh_dnnlowp_op_test.py +++ b/caffe2/quantization/server/tanh_dnnlowp_op_test.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import collections diff --git a/caffe2/quantization/server/utils.py b/caffe2/quantization/server/utils.py index 862ed5a9cd62..9e137cb5f6af 100644 --- a/caffe2/quantization/server/utils.py +++ b/caffe2/quantization/server/utils.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import, division, print_function, unicode_literals + import copy import logging diff --git a/caffe2/queue/queue_ops.h b/caffe2/queue/queue_ops.h index 64ab19937929..bb70e0f85885 100644 --- a/caffe2/queue/queue_ops.h +++ b/caffe2/queue/queue_ops.h @@ -113,6 +113,12 @@ class SafeEnqueueBlobsOp final : public Operator { 1, !status, Output(size)->template mutable_data(), &context_); return true; } + + void Cancel() override { + auto queue = Operator::Inputs()[0] + ->template Get>(); + queue->close(); + } }; template @@ -192,6 +198,12 @@ class SafeDequeueBlobsOp final : public Operator { return true; } + void Cancel() override { + auto queue = Operator::Inputs()[0] + ->template Get>(); + queue->close(); + } + private: int numRecords_; std::vector blobs_; diff --git a/caffe2/sgd/adagrad_fused_op_gpu.cu b/caffe2/sgd/adagrad_fused_op_gpu.cu index 814a24c74183..2347f0cd8bc8 100644 --- a/caffe2/sgd/adagrad_fused_op_gpu.cu +++ b/caffe2/sgd/adagrad_fused_op_gpu.cu @@ -308,69 +308,132 @@ __global__ void rowwise_sparse_adagrad_fused_length_sum_gradient_dedup_kernel( const float LR = lr[0]; // num_indices blocks, each block process one index - int sorted_linear_indice_id = blockIdx.x; // the index of sorted_linear_ind + int sorted_linear_indice_id; + if (ExactBlock) { + sorted_linear_indice_id = + blockIdx.x * blockDim.y + threadIdx.y; // the index of sorted_linear_ind + } else { + sorted_linear_indice_id = blockIdx.x; // the index of sorted_linear_ind + } if (sorted_linear_indice_id >= num_indices) { // don't have warp divergence when embedding dim is multiple of 32 return; } + // the index row in the embedding table + SIndex index = sorted_linear_ind_data[sorted_linear_indice_id]; + // check if this thread block is responsible for this whole linear index bool linear_index_start = (sorted_linear_indice_id == 0 || - sorted_linear_ind_data[sorted_linear_indice_id - 1] != - sorted_linear_ind_data[sorted_linear_indice_id]); + sorted_linear_ind_data[sorted_linear_indice_id - 1] != index); if (!linear_index_start) { // don't have warp divergence when embedding dim is multiple of 32 return; } - // the index row in the embedding table - SIndex index = sorted_linear_ind_data[sorted_linear_indice_id]; 
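Before the rewritten kernel body below: the new ExactBlock fast path assigns one warp per deduplicated index (blockDim.y warps per block), counts duplicates with a ballot-and-popcount loop, and swaps the cub::BlockReduce for a warp-wide shuffle all-reduce (warpReduceAllSum, defined later in this patch in adagrad_fused_op_gpu.cuh). A plain-Python model of that reduction, offered only as an illustrative sketch::

    # Mirrors `val += shfl_xor(val, mask)` for mask = 16, 8, 4, 2, 1: every lane
    # adds its XOR partner's value, so after log2(32) steps all 32 lanes hold
    # the full sum (an "all-reduce": every lane ends up with the result).
    def warp_reduce_all_sum(lane_vals, warp_size=32):
        vals = list(lane_vals)
        mask = warp_size // 2
        while mask:
            vals = [vals[i] + vals[i ^ mask] for i in range(warp_size)]
            mask //= 2
        return vals

    lanes = warp_reduce_all_sum(range(32))
    assert all(v == sum(range(32)) for v in lanes)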
- // find the num of duplicated indices. - int num_dup = 1; - while (sorted_linear_indice_id + num_dup < num_indices && - sorted_linear_ind_data[sorted_linear_indice_id + num_dup] == index) { - num_dup += 1; - } + if (ExactBlock) { + // find the num of duplicated indices. + int num_dup = 1; + while (true) { + int segment_continue = 0; + if (sorted_linear_indice_id + num_dup + threadIdx.x < num_indices) { + segment_continue = + sorted_linear_ind_data[sorted_linear_indice_id + num_dup + threadIdx.x] == + index; + } +#ifndef __HIP_PLATFORM_HCC__ + int32_t num_dup_incr = __popc(__ballot_sync(0xFFFFFFFF, segment_continue)); +#else + int32_t num_dup_incr = __popc(__ballot(segment_continue)); +#endif + num_dup += num_dup_incr; + if (num_dup_incr != kWarpSize) { + break; + } + } - // TODO: Tuning NumThreads for sum_squares - typedef cub::BlockReduce BlockReduce; - __shared__ BlockReduce::TempStorage temp_storage; - int valid = min(block_size, blockDim.x); + float sum_squares = 0.0; + extern __shared__ float x_ij[]; - float sum_squares = 0.0; - __shared__ float row_sum_squares_avg; - extern __shared__ float x_ij[]; + // we need to avoid index collision for the threads in the same block. + // Different threadIdx.y works on different `index`. + int sm_offset = threadIdx.y * block_size; - for (int i = threadIdx.x; i < block_size; i += blockDim.x) { - // i: index in the embedding dimension - float t_x_ij = 0.0; + for (int i = threadIdx.x; i < block_size; i += blockDim.x) { + // i: index in the embedding dimension + float t_x_ij = 0.0; - for (int dup_id = 0; dup_id < num_dup; dup_id++) { - int group = sorted_seg_id_data[sorted_linear_indice_id + dup_id]; - t_x_ij += grad[group * block_size + i]; + for (int dup_id = 0; dup_id < num_dup; dup_id++) { + int group = sorted_seg_id_data[sorted_linear_indice_id + dup_id]; + t_x_ij += grad[group * block_size + i]; + } + t_x_ij += weight_decay * + rand_factor.convertTypeFromParamToTarget(param[index * block_size + i]); + sum_squares += t_x_ij * t_x_ij; + + x_ij[sm_offset + i] = t_x_ij; } - t_x_ij += weight_decay * - rand_factor.convertTypeFromParamToTarget(param[index * block_size + i]);; - sum_squares += t_x_ij * t_x_ij; - x_ij[i] = t_x_ij; - } - float reduce_result = BlockReduce(temp_storage).Sum(sum_squares, valid); - if (threadIdx.x == 0) { - row_sum_squares_avg = reduce_result / static_cast(block_size); - float mom_new = param_mom[index] + static_cast(row_sum_squares_avg); + // We have a strong assumption that blockDim.x = 32, which is equal to the warp size. + float row_sum_squares_avg = warpReduceAllSum(sum_squares) / static_cast(block_size); + float mom_new = param_mom[index] + row_sum_squares_avg; param_mom[index] = mom_new; - } - __syncthreads(); - // update param - float step = LR / (sqrtf(param_mom[index]) + epsilon); - for (int i = threadIdx.x; i < block_size; i += blockDim.x) { - const size_t paramIdx = index * block_size + i; // index for param - param[paramIdx] = - rand_factor.convertTypeFromTargetToParam(param[paramIdx] + x_ij[i] * step); + // update param + float step = LR / (sqrtf(mom_new) + epsilon); + for (int i = threadIdx.x; i < block_size; i += blockDim.x) { + const size_t paramIdx = index * block_size + i; // index for param + param[paramIdx] = rand_factor.convertTypeFromTargetToParam( + rand_factor.convertTypeFromParamToTarget(param[paramIdx]) + x_ij[sm_offset + i] * step); + } + } else { + // find the num of duplicated indices. 
+ int num_dup = 1; + while (sorted_linear_indice_id + num_dup < num_indices && + sorted_linear_ind_data[sorted_linear_indice_id + num_dup] == index) { + num_dup += 1; + } + + // TODO: Tuning NumThreads for sum_squares + typedef cub::BlockReduce BlockReduce; + __shared__ BlockReduce::TempStorage temp_storage; + int valid = min(block_size, blockDim.x); + + float sum_squares = 0.0; + __shared__ float row_sum_squares_avg; + extern __shared__ float x_ij[]; + + for (int i = threadIdx.x; i < block_size; i += blockDim.x) { + // i: index in the embedding dimension + float t_x_ij = 0.0; + + for (int dup_id = 0; dup_id < num_dup; dup_id++) { + int group = sorted_seg_id_data[sorted_linear_indice_id + dup_id]; + t_x_ij += grad[group * block_size + i]; + } + t_x_ij += weight_decay * + rand_factor.convertTypeFromParamToTarget(param[index * block_size + i]); + sum_squares += t_x_ij * t_x_ij; + x_ij[i] = t_x_ij; + } + float reduce_result = BlockReduce(temp_storage).Sum(sum_squares, valid); + + if (threadIdx.x == 0) { + row_sum_squares_avg = reduce_result / static_cast(block_size); + float mom_new = param_mom[index] + row_sum_squares_avg; + param_mom[index] = mom_new; + } + __syncthreads(); + + // update param + float step = LR / (sqrtf(param_mom[index]) + epsilon); + for (int i = threadIdx.x; i < block_size; i += blockDim.x) { + const size_t paramIdx = index * block_size + i; // index for param + param[paramIdx] = rand_factor.convertTypeFromTargetToParam( + rand_factor.convertTypeFromParamToTarget(param[paramIdx]) + x_ij[i] * step); + } } } @@ -570,7 +633,10 @@ class CUDASparseAdagradFusedWithSparseLengthsSumGradientOp final is_mean ? grad_buffer_.template mutable_data() : NULL; if (is_mean) { gradient_mean_kernel - <<>>( + <<>>( grad, lengths, grad_buffer_data, block_size); } @@ -934,7 +1000,10 @@ class CUDARowWiseSparseAdagradFusedWithSparseLengthsSumGradientOp final is_mean ? grad_buffer_.template mutable_data() : NULL; if (is_mean) { gradient_mean_kernel - <<>>( + <<>>( grad, lengths, grad_buffer_data, block_size); } @@ -1179,10 +1248,7 @@ class CUDARowWiseSparseAdagradFusedWithSparseLengthsSumGradientExactOp final sorted_seg_id_buffer_.ResizeLike(indicesInput); linear_index_weight_offsets_dedup_kernel - <<>>( + <<>>( indices, prefix_sum_length_data, seg_id_buffer_.template mutable_data()); @@ -1206,60 +1272,137 @@ class CUDARowWiseSparseAdagradFusedWithSparseLengthsSumGradientExactOp final seed.y = maxThreads * block_size; } - CAFFE_ENFORCE_LE(block_size, 10240, - "Block size is too big and will exceed the max size of the shared memory"); - if (round_option_ == STOCHASTIC) { - rowwise_sparse_adagrad_fused_length_sum_gradient_dedup_kernel< - IndexType, - TParam, - T, - false, - STOCHASTIC> - <<>>( - prefix_sum_length_data, - N, - block_size, - num_lengths, - num_indices, - epsilon_, - paramOut, - momentOut, - indices, - is_mean ? grad_buffer_data : grad, - sorted_linear_ind_buffer_.template data(), - sorted_seg_id_buffer_.template data(), - lr, - seed, - weight_decay_); + if (block_size <= maxThreads / 2 && block_size % 32 == 0) { + // Fast path when the embedding dimension is a multiple of 32, using + // WarpReduce. 
+ constexpr int kWarpNum = 8; + const dim3 threads(kWarpSize, kWarpNum); + const dim3 blocks((num_indices + kWarpNum - 1) / kWarpNum); + CAFFE_ENFORCE_LE( + kWarpNum * kWarpSize, + maxThreads, + "the total number of threads in a block should be smaller than or equal to maxThreads"); + + const int sm_size = block_size * kWarpNum * sizeof(float); + // Maximum shared memory allocated per thread block is 48 KB on Maxwell/Pascal + CAFFE_ENFORCE_LE( + sm_size, + 1024 * 48, + "Block size is too big and will exceed the max size of the shared memory"); + + if (round_option_ == STOCHASTIC) { + rowwise_sparse_adagrad_fused_length_sum_gradient_dedup_kernel< + IndexType, + TParam, + T, + true, + STOCHASTIC> + <<>>( + prefix_sum_length_data, + N, + block_size, + num_lengths, + num_indices, + epsilon_, + paramOut, + momentOut, + indices, + is_mean ? grad_buffer_data : grad, + sorted_linear_ind_buffer_.template data(), + sorted_seg_id_buffer_.template data(), + lr, + seed, + weight_decay_); + } else { + rowwise_sparse_adagrad_fused_length_sum_gradient_dedup_kernel< + IndexType, + TParam, + T, + true, + NEAREST> + <<>>( + prefix_sum_length_data, + N, + block_size, + num_lengths, + num_indices, + epsilon_, + paramOut, + momentOut, + indices, + is_mean ? grad_buffer_data : grad, + sorted_linear_ind_buffer_.template data(), + sorted_seg_id_buffer_.template data(), + lr, + seed, + weight_decay_); + } } else { - rowwise_sparse_adagrad_fused_length_sum_gradient_dedup_kernel< - IndexType, - TParam, - T, - false, - NEAREST> - <<>>( - prefix_sum_length_data, - N, - block_size, - num_lengths, - num_indices, - epsilon_, - paramOut, - momentOut, - indices, - is_mean ? grad_buffer_data : grad, - sorted_linear_ind_buffer_.template data(), - sorted_seg_id_buffer_.template data(), - lr, - seed, - weight_decay_); + const int sm_size = block_size * sizeof(float); + // Maximum shared memory allocated per thread block is 48 KB on Maxwell/Pascal + CAFFE_ENFORCE_LE( + sm_size, + 1024 * 48, + "Block size is too big and will exceed the max size of the shared memory"); + if (round_option_ == STOCHASTIC) { + rowwise_sparse_adagrad_fused_length_sum_gradient_dedup_kernel< + IndexType, + TParam, + T, + false, + STOCHASTIC> + <<>>( + prefix_sum_length_data, + N, + block_size, + num_lengths, + num_indices, + epsilon_, + paramOut, + momentOut, + indices, + is_mean ? grad_buffer_data : grad, + sorted_linear_ind_buffer_.template data(), + sorted_seg_id_buffer_.template data(), + lr, + seed, + weight_decay_); + } else { + rowwise_sparse_adagrad_fused_length_sum_gradient_dedup_kernel< + IndexType, + TParam, + T, + false, + NEAREST> + <<>>( + prefix_sum_length_data, + N, + block_size, + num_lengths, + num_indices, + epsilon_, + paramOut, + momentOut, + indices, + is_mean ? 
grad_buffer_data : grad, + sorted_linear_ind_buffer_.template data(), + sorted_seg_id_buffer_.template data(), + lr, + seed, + weight_decay_); + } } return true; diff --git a/caffe2/sgd/adagrad_fused_op_gpu.cuh b/caffe2/sgd/adagrad_fused_op_gpu.cuh index 9a5f53bead12..e695dac37e4d 100644 --- a/caffe2/sgd/adagrad_fused_op_gpu.cuh +++ b/caffe2/sgd/adagrad_fused_op_gpu.cuh @@ -26,6 +26,27 @@ namespace caffe2 { +constexpr int kWarpSize = 32; + +template +inline __device__ T shfl_xor(const T val, int laneMask, int width = kWarpSize) { +#ifndef __HIP_PLATFORM_HCC__ + return __shfl_xor_sync(0xffffffff, val, laneMask, width); +#else + return __shfl_xor(val, laneMask, width); +#endif +} + +/// Sums a register value across all warp threads +template +inline __device__ T warpReduceAllSum(T val) { +#pragma unroll + for (int mask = ReduceWidth / 2; mask > 0; mask >>= 1) { + val += shfl_xor(val, mask); + } + return val; +} + enum roundOption : int { NEAREST = 0, STOCHASTIC = 1 }; template diff --git a/caffe2/sgd/learning_rate_op.cc b/caffe2/sgd/learning_rate_op.cc index 534f89d68360..e8172ab65efe 100644 --- a/caffe2/sgd/learning_rate_op.cc +++ b/caffe2/sgd/learning_rate_op.cc @@ -164,7 +164,7 @@ C10_EXPORT_CAFFE2_OP_TO_C10_CPU( "int? max_iter = -1, " "int? num_iter = 0, " "float? start_multiplier = 0, " - "float? end_mulitplier = 0, " + "float? end_multiplier = 0, " "float? multiplier = 0.5, " "float? multiplier_1 = 1.0, " "float? multiplier_2 = 1.0, " @@ -184,5 +184,6 @@ C10_EXPORT_CAFFE2_OP_TO_C10_CPU( "float? cosine_max_lr = 0.05, " "int? cosine_period = 50, " "float? cosine_t_mult = 1.0, " - "float? cosine_lr_shrink = 0.99) -> Tensor output", + "float? cosine_lr_shrink = 0.99, " + "float? decay = 1.0) -> Tensor output", LearningRateOpFloatCPU); diff --git a/caffe2/sgd/learning_rate_op.h b/caffe2/sgd/learning_rate_op.h index fa35ad4c8d6f..fb0998a65d71 100644 --- a/caffe2/sgd/learning_rate_op.h +++ b/caffe2/sgd/learning_rate_op.h @@ -62,7 +62,7 @@ class LearningRateOp final : public Operator { active_period, inactive_period, active_first); } else if (policy == "hill") { int64_t num_iter = - this->template GetSingleArgument(arg_prefix + "num_iter", 0); + this->template GetSingleArgument(arg_prefix + "num_iter", 0); DCHECK_GT(num_iter, 0); T start_multiplier = this->template GetSingleArgument( arg_prefix + "start_multiplier", 0.); @@ -81,13 +81,13 @@ class LearningRateOp final : public Operator { return new HillLearningRate( num_iter, start_multiplier, gamma, power, end_multiplier); } else if (policy == "slope") { - int64_t num_iter_1 = - this->template GetSingleArgument(arg_prefix + "num_iter_1", 0); + int64_t num_iter_1 = this->template GetSingleArgument( + arg_prefix + "num_iter_1", 0); DCHECK_GT(num_iter_1, 0); T multiplier_1 = this->template GetSingleArgument( arg_prefix + "multiplier_1", 0.); - int64_t num_iter_2 = - this->template GetSingleArgument(arg_prefix + "num_iter_2", 0); + int64_t num_iter_2 = this->template GetSingleArgument( + arg_prefix + "num_iter_2", 0); DCHECK_GT(num_iter_1, 0); T multiplier_2 = this->template GetSingleArgument( arg_prefix + "multiplier_2", 0.); @@ -191,16 +191,16 @@ class LearningRateOp final : public Operator { int stepsize = this->template GetSingleArgument(arg_prefix + "stepsize", 0); T decay = - this->template GetSingleArgument(arg_prefix + "decay", 1.0); + this->template GetSingleArgument(arg_prefix + "decay", 1.0); DCHECK_GT(stepsize, 0); DCHECK_GE(max_lr, base_lr_); return new CyclicalLearningRate(base_lr_, max_lr, stepsize, decay); } else if (policy == 
"constantThenLinearWarmup") { T start_warmup_multiplier = this->template GetSingleArgument( arg_prefix + "start_warmup_multiplier", 0.1); - int64_t constant_warmup_num_iter = this->template GetSingleArgument( + int64_t constant_warmup_num_iter = this->template GetSingleArgument( arg_prefix + "constant_warmup_num_iter", 10000000); - int64_t linear_warmup_num_iter = this->template GetSingleArgument( + int64_t linear_warmup_num_iter = this->template GetSingleArgument( arg_prefix + "linear_warmup_num_iter", 10000000); return new ConstantThenLinearWarmupLearningRate( start_warmup_multiplier, @@ -209,9 +209,9 @@ class LearningRateOp final : public Operator { } else if (policy == "compositeCyclical") { T start_warmup_multiplier = this->template GetSingleArgument( arg_prefix + "start_warmup_multiplier", 0.1); - int64_t constant_warmup_num_iter = this->template GetSingleArgument( + int64_t constant_warmup_num_iter = this->template GetSingleArgument( arg_prefix + "constant_warmup_num_iter", 10000000); - int64_t linear_warmup_num_iter = this->template GetSingleArgument( + int64_t linear_warmup_num_iter = this->template GetSingleArgument( arg_prefix + "linear_warmup_num_iter", 10000000); T cyclical_max_lr = this->template GetSingleArgument( arg_prefix + "cyclical_max_lr", 0.05); @@ -245,9 +245,9 @@ class LearningRateOp final : public Operator { } else if (policy == "compositeCosine") { T start_warmup_multiplier = this->template GetSingleArgument( arg_prefix + "start_warmup_multiplier", 0.1); - int64_t constant_warmup_num_iter = this->template GetSingleArgument( + int64_t constant_warmup_num_iter = this->template GetSingleArgument( arg_prefix + "constant_warmup_num_iter", 10000000); - int64_t linear_warmup_num_iter = this->template GetSingleArgument( + int64_t linear_warmup_num_iter = this->template GetSingleArgument( arg_prefix + "linear_warmup_num_iter", 10000000); T cosine_max_lr = this->template GetSingleArgument( arg_prefix + "cosine_max_lr", 0.5); diff --git a/caffe2/utils/CMakeLists.txt b/caffe2/utils/CMakeLists.txt index 798985953b89..62190501cdac 100644 --- a/caffe2/utils/CMakeLists.txt +++ b/caffe2/utils/CMakeLists.txt @@ -1,9 +1,13 @@ if((NOT BUILD_CAFFE2) OR (INTERN_BUILD_MOBILE AND NOT BUILD_CAFFE2_MOBILE)) list(APPEND Caffe2_CPU_SRCS utils/string_utils.cc - utils/threadpool/pthreadpool-cpp.cc utils/threadpool/ThreadPool.cc ) + + if(USE_PTHREADPOOL AND NOT USE_INTERNAL_PTHREADPOOL_IMPL) + list(APPEND Caffe2_CPU_SRCS utils/threadpool/pthreadpool-cpp.cc) + endif() + if(NOT BUILD_CAFFE2) list(APPEND Caffe2_CPU_SRCS utils/proto_wrap.cc diff --git a/caffe2/utils/GpuDefs.cuh b/caffe2/utils/GpuDefs.cuh index 46d8058c84b5..be591cc95b92 100644 --- a/caffe2/utils/GpuDefs.cuh +++ b/caffe2/utils/GpuDefs.cuh @@ -7,16 +7,9 @@ namespace caffe2 { // Static definition of GPU warp size for unrolling and code generation -#ifdef __CUDA_ARCH__ -#if __CUDA_ARCH__ <= 800 -constexpr int kWarpSize = 32; -#else -#error Unknown __CUDA_ARCH__; please define parameters for compute capability -#endif // __CUDA_ARCH__ types -#elif defined(__HIP_PLATFORM_HCC__) +#if defined(__HIP_PLATFORM_HCC__) constexpr int kWarpSize = warpSize; // = 64 (Defined in hip_runtime.h) #else -// dummy value for host compiler constexpr int kWarpSize = 32; #endif // __CUDA_ARCH__ diff --git a/cmake/Codegen.cmake b/cmake/Codegen.cmake index 63e2d9f4d934..dbfd55e2d0d5 100644 --- a/cmake/Codegen.cmake +++ b/cmake/Codegen.cmake @@ -144,7 +144,7 @@ if(INTERN_BUILD_ATEN_OPS) endforeach() list(APPEND ATen_CPU_SRCS ${cpu_kernel_cpp}) - file(GLOB 
all_python "${CMAKE_CURRENT_LIST_DIR}/../tools/codegen/*.py") + file(GLOB_RECURSE all_python "${CMAKE_CURRENT_LIST_DIR}/../tools/codegen/*.py") set(GEN_ROCM_FLAG) if(USE_ROCM) @@ -167,7 +167,7 @@ if(INTERN_BUILD_ATEN_OPS) endif() execute_process( COMMAND - "${PYTHON_EXECUTABLE}" ${CMAKE_CURRENT_LIST_DIR}/../tools/code_analyzer/gen_op_registration_whitelist.py + "${PYTHON_EXECUTABLE}" ${CMAKE_CURRENT_LIST_DIR}/../tools/code_analyzer/gen_op_registration_allowlist.py --op-dependency "${OP_DEPENDENCY}" --root-ops "${SELECTED_OP_LIST}" OUTPUT_VARIABLE OP_REGISTRATION_WHITELIST diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 028098f61d36..1bbb98fb3614 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1253,10 +1253,7 @@ if(USE_CUDA) endif() if(USE_GLOO) - if(MSVC) - message(WARNING "Gloo can not be used on Windows.") - caffe2_update_option(USE_GLOO OFF) - elseif(NOT CMAKE_SIZEOF_VOID_P EQUAL 8) + if(NOT CMAKE_SIZEOF_VOID_P EQUAL 8) message(WARNING "Gloo can only be used on 64-bit systems.") caffe2_update_option(USE_GLOO OFF) else() @@ -1507,7 +1504,8 @@ if(NOT INTERN_BUILD_MOBILE) if(CUDA_HAS_FP16 OR NOT ${CUDA_VERSION} LESS 7.5) message(STATUS "Found CUDA with FP16 support, compiling with torch.cuda.HalfTensor") - list(APPEND CUDA_NVCC_FLAGS "-DCUDA_HAS_FP16=1" "-D__CUDA_NO_HALF_OPERATORS__" "-D__CUDA_NO_HALF_CONVERSIONS__" "-D__CUDA_NO_HALF2_OPERATORS__") + list(APPEND CUDA_NVCC_FLAGS "-DCUDA_HAS_FP16=1" "-D__CUDA_NO_HALF_OPERATORS__" "-D__CUDA_NO_HALF_CONVERSIONS__" + "-D__CUDA_NO_BFLOAT16_CONVERSIONS__" "-D__CUDA_NO_HALF2_OPERATORS__") add_compile_options(-DCUDA_HAS_FP16=1) else() message(STATUS "Could not find CUDA with FP16 support, compiling without torch.CudaHalfTensor") diff --git a/cmake/Modules_CUDA_fix/upstream/FindCUDA/select_compute_arch.cmake b/cmake/Modules_CUDA_fix/upstream/FindCUDA/select_compute_arch.cmake index c17dfa751417..9caf2f408a16 100644 --- a/cmake/Modules_CUDA_fix/upstream/FindCUDA/select_compute_arch.cmake +++ b/cmake/Modules_CUDA_fix/upstream/FindCUDA/select_compute_arch.cmake @@ -84,9 +84,19 @@ endif() if(CUDA_VERSION VERSION_GREATER "10.5") list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Ampere") - list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "8.0" "8.0+PTX") + list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "8.0") list(APPEND CUDA_ALL_GPU_ARCHITECTURES "8.0") + if(CUDA_VERSION VERSION_LESS "11.1") + set(CUDA_LIMIT_GPU_ARCHITECTURE "8.6") + list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "8.0+PTX") + endif() +endif() + +if(CUDA_VERSION VERSION_GREATER "11.0") + list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "8.6" "8.6+PTX") + list(APPEND CUDA_ALL_GPU_ARCHITECTURES "8.6") + if(CUDA_VERSION VERSION_LESS "12.0") set(CUDA_LIMIT_GPU_ARCHITECTURE "9.0") endif() diff --git a/cmake/ProtoBuf.cmake b/cmake/ProtoBuf.cmake index 32cab7557f3b..9a4ad35567bd 100644 --- a/cmake/ProtoBuf.cmake +++ b/cmake/ProtoBuf.cmake @@ -196,7 +196,7 @@ function(caffe2_protobuf_generate_cpp_py srcs_var hdrs_var python_var) # If we remove all reference to these pb.h files from external # libraries and binaries this rewrite can be removed. 
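Restating the select_compute_arch.cmake hunk above in executable form, since the nested version checks are easy to misread: CUDA 11.0 gains SM 8.0 (plus 8.0+PTX) and caps architecture detection below 8.6, while CUDA 11.1 and newer also target SM 8.6 (plus 8.6+PTX) and cap detection below 9.0. A Python paraphrase of that logic, as an illustrative sketch only::

    # Editor's paraphrase of the CMake branches; version tuples stand in for
    # CMake's VERSION_GREATER / VERSION_LESS comparisons.
    def ampere_arch_selection(cuda_version):
        common, limit = [], None
        if cuda_version > (10, 5):
            common.append("8.0")
            if cuda_version < (11, 1):
                limit = "8.6"
                common.append("8.0+PTX")
        if cuda_version > (11, 0):
            common += ["8.6", "8.6+PTX"]
            if cuda_version < (12, 0):
                limit = "9.0"
        return common, limit

    print(ampere_arch_selection((11, 0)))  # (['8.0', '8.0+PTX'], '8.6')
    print(ampere_arch_selection((11, 1)))  # (['8.0', '8.6', '8.6+PTX'], '9.0')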
- COMMAND ${CMAKE_COMMAND} -DFILENAME=${CMAKE_CURRENT_BINARY_DIR}/${fil_we}.pb.h -DNAMESPACES=caffe\;caffe2\;onnx\;torch -P ${PROJECT_SOURCE_DIR}/cmake/ProtoBufPatch.cmake + COMMAND ${CMAKE_COMMAND} -DFILENAME=${CMAKE_CURRENT_BINARY_DIR}/${fil_we}.pb.h -DNAMESPACES=caffe\;caffe2\;onnx\;torch -DLOCAL_PROTOBUF=${CAFFE2_LINK_LOCAL_PROTOBUF} -P ${PROJECT_SOURCE_DIR}/cmake/ProtoBufPatch.cmake DEPENDS ${CAFFE2_PROTOC_EXECUTABLE} ${abs_fil} COMMENT "Running C++/Python protocol buffer compiler on ${fil}" VERBATIM ) @@ -209,6 +209,7 @@ function(caffe2_protobuf_generate_cpp_py srcs_var hdrs_var python_var) COMMAND ${CMAKE_COMMAND} -E make_directory "${CMAKE_CURRENT_BINARY_DIR}" COMMAND ${CAFFE2_PROTOC_EXECUTABLE} -I${PROJECT_SOURCE_DIR} --cpp_out=${DLLEXPORT_STR}${PROJECT_BINARY_DIR} ${abs_fil} COMMAND ${CAFFE2_PROTOC_EXECUTABLE} -I${PROJECT_SOURCE_DIR} --python_out "${PROJECT_BINARY_DIR}" ${abs_fil} + COMMAND ${CMAKE_COMMAND} -DFILENAME=${CMAKE_CURRENT_BINARY_DIR}/${fil_we}.pb.h -DNAMESPACES=caffe\;caffe2\;onnx\;torch -DLOCAL_PROTOBUF=${CAFFE2_LINK_LOCAL_PROTOBUF} -P ${PROJECT_SOURCE_DIR}/cmake/ProtoBufPatch.cmake DEPENDS ${CAFFE2_PROTOC_EXECUTABLE} ${abs_fil} COMMENT "Running C++/Python protocol buffer compiler on ${fil}" VERBATIM ) endif() diff --git a/cmake/ProtoBufPatch.cmake b/cmake/ProtoBufPatch.cmake index 2124b6189799..704dcd7da154 100644 --- a/cmake/ProtoBufPatch.cmake +++ b/cmake/ProtoBufPatch.cmake @@ -1,41 +1,83 @@ # CMake file to replace the string contents in ONNX, Caffe, and Caffe2 proto. # Usage example: -# cmake -DFILENAME=caffe2.pb.h -P ProtoBufPatch.cmake +# cmake -DFILENAME=caffe2.pb.h -DLOCAL_PROTOBUF=ON -P ProtoBufPatch.cmake file(READ ${FILENAME} content) -# protobuf-3.6.0 pattern -string( - REPLACE - "::google::protobuf::internal::GetEmptyStringAlreadyInited" - "GetEmptyStringAlreadyInited" - content - "${content}") +if(LOCAL_PROTOBUF) + # protobuf-3.6.0 pattern + string( + REPLACE + "::google::protobuf::internal::GetEmptyStringAlreadyInited" + "GetEmptyStringAlreadyInited" + content + "${content}") -# protobuf-3.8.0+ pattern -string( - REPLACE - "::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited" - "GetEmptyStringAlreadyInited" - content - "${content}") + # protobuf-3.8.0+ pattern + string( + REPLACE + "::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited" + "GetEmptyStringAlreadyInited" + content + "${content}") -string( - REPLACE - "PROTOBUF_CONSTEXPR" - "" - content - "${content}") + string( + REPLACE + "PROTOBUF_CONSTEXPR" + "" + content + "${content}") -# https://github.com/protocolbuffers/protobuf/commit/0400cca3236de1ca303af38bf81eab332d042b7c -# changes PROTOBUF_CONSTEXPR to constexpr, which breaks windows -# build. -string( - REGEX REPLACE - "static constexpr ([^ ]+) ([^ ]+) =" - "static \\1 const \\2 =" - content - "${content}") + # https://github.com/protocolbuffers/protobuf/commit/0400cca3236de1ca303af38bf81eab332d042b7c + # changes PROTOBUF_CONSTEXPR to constexpr, which breaks windows + # build. + string( + REGEX REPLACE + "static constexpr ([^ ]+) ([^ ]+) =" + "static \\1 const \\2 =" + content + "${content}") + + foreach(ns ${NAMESPACES}) + # Insert "const ::std::string& GetEmptyStringAlreadyInited();" within + # the namespace and make sure we only do it once in the file. Unfortunately + # using string(REPLACE ...) doesn't work because it will replace at all + # locations and there might be multiple declarations of the namespace + # depending on how the proto is structured. 
+ set(search "namespace ${ns} {") + string(LENGTH "${search}" search_len) + string(FIND "${content}" "${search}" pos) + if(${pos} GREATER -1) + math(EXPR pos "${pos}+${search_len}") + string(SUBSTRING "${content}" 0 ${pos} content_pre) + string(SUBSTRING "${content}" ${pos} -1 content_post) + string( + CONCAT + content + "${content_pre}" + " const ::std::string& GetEmptyStringAlreadyInited(); " + "${content_post}") + endif() + endforeach() + + # The moving constructor is defined in the header file, which will cause + # a link error that claims that the vftable is not found. Luckily, we + # could move the definition into the source file to solve the problem. + list(LENGTH NAMESPACES ns_count) + if("${FILENAME}" MATCHES ".pb.h" AND ns_count EQUAL 1) + string(REPLACE ".pb.h" ".pb.cc" SOURCE_FILENAME ${FILENAME}) + file(READ ${SOURCE_FILENAME} content_cc_origin) + + string(REGEX MATCHALL "([a-zA-Z_]+)\\([a-zA-Z_]+&& from\\) noexcept[^}]*}" content_cc "${content}") + string(REGEX REPLACE "};" "}\n" content_cc "${content_cc}") + string(REGEX REPLACE "([a-zA-Z_]+)\\([a-zA-Z_]+&& from\\) noexcept" " \\1::\\1(\\1&& from) noexcept" content_cc "${content_cc}") + set(content_cc "${content_cc_origin}\nnamespace ${NAMESPACES} {\n#if LANG_CXX11\n${content_cc}\n#endif\n}") + + string(REGEX REPLACE "([a-zA-Z_]+)\\([a-zA-Z_]+&& from\\) noexcept([^}]*)}" "\\1(\\1&& from) noexcept;" content "${content}") + + file(WRITE ${SOURCE_FILENAME} "${content_cc}") + endif() +endif() # constexpr int TensorBoundShape_DimType_DimType_ARRAYSIZE = TensorBoundShape_DimType_DimType_MAX + 1; # throws @@ -53,44 +95,4 @@ string( content "${content}") -foreach(ns ${NAMESPACES}) - # Insert "const ::std::string& GetEmptyStringAlreadyInited();" within - # the namespace and make sure we only do it once in the file. Unfortunately - # using string(REPLACE ...) doesn't work because it will replace at all - # locations and there might be multiple declarations of the namespace - # depending on how the proto is structured. - set(search "namespace ${ns} {") - string(LENGTH "${search}" search_len) - string(FIND "${content}" "${search}" pos) - if(${pos} GREATER -1) - math(EXPR pos "${pos}+${search_len}") - string(SUBSTRING "${content}" 0 ${pos} content_pre) - string(SUBSTRING "${content}" ${pos} -1 content_post) - string( - CONCAT - content - "${content_pre}" - " const ::std::string& GetEmptyStringAlreadyInited(); " - "${content_post}") - endif() -endforeach() - -# The moving constructor is defined in the header file, which will cause -# a link error that claims that the vftable is not found. Luckily, we -# could move the definition into the source file to solve the problem. 
-list(LENGTH NAMESPACES ns_count) -if("${FILENAME}" MATCHES ".pb.h" AND ns_count EQUAL 1) - string(REPLACE ".pb.h" ".pb.cc" SOURCE_FILENAME ${FILENAME}) - file(READ ${SOURCE_FILENAME} content_cc_origin) - - string(REGEX MATCHALL "([a-zA-Z_]+)\\([a-zA-Z_]+&& from\\) noexcept[^}]*}" content_cc "${content}") - string(REGEX REPLACE "};" "}\n" content_cc "${content_cc}") - string(REGEX REPLACE "([a-zA-Z_]+)\\([a-zA-Z_]+&& from\\) noexcept" " \\1::\\1(\\1&& from) noexcept" content_cc "${content_cc}") - set(content_cc "${content_cc_origin}\nnamespace ${NAMESPACES} {\n#if LANG_CXX11\n${content_cc}\n#endif\n}") - - string(REGEX REPLACE "([a-zA-Z_]+)\\([a-zA-Z_]+&& from\\) noexcept([^}]*)}" "\\1(\\1&& from) noexcept;" content "${content}") - - file(WRITE ${SOURCE_FILENAME} "${content_cc}") -endif() - file(WRITE ${FILENAME} "${content}") diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake index 3d4da7f06176..9d848c60c987 100644 --- a/cmake/Summary.cmake +++ b/cmake/Summary.cmake @@ -44,6 +44,7 @@ function(caffe2_print_configuration_summary) message(STATUS " Python site-packages: ${PYTHON_SITE_PACKAGES}") endif() message(STATUS " BUILD_SHARED_LIBS : ${BUILD_SHARED_LIBS}") + message(STATUS " CAFFE2_USE_MSVC_STATIC_RUNTIME : ${CAFFE2_USE_MSVC_STATIC_RUNTIME}") message(STATUS " BUILD_TEST : ${BUILD_TEST}") message(STATUS " BUILD_JNI : ${BUILD_JNI}") message(STATUS " BUILD_MOBILE_AUTOGRAD : ${BUILD_MOBILE_AUTOGRAD}") diff --git a/cmake/public/cuda.cmake b/cmake/public/cuda.cmake index 8b60915f7e00..c9ac37783d1c 100644 --- a/cmake/public/cuda.cmake +++ b/cmake/public/cuda.cmake @@ -478,7 +478,7 @@ foreach(diag cc_clobber_ignored integer_sign_change useless_using_declaration endforeach() # Set C++14 support -set(CUDA_PROPAGATE_HOST_FLAGS_BLACKLIST "-Werror") +set(CUDA_PROPAGATE_HOST_FLAGS_BLOCKLIST "-Werror") if(MSVC) list(APPEND CUDA_NVCC_FLAGS "--Werror" "cross-execution-space-call") list(APPEND CUDA_NVCC_FLAGS "--no-host-device-move-forward") @@ -490,7 +490,7 @@ endif() # OpenMP flags for NVCC with Clang-cl if("${CMAKE_CXX_SIMULATE_ID}" STREQUAL "MSVC" AND "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") - list(APPEND CUDA_PROPAGATE_HOST_FLAGS_BLACKLIST "-Xclang" "-fopenmp") + list(APPEND CUDA_PROPAGATE_HOST_FLAGS_BLOCKLIST "-Xclang" "-fopenmp") if(MSVC_TOOLSET_VERSION LESS 142) list(APPEND CUDA_NVCC_FLAGS "-Xcompiler" "-openmp") else() diff --git a/docker.Makefile b/docker.Makefile index ba53b94d7898..18acced1de8d 100644 --- a/docker.Makefile +++ b/docker.Makefile @@ -9,7 +9,7 @@ DOCKER_ORG = $(shell whoami) endif BASE_RUNTIME = ubuntu:18.04 -BASE_DEVEL = nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04 +BASE_DEVEL = nvidia/cuda:11.0-cudnn8-devel-ubuntu18.04 # The conda channel to use to install pytorch / torchvision INSTALL_CHANNEL = pytorch diff --git a/docs/cpp/source/index.rst b/docs/cpp/source/index.rst index 2bfbe63f47c6..39c63ddd5d7b 100644 --- a/docs/cpp/source/index.rst +++ b/docs/cpp/source/index.rst @@ -1,20 +1,20 @@ PyTorch C++ API =============== -These pages provide documentation for the public portions of the PyTorch C++ +These pages provide the documentation for the public portions of the PyTorch C++ API. 
This API can roughly be divided into five parts: -- **ATen**: The foundational tensor and mathematical operation library on which all else is built; -- **Autograd**: Augments ATen with automatic differentiation; -- **C++ Frontend**: High level constructs for training and evaluation of machine learning models; -- **TorchScript**: An interface to the TorchScript JIT compiler and interpreter; +- **ATen**: The foundational tensor and mathematical operation library on which all else is built. +- **Autograd**: Augments ATen with automatic differentiation. +- **C++ Frontend**: High level constructs for training and evaluation of machine learning models. +- **TorchScript**: An interface to the TorchScript JIT compiler and interpreter. - **C++ Extensions**: A means of extending the Python API with custom C++ and CUDA routines. -Together, these building blocks form a research and +Combining, these building blocks form a research and production ready C++ library for tensor computation and dynamic neural networks with strong emphasis on GPU acceleration as well as fast CPU performance. It is currently in use at Facebook in research and -production; we look forward to welcoming more users of the PyTorch C++ API. +production; we are looking forward to welcome more users of the PyTorch C++ API. .. warning:: @@ -76,7 +76,7 @@ C++ Frontend ------------ The PyTorch C++ frontend provides a high level, pure C++ modeling interface for -neural network and general machine learning research and production use cases, +neural network and general ML(Machine Learning) research and production use cases, largely following the Python API in design and provided functionality. The C++ frontend includes the following: @@ -119,7 +119,7 @@ expanded on a continuous and active basis. TorchScript ----------- -TorchScript a representation of a PyTorch model that can be understood, +TorchScript is a representation of a PyTorch model that can be understood, compiled and serialized by the TorchScript compiler. Fundamentally, TorchScript is a programming language in its own right. It is a subset of Python using the PyTorch API. The C++ interface to TorchScript encompasses three primary pieces of @@ -150,7 +150,7 @@ CUDA to accelerate research in vanilla PyTorch setups. The C++ extension API does not add any new functionality to the PyTorch C++ API. Instead, it provides integration with Python setuptools as well as JIT compilation mechanisms that allow access to ATen, the autograd and other C++ APIs from -Python. To learn more about the C++ extension API, see +Python. To learn more about the C++ extension API, go through `this tutorial `_. Contents @@ -183,4 +183,4 @@ Acknowledgements This documentation website for the PyTorch C++ universe has been enabled by the `Exhale `_ project and generous investment of time and effort by its maintainer, `svenevs `_. -We thank Stephen for his work and his help with the PyTorch C++ documentation. +We thank Stephen for his work and his efforts providing help with the PyTorch C++ documentation. 
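As a concrete companion to the C++ API overview edited above: the usual handoff is to script or trace a model on the Python side and serialize it, after which the artifact can be loaded from C++ with torch::jit::load. A minimal Python-side sketch (the Scale module and file name are made up for illustration)::

    import torch

    class Scale(torch.nn.Module):
        """Toy module used only for this illustration."""
        def __init__(self, factor: float):
            super().__init__()
            self.factor = factor

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            return x * self.factor

    scripted = torch.jit.script(Scale(2.0))  # compile to TorchScript
    scripted.save("scale.pt")                # loadable from C++ via torch::jit::load("scale.pt")
    print(scripted(torch.ones(3)))           # tensor([2., 2., 2.])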
diff --git a/docs/source/community/persons_of_interest.rst b/docs/source/community/persons_of_interest.rst index c152ca616571..f346fbe994e6 100644 --- a/docs/source/community/persons_of_interest.rst +++ b/docs/source/community/persons_of_interest.rst @@ -25,7 +25,6 @@ torch.* torch.nn ~~~~~~~~ -- Thomas Viehmann (`t-vi `__) - Adam Paszke (`apaszke `__) - Greg Chanan (`gchanan `__) - Soumith Chintala (`soumith `__) diff --git a/docs/source/data.rst b/docs/source/data.rst index 9ba88f02c31f..c5d6f61b7ba9 100644 --- a/docs/source/data.rst +++ b/docs/source/data.rst @@ -403,6 +403,7 @@ Example:: .. autoclass:: TensorDataset .. autoclass:: ConcatDataset .. autoclass:: ChainDataset +.. autoclass:: BufferedShuffleDataset .. autoclass:: Subset .. autofunction:: torch.utils.data.get_worker_info .. autofunction:: torch.utils.data.random_split diff --git a/docs/source/distributed.rst b/docs/source/distributed.rst index a248d3e4ca83..c83b5a1d34de 100644 --- a/docs/source/distributed.rst +++ b/docs/source/distributed.rst @@ -52,12 +52,22 @@ MPI supports CUDA only if the implementation used to build PyTorch supports it. Backends that come with PyTorch ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -PyTorch distributed currently only supports Linux. By default, the Gloo and NCCL backends -are built and included in PyTorch distributed (NCCL only when building with CUDA). -MPI is an -optional backend that can only be included if you build PyTorch from source. (e.g. -building PyTorch on a host that has MPI installed.) +PyTorch distributed package supports Linux (stable), MacOS (stable), and Windows (prototype). +By default for Linux, the Gloo and NCCL backends are built and included in PyTorch +distributed (NCCL only when building with CUDA). MPI is an optional backend that can only be +included if you build PyTorch from source. (e.g.building PyTorch on a host that has MPI +installed.) +.. warning :: + As of PyTorch v1.7, Windows support for the distributed package only covers collective + communications with Gloo backend, `FileStore`, and `DistributedDataParallel`. Therefore, + the `init_method` argument in :func:`init_process_group` must point to a file. This works + for both local and shared file systems: + + - Local file system, ``init_method="file:///d:/tmp/some_file"`` + - Shared file system, ``init_method="file://////{machine_name}/{share_folder_name}/some_file"`` + + Similarly, if you directly pass in a `store` argument, it must be a ``FileStore`` instance. Which backend to use? ^^^^^^^^^^^^^^^^^^^^^ @@ -260,6 +270,31 @@ The machine with rank 0 will be used to set up all connections. This is the default method, meaning that ``init_method`` does not have to be specified (or can be ``env://``). +Distributed Key-Value Store +--------------------------- + +The distributed package comes with a distributed key-value store, which can be +used to share information between processes in the group as well as to +initialize the distributed pacakge in +:func:`torch.distributed.init_process_group` (by explicitly creating the store +as an alternative to specifying ``init_method``.) There are 3 choices for +Key-Value Stores: :class:`~torch.distributed.TCPStore`, +:class:`~torch.distributed.FileStore`, and :class:`~torch.distributed.HashStore`. + +.. autoclass:: Store +.. autoclass:: TCPStore +.. autoclass:: HashStore +.. autoclass:: FileStore +.. autoclass:: PrefixStore + +.. autofunction:: torch.distributed.Store.set +.. autofunction:: torch.distributed.Store.get +.. autofunction:: torch.distributed.Store.add +.. 
autofunction:: torch.distributed.Store.wait +.. autofunction:: torch.distributed.Store.num_keys +.. autofunction:: torch.distributed.Store.delete_key +.. autofunction:: torch.distributed.Store.set_timeout + Groups ------ diff --git a/docs/source/fft.rst b/docs/source/fft.rst index 8ec06a3574d2..ab50bd271d32 100644 --- a/docs/source/fft.rst +++ b/docs/source/fft.rst @@ -1,6 +1,8 @@ .. role:: hidden :class: hidden-section +.. _torch-fft-module: + torch.fft ========= @@ -19,7 +21,11 @@ Functions .. autofunction:: fft .. autofunction:: ifft +.. autofunction:: fftn +.. autofunction:: ifftn .. autofunction:: rfft .. autofunction:: irfft +.. autofunction:: rfftn +.. autofunction:: irfftn .. autofunction:: hfft .. autofunction:: ihfft diff --git a/docs/source/jit_language_reference.rst b/docs/source/jit_language_reference.rst index 4cca46fdc005..205195f59f6b 100644 --- a/docs/source/jit_language_reference.rst +++ b/docs/source/jit_language_reference.rst @@ -72,6 +72,7 @@ net models. In particular, TorchScript supports: "``Optional[T]``", "A value which is either None or type ``T``" "``Dict[K, V]``", "A dict with key type ``K`` and value type ``V``. Only ``str``, ``int``, and ``float`` are allowed as key types." "``T``", "A `TorchScript Class`_" + "``E``", "A `TorchScript Enum`_" "``NamedTuple[T0, T1, ...]``", "A :func:`collections.namedtuple ` tuple type" Unlike Python, each variable in TorchScript function must have a single static type. @@ -271,6 +272,7 @@ Example (refining types on parameters and locals): module = torch.jit.script(M(2)) module = torch.jit.script(M(None)) + .. _TorchScript Class: .. _TorchScript Classes: .. _torchscript-classes: @@ -346,6 +348,37 @@ like any other TorchScript type: print(sum_pair(p)) +.. _TorchScript Enum: +.. _TorchScript Enums: +.. _torchscript-enums: + +TorchScript Enums +^^^^^^^^^^^^^^^^^^^ + +Python enums can be used in TorchScript without any extra annotation or code: + +:: + + from enum import Enum + + + class Color(Enum): + RED = 1 + GREEN = 2 + + @torch.jit.script + def enum_fn(x: Color, y: Color) -> bool: + if x == Color.RED: + return True + + return x == y + +After an enum is defined, it can be used in both TorchScript and Python interchangeably +like any other TorchScript type. The type of the values of an enum must be ``int``, +``float``, or ``str``. All values must be of the same type; heterogenous types for enum +values are not supported. + + Named Tuples ^^^^^^^^^^^^ Types produced by :func:`collections.namedtuple ` can be used in TorchScript. diff --git a/docs/source/jit_unsupported.rst b/docs/source/jit_unsupported.rst index 8bf3e78d672a..7368abad1e30 100644 --- a/docs/source/jit_unsupported.rst +++ b/docs/source/jit_unsupported.rst @@ -87,6 +87,5 @@ we suggest using :meth:`torch.jit.trace`. 
* :class:`torch.nn.RNN` * :class:`torch.nn.AdaptiveLogSoftmaxWithLoss` * :class:`torch.autograd.Function` - * :class:`torch.autograd.no_grad` * :class:`torch.autograd.enable_grad` * :class:`torch.Generator` diff --git a/docs/source/name_inference.rst b/docs/source/name_inference.rst index 7fc84e092633..ccbb8c0c54d3 100644 --- a/docs/source/name_inference.rst +++ b/docs/source/name_inference.rst @@ -197,6 +197,8 @@ If you don't see an operation listed here, but it would help your use case, plea :meth:`Tensor.sigmoid_`,None ":meth:`Tensor.sign`, :func:`torch.sign`",:ref:`keeps_input_names-doc` :meth:`Tensor.sign_`,None + ":meth:`Tensor.sgn`, :func:`torch.sgn`",:ref:`keeps_input_names-doc` + :meth:`Tensor.sgn_`,None ":meth:`Tensor.sin`, :func:`torch.sin`",:ref:`keeps_input_names-doc` :meth:`Tensor.sin_`,None ":meth:`Tensor.sinh`, :func:`torch.sinh`",:ref:`keeps_input_names-doc` diff --git a/docs/source/nn.functional.rst b/docs/source/nn.functional.rst index eb88b50e6d56..416121cec8d6 100644 --- a/docs/source/nn.functional.rst +++ b/docs/source/nn.functional.rst @@ -483,6 +483,11 @@ Loss functions .. autofunction:: triplet_margin_loss +:hidden:`triplet_margin_with_distance_loss` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autofunction:: triplet_margin_with_distance_loss + Vision functions ---------------- @@ -533,5 +538,3 @@ DataParallel functions (multi-GPU, distributed) ~~~~~~~~~~~~~~~~~~~~~~~ .. autofunction:: torch.nn.parallel.data_parallel - - diff --git a/docs/source/nn.rst b/docs/source/nn.rst index 3a6cb7e19316..8d195c04037c 100644 --- a/docs/source/nn.rst +++ b/docs/source/nn.rst @@ -10,7 +10,7 @@ These are the basic building block for graphs :depth: 2 :local: :backlinks: top - + .. currentmodule:: torch.nn @@ -269,6 +269,7 @@ Loss Functions nn.CosineEmbeddingLoss nn.MultiMarginLoss nn.TripletMarginLoss + nn.TripletMarginWithDistanceLoss Vision Layers ---------------- diff --git a/docs/source/notes/cuda.rst b/docs/source/notes/cuda.rst index 230426be8695..a34b0d7231fb 100644 --- a/docs/source/notes/cuda.rst +++ b/docs/source/notes/cuda.rst @@ -214,6 +214,10 @@ complete snapshot of the memory allocator state via :meth:`~torch.cuda.memory_snapshot`, which can help you understand the underlying allocation patterns produced by your code. +Use of a caching allocator can interfere with memory checking tools such as +``cuda-memcheck``. To debug memory errors using ``cuda-memcheck``, set +``PYTORCH_NO_CUDA_MEMORY_CACHING=1`` in your environment to disable caching. + .. _cufft-plan-cache: cuFFT plan cache diff --git a/docs/source/onnx.rst b/docs/source/onnx.rst index ea45a2d7070a..3c07486b0e89 100644 --- a/docs/source/onnx.rst +++ b/docs/source/onnx.rst @@ -231,6 +231,25 @@ The dynamic control flow is captured correctly. We can verify in backends with d # [37, 37, 37]], dtype=int64)] +To avoid exporting a variable scalar tensor as a fixed value constant as part of the ONNX model, please +avoid use of ``torch.Tensor.item()``. Torch supports implicit cast of single-element tensors to numbers. 
+E.g.: :: + + class LoopModel(torch.nn.Module): + def forward(self, x, y): + res = [] + arr = x.split(2, 0) + for i in range(int(y)): + res += [arr[i].sum(0, False)] + return torch.stack(res) + + model = torch.jit.script(LoopModel()) + inputs = (torch.randn(16), torch.tensor(8)) + + out = model(*inputs) + torch.onnx.export(model, inputs, 'loop_and_list.onnx', opset_version=11, example_outputs=out) + + TorchVision support ------------------- @@ -262,6 +281,7 @@ The following operators are supported: * Conv * Dropout * Embedding (no optional arguments supported) +* EmbeddingBag * FeatureDropout (training mode not supported) * Index * MaxPool1d @@ -289,6 +309,7 @@ The following operators are supported: * avg_pool2d * avg_pool2d * avg_pool3d +* as_strided * baddbmm * bitshift * cat @@ -314,6 +335,7 @@ The following operators are supported: * exp * expand * expand_as +* eye * flatten * floor * floor_divide @@ -335,9 +357,11 @@ The following operators are supported: * instance_norm * interpolate * isnan +* KLDivLoss * layer_norm * le * leaky_relu +* len * log * log1p * log2 @@ -358,6 +382,9 @@ The following operators are supported: * narrow * ne * neg +* new_empty +* new_full +* new_zeros * nll_loss * nonzero * norm @@ -811,7 +838,10 @@ Q: Is tensor list exportable to ONNX? Yes, this is supported now for ONNX opset version >= 11. ONNX introduced the concept of Sequence in opset 11. Similar to list, Sequence is a data type that contains arbitrary number of Tensors. - Associated operators are also introduced in ONNX, such as SequenceInsert, SequenceAt, etc. E.g.: :: + Associated operators are also introduced in ONNX, such as SequenceInsert, SequenceAt, etc. + However, in-place list append within loops is not exportable to ONNX. To implement this, please use inplace + add operator. + E.g.: :: class ListLoopModel(torch.nn.Module): def forward(self, x): @@ -820,8 +850,8 @@ Q: Is tensor list exportable to ONNX? arr = x.split(2, 0) res2 = torch.zeros(3, 4, dtype=torch.long) for i in range(len(arr)): - res = res.append(arr[i].sum(0, False)) - res1 = res1.append(arr[-1 - i].sum(0, False)) + res += [arr[i].sum(0, False)] + res1 += [arr[-1 - i].sum(0, False)] res2 += 1 return torch.stack(res), torch.stack(res1), res2 diff --git a/docs/source/quantization.rst b/docs/source/quantization.rst index b597fa9f51f3..b78ed2c08586 100644 --- a/docs/source/quantization.rst +++ b/docs/source/quantization.rst @@ -77,6 +77,261 @@ The corresponding implementation is chosen automatically based on the PyTorch bu ``torch.backends.quantized.engine = 'qnnpack'`` +Quantization API Summary +--------------------------------------- + +There are three types of quantization supported in PyTorch: + +1. dynamic quantization (weights quantized with activations read/stored in + floating point and quantized for compute.) +2. static quantization (weights quantized, activations quantized, calibration + required post training) +3. quantization aware training (weights quantized, activations quantized, + quantization numerics modeled during training) + +Please see our `Introduction to Quantization on Pytorch +`_ blog post +for a more comprehensive overview of the tradeoffs between these quantization +types. + +Dynamic Quantization +^^^^^^^^^^^^^^^^^^^^ + +This is the simplest to apply form of quantization where the weights are +quantized ahead of time but the activations are dynamically quantized +during inference. 
This is used for situations where the model execution time +is dominated by loading weights from memory rather than computing the matrix +multiplications. This is true for for LSTM and Transformer type models with +small batch size. + +Diagram:: + + # original model + # all tensors and computations are in floating point + previous_layer_fp32 -- linear_fp32 -- activation_fp32 -- next_layer_fp32 + / + linear_weight_fp32 + + # dynamically quantized model + # linear and conv weights are in int8 + previous_layer_fp32 -- linear_int8_w_fp32_inp -- activation_fp32 -- next_layer_fp32 + / + linear_weight_int8 + +API example:: + + import torch + + # define a floating point model + class M(torch.nn.Module): + def __init__(self): + super(M, self).__init__() + self.fc = torch.nn.Linear(4, 4) + + def forward(self, x): + x = self.fc(x) + return x + + # create a model instance + model_fp32 = M() + # create a quantized model instance + model_int8 = torch.quantization.quantize_dynamic( + model_fp32, # the original model + {torch.nn.Linear}, # a set of layers to dynamically quantize + dtype=torch.qint8) # the target dtype for quantized weights + + # run the model + input_fp32 = torch.randn(4, 4, 4, 4) + res = model_int8(input_fp32) + +To learn more about dynamic quantization please see our `dynamic quantization tutorial +`_. + +Static Quantization +^^^^^^^^^^^^^^^^^^^^ + +Static quantization quantizes the weights and activations of the model. It +fuses activations into preceding layers where possible. It requires +calibration with a representative dataset to determine optimal quantization +parameters for activations. Post Training Quantization is typically used when +both memory bandwidth and compute savings are important with CNNs being a +typical use case. Static quantization is also known as Post Training +Quantization or PTQ. + +Diagram:: + + # original model + # all tensors and computations are in floating point + previous_layer_fp32 -- linear_fp32 -- activation_fp32 -- next_layer_fp32 + / + linear_weight_fp32 + + # statically quantized model + # weights and activations are in int8 + previous_layer_int8 -- linear_with_activation_int8 -- next_layer_int8 + / + linear_weight_int8 + +API Example:: + + import torch + + # define a floating point model where some layers could be statically quantized + class M(torch.nn.Module): + def __init__(self): + super(M, self).__init__() + # QuantStub converts tensors from floating point to quantized + self.quant = torch.quantization.QuantStub() + self.conv = torch.nn.Conv2d(1, 1, 1) + self.relu = torch.nn.ReLU() + # DeQuantStub converts tensors from quantized to floating point + self.dequant = torch.quantization.DeQuantStub() + + def forward(self, x): + # manually specify where tensors will be converted from floating + # point to quantized in the quantized model + x = self.quant(x) + x = self.conv(x) + x = self.relu(x) + # manually specify where tensors will be converted from quantized + # to floating point in the quantized model + x = self.dequant(x) + return x + + # create a model instance + model_fp32 = M() + + # model must be set to eval mode for static quantization logic to work + model_fp32.eval() + + # attach a global qconfig, which contains information about what kind + # of observers to attach. Use 'fbgemm' for server inference and + # 'qnnpack' for mobile inference. Other quantization configurations such + # as selecting symmetric or assymetric quantization and MinMax or L2Norm + # calibration techniques can be specified here. 
+ model_fp32.qconfig = torch.quantization.get_default_qconfig('fbgemm') + + # Fuse the activations to preceding layers, where applicable. + # This needs to be done manually depending on the model architecture. + # Common fusions include `conv + relu` and `conv + batchnorm + relu` + model_fp32_fused = torch.quantization.fuse_modules(model_fp32, [['conv', 'relu']]) + + # Prepare the model for static quantization. This inserts observers in + # the model that will observe activation tensors during calibration. + model_fp32_prepared = torch.quantization.prepare(model_fp32_fused) + + # calibrate the prepared model to determine quantization parameters for activations + # in a real world setting, the calibration would be done with a representative dataset + input_fp32 = torch.randn(4, 1, 4, 4) + model_fp32_prepared(input_fp32) + + # Convert the observed model to a quantized model. This does several things: + # quantizes the weights, computes and stores the scale and bias value to be + # used with each activation tensor, and replaces key operators with quantized + # implementations. + model_int8 = torch.quantization.convert(model_fp32_prepared) + + # run the model, relevant calculations will happen in int8 + res = model_int8(input_fp32) + +To learn more about static quantization, please see the `static quantization tutorial +`_. + +Quantization Aware Training +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Quantization Aware Training models the effects of quantization during training +allowing for higher accuracy compared to other quantization methods. During +training, all calculations are done in floating point, with fake_quant modules +modeling the effects of quantization by clamping and rounding to simulate the +effects of INT8. After model conversion, weights and +activations are quantized, and activations are fused into the preceding layer +where possible. It is commonly used with CNNs and yields a higher accuracy +compared to static quantization. Quantization Aware Training is also known as +QAT. + +Diagram:: + + # original model + # all tensors and computations are in floating point + previous_layer_fp32 -- linear_fp32 -- activation_fp32 -- next_layer_fp32 + / + linear_weight_fp32 + + # model with fake_quants for modeling quantization numerics during training + previous_layer_fp32 -- fq -- linear_fp32 -- activation_fp32 -- fq -- next_layer_fp32 + / + linear_weight_fp32 -- fq + + # quantized model + # weights and activations are in int8 + previous_layer_int8 -- linear_with_activation_int8 -- next_layer_int8 + / + linear_weight_int8 + +API Example:: + + import torch + + # define a floating point model where some layers could benefit from QAT + class M(torch.nn.Module): + def __init__(self): + super(M, self).__init__() + # QuantStub converts tensors from floating point to quantized + self.quant = torch.quantization.QuantStub() + self.conv = torch.nn.Conv2d(1, 1, 1) + self.bn = torch.nn.BatchNorm2d(1) + self.relu = torch.nn.ReLU() + # DeQuantStub converts tensors from quantized to floating point + self.dequant = torch.quantization.DeQuantStub() + + def forward(self, x): + x = self.quant(x) + x = self.conv(x) + x = self.bn(x) + x = self.relu(x) + x = self.dequant(x) + return x + + # create a model instance + model_fp32 = M() + + # model must be set to train mode for QAT logic to work + model_fp32.train() + + # attach a global qconfig, which contains information about what kind + # of observers to attach. Use 'fbgemm' for server inference and + # 'qnnpack' for mobile inference. 
Other quantization configurations such + # as selecting symmetric or assymetric quantization and MinMax or L2Norm + # calibration techniques can be specified here. + model_fp32.qconfig = torch.quantization.get_default_qat_qconfig('fbgemm') + + # fuse the activations to preceding layers, where applicable + # this needs to be done manually depending on the model architecture + model_fp32_fused = torch.quantization.fuse_modules(model_fp32, + [['conv', 'bn', 'relu']]) + + # Prepare the model for QAT. This inserts observers and fake_quants in + # the model that will observe weight and activation tensors during calibration. + model_fp32_prepared = torch.quantization.prepare_qat(model_fp32_fused) + + # run the training loop (not shown) + training_loop(model_fp32_prepared) + + # Convert the observed model to a quantized model. This does several things: + # quantizes the weights, computes and stores the scale and bias value to be + # used with each activation tensor, fuses modules where appropriate, + # and replaces key operators with quantized implementations. + model_fp32_prepared.eval() + model_int8 = torch.quantization.convert(model_fp32_prepared) + + # run the model, relevant calculations will happen in int8 + res = model_int8(input_fp32) + +To learn more about quantization aware training, please see the `QAT +tutorial +`_. + Quantized Tensors --------------------------------------- @@ -121,79 +376,8 @@ cover typical CNN and RNN models torch.nn.quantized torch.nn.quantized.dynamic -Quantization Workflows ----------------------- - -PyTorch provides three approaches to quantize models. - -.. _quantization tutorials: - https://pytorch.org/tutorials/#quantization-experimental - -1. Post Training Dynamic Quantization: This is the simplest to apply form of - quantization where the weights are quantized ahead of time but the - activations are dynamically quantized during inference. This is used - for situations where the model execution time is dominated by loading - weights from memory rather than computing the matrix multiplications. - This is true for for LSTM and Transformer type models with small - batch size. Applying dynamic quantization to a whole model can be - done with a single call to :func:`torch.quantization.quantize_dynamic()`. - See the `quantization tutorials`_ -2. Post Training Static Quantization: This is the most commonly used form of - quantization where the weights are quantized ahead of time and the - scale factor and bias for the activation tensors is pre-computed - based on observing the behavior of the model during a calibration - process. Post Training Quantization is typically when both memory bandwidth - and compute savings are important with CNNs being a typical use case. - The general process for doing post training quantization is: - - - - 1. Prepare the model: - - a. Specify where the activations are quantized and dequantized explicitly - by adding QuantStub and DeQuantStub modules. - b. Ensure that modules are not reused. - c. Convert any operations that require requantization into modules - - 2. Fuse operations like conv + relu or conv+batchnorm + relu together to - improve both model accuracy and performance. - - 3. Specify the configuration of the quantization methods \'97 such as - selecting symmetric or asymmetric quantization and MinMax or - L2Norm calibration techniques. - 4. Use the :func:`torch.quantization.prepare` to insert modules - that will observe activation tensors during calibration - 5. 
Calibrate the model by running inference against a calibration - dataset - 6. Finally, convert the model itself with the - torch.quantization.convert() method. This does several things: it - quantizes the weights, computes and stores the scale and bias - value to be used each activation tensor, and replaces key - operators quantized implementations. - - See the `quantization tutorials`_ - - -3. Quantization Aware Training: In the rare cases where post training - quantization does not provide adequate accuracy training can be done - with simulated quantization using the - :class:`torch.quantization.FakeQuantize`. Computations will take place in - FP32 but with values clamped and rounded to simulate the effects of INT8 - quantization. The sequence of steps is very similar. - - - 1. Steps (1) and (2) are identical. - - 3. Specify the configuration of the fake quantization methods \'97 such as - selecting symmetric or asymmetric quantization and MinMax or Moving Average - or L2Norm calibration techniques. - 4. Use the :func:`torch.quantization.prepare_qat` to insert modules - that will simulate quantization during training. - 5. Train or fine tune the model. - 6. Identical to step (6) for post training quantization - - See the `quantization tutorials`_ - +Quantization Customizations +--------------------------- While default implementations of observers to select the scale factor and bias based on observed tensor data are provided, developers can provide their own @@ -218,9 +402,15 @@ prior to quantization. This is because currently quantization works on a module by module basis. Specifically, for all quantization techniques, the user needs to: 1. Convert any operations that require output requantization (and thus have - additional parameters) from functionals to module form. + additional parameters) from functionals to module form (for example, + using ``torch.nn.ReLU`` instead of ``torch.nn.functional.relu``). 2. Specify which parts of the model need to be quantized either by assigning - ```.qconfig`` attributes on submodules or by specifying ``qconfig_dict`` + ``.qconfig`` attributes on submodules or by specifying ``qconfig_dict``. + For example, setting ``model.conv1.qconfig = None`` means that the + ``model.conv`` layer will not be quantized, and setting + ``model.linear1.qconfig = custom_qconfig`` means that the quantization + settings for ``model.linear1`` will be using ``custom_qconfig`` instead + of the global qconfig. For static quantization techniques which quantize activations, the user needs to do the following in addition: @@ -238,6 +428,13 @@ to do the following in addition: to be fused. We currently support the following fusions: [Conv, Relu], [Conv, BatchNorm], [Conv, BatchNorm, Relu], [Linear, Relu] +Best Practices +-------------- + +1. Set the ``reduce_range`` argument on observers to `True` if you are using the + ``fbgemm`` backend. This argument prevents overflow on some int8 instructions + by reducing the range of quantized data type by 1 bit. + Modules that provide quantization functions and classes ------------------------------------------------------- diff --git a/docs/source/rpc.rst b/docs/source/rpc.rst index 37adc14faae1..1d786710d15c 100644 --- a/docs/source/rpc.rst +++ b/docs/source/rpc.rst @@ -113,8 +113,6 @@ and move it to the desired devices on the callee if necessary. The RPC package also provides decorators which allow applications to specify how a given function should be treated on the callee side. -.. 
warning:: - The ``rpc.functions`` package is a prototype feature and subject to change. .. autofunction:: torch.distributed.rpc.functions.async_execution @@ -142,9 +140,6 @@ to configure the backend's behavior. TensorPipe Backend """""""""""""""""" -.. warning:: - The TensorPipe backend is a **beta feature**. - The TensorPipe agent, which is the default, leverages `the TensorPipe library `_, which provides a natively point-to-point communication primitive specifically suited for machine learning @@ -192,6 +187,10 @@ Example:: Process Group Backend """"""""""""""""""""" +.. warning :: + The Process Group Backend will be deprecated soon, we recommend using the + TensorPipe Backend instead. + The Process Group agent instantiates a process group from the :mod:`~torch.distributed` module and utilizes its point-to-point communication capabilities to send RPC messages. Internally, the process @@ -293,8 +292,13 @@ The RRef design note covers the design of the :ref:`rref` (Remote REFerence) pro Tutorials --------- -The RPC tutorial introduces users to the RPC framework and provides two example applications using :ref:`torch.distributed.rpc` APIs. +The RPC tutorials introduce users to the RPC framework, provide several example applications +using :ref:`torch.distributed.rpc` APIs, and demonstrate how +to use `the profiler `__ to profile RPC-based workloads. - `Getting started with Distributed RPC Framework `__ - `Implementing a Parameter Server using Distributed RPC Framework `__ - `Combining Distributed DataParallel with Distributed RPC Framework `__ +- `Profiling RPC-based Workloads `__ +- `Implementing batch RPC processing `__ +- `Distributed Pipeline Parallel `__ diff --git a/docs/source/tensors.rst b/docs/source/tensors.rst index d7a94711e76b..94b1fb25f58e 100644 --- a/docs/source/tensors.rst +++ b/docs/source/tensors.rst @@ -8,7 +8,7 @@ torch.Tensor A :class:`torch.Tensor` is a multi-dimensional matrix containing elements of a single data type. -Torch defines 10 tensor types with CPU and GPU variants: +Torch defines 10 tensor types with CPU and GPU variants which are as follows: ========================== =========================================== ============================= ================================ Data type dtype CPU tensor GPU tensor @@ -32,7 +32,7 @@ Boolean ``torch.bool`` :class: Sometimes referred to as binary16: uses 1 sign, 5 exponent, and 10 significand bits. Useful when precision is important at the expense of range. .. [2] - Sometimes referred to as Brain Floating Point: use 1 sign, 8 exponent and 7 + Sometimes referred to as Brain Floating Point: uses 1 sign, 8 exponent, and 7 significand bits. Useful when range is important, since it has the same number of exponent bits as ``float32`` @@ -453,6 +453,8 @@ view of a storage and defines numeric operations on it. .. automethod:: narrow .. automethod:: narrow_copy .. automethod:: ndimension + .. automethod:: nan_to_num + .. automethod:: nan_to_num_ .. automethod:: ne .. automethod:: ne_ .. automethod:: not_equal @@ -532,6 +534,8 @@ view of a storage and defines numeric operations on it. .. automethod:: sign .. automethod:: sign_ .. automethod:: signbit + .. automethod:: sgn + .. automethod:: sgn_ .. automethod:: sin .. automethod:: sin_ .. 
automethod:: sinh diff --git a/docs/source/torch.rst b/docs/source/torch.rst index beab6c449df1..d0537947d4ff 100644 --- a/docs/source/torch.rst +++ b/docs/source/torch.rst @@ -312,6 +312,7 @@ Pointwise Ops mul multiply mvlgamma + nan_to_num neg negative nextafter @@ -536,3 +537,4 @@ Utilities set_deterministic is_deterministic vmap + Assert diff --git a/ios/LibTorch.podspec b/ios/LibTorch.podspec index 17e9fb26afa1..f74e2dc9f37e 100644 --- a/ios/LibTorch.podspec +++ b/ios/LibTorch.podspec @@ -1,6 +1,6 @@ Pod::Spec.new do |s| s.name = 'LibTorch' - s.version = '1.6.0' + s.version = '1.6.1' s.authors = 'PyTorch Team' s.license = { :type => 'BSD' } s.homepage = 'https://github.com/pytorch/pytorch' diff --git a/mypy.ini b/mypy.ini index a7d4acea9571..ea7bdb1a83ed 100644 --- a/mypy.ini +++ b/mypy.ini @@ -53,45 +53,24 @@ ignore_errors = True [mypy-torch.distributed.*] ignore_errors = True -[mypy-torch.testing._internal.codegen.*] -ignore_errors = True - -[mypy-torch.testing._internal.autocast_test_lists.*] -ignore_errors = True - [mypy-torch.testing._internal.hypothesis_utils.*] ignore_errors = True -[mypy-torch.testing._internal.common_methods_invocations.*] -ignore_errors = True - [mypy-torch.testing._internal.common_nn.*] ignore_errors = True [mypy-torch.testing._internal.common_quantization.*] ignore_errors = True -[mypy-torch.testing._internal.common_utils.*] -ignore_errors = True - [mypy-torch.testing._internal.generated.*] ignore_errors = True [mypy-torch.testing._internal.distributed.*] ignore_errors = True -[mypy-torch.quantization.observer] -ignore_errors = True - [mypy-torch.quantization.stubs] ignore_errors = True -[mypy-torch.quantization.fake_quantize] -ignore_errors = True - -[mypy-torch.quantization.quantize_jit] -ignore_errors = True - [mypy-torch.quantization._numeric_suite] ignore_errors = True @@ -102,15 +81,9 @@ ignore_errors = True [mypy-torch.quantization.fx.*] ignore_errors = True -[mypy-torch.quasirandom] -ignore_errors = True - [mypy-torch.distributions.*] ignore_errors = True -[mypy-torch.tensor] -ignore_errors = True - [mypy-torch._tensor_str] ignore_errors = True @@ -159,21 +132,6 @@ ignore_errors = True [mypy-torch.nn.parallel.comm] ignore_errors = True -[mypy-torch.nn.quantized.functional] -ignore_errors = True - -[mypy-torch.nn.quantized.modules] -ignore_errors = True - -[mypy-torch.nn.quantized.modules.activation] -ignore_errors = True - -[mypy-torch.nn.quantized.modules.normalization] -ignore_errors = True - -[mypy-torch.nn.quantized.modules.utils] -ignore_errors = True - [mypy-torch.nn.qat.modules.activations] ignore_errors = True @@ -186,21 +144,9 @@ ignore_errors = True [mypy-torch.nn.quantized.modules.conv] ignore_errors = True -[mypy-torch.nn.quantized.modules.functional_modules] -ignore_errors = True - [mypy-torch.cuda] ignore_errors = True -[mypy-torch.cuda.amp.*] -ignore_errors = True - -[mypy-torch.cuda.comm] -ignore_errors = True - -[mypy-torch.cuda.nccl] -ignore_errors = True - [mypy-torch._lobpcg] ignore_errors = True @@ -222,12 +168,6 @@ ignore_errors = True [mypy-torch.contrib._tensorboard_vis] ignore_errors = True -[mypy-torch.utils.data._utils.worker] -ignore_errors = True - -[mypy-torch.utils.data.distributed] -ignore_errors = True - [mypy-torch.nn.utils.prune] ignore_errors = True diff --git a/scripts/get_python_cmake_flags.py b/scripts/get_python_cmake_flags.py index 0fac6d20d4d4..9121c5ebf0db 100644 --- a/scripts/get_python_cmake_flags.py +++ b/scripts/get_python_cmake_flags.py @@ -12,9 +12,9 @@ # make # -from __future__ import 
absolute_import -from __future__ import unicode_literals -from __future__ import print_function + + + from distutils import sysconfig import sys diff --git a/scripts/onnx/test.sh b/scripts/onnx/test.sh index 77f9c8b9f16e..8b6fc6c4cf63 100755 --- a/scripts/onnx/test.sh +++ b/scripts/onnx/test.sh @@ -70,4 +70,6 @@ if [[ "$BUILD_ENVIRONMENT" == *ort_test2* ]]; then pytest "${args[@]}" \ "$top_dir/test/onnx/test_pytorch_onnx_onnxruntime.py::TestONNXRuntime_opset$i" done + pytest "${args[@]}" \ + "$top_dir/test/onnx/test_pytorch_onnx_onnxruntime.py::TestONNXRuntime_opset12_onnx_shape_inference" fi diff --git a/scripts/xcode_build.rb b/scripts/xcode_build.rb index 801ad34a64fd..810c23352fdd 100644 --- a/scripts/xcode_build.rb +++ b/scripts/xcode_build.rb @@ -62,10 +62,13 @@ project.save sdk = nil +arch = nil if options[:platform] == 'SIMULATOR' sdk = 'iphonesimulator' + arch = 'x86_64' elsif options[:platform] == 'OS' sdk = 'iphoneos' + arch = 'arm64' else raise "unsupported platform #{options[:platform]}" end @@ -76,4 +79,5 @@ end # run xcodebuild -exec "xcodebuild clean build -project #{xcodeproj_path} -target #{target.name} -sdk #{sdk} -configuration Release PROVISIONING_PROFILE_SPECIFIER=#{profile}" +exec "xcodebuild clean build -project #{xcodeproj_path} -target #{target.name} -sdk #{sdk} -configuration Release PROVISIONING_PROFILE_SPECIFIER=#{profile} -arch #{arch}" + diff --git a/setup.py b/setup.py index 2a2f911e0d3d..c29ee929b8ca 100644 --- a/setup.py +++ b/setup.py @@ -61,6 +61,9 @@ # BUILD_CAFFE2_OPS=0 # disable Caffe2 operators build # +# BUILD_CAFFE2=0 +# disable Caffe2 build +# # USE_IBVERBS # toggle features related to distributed support # @@ -162,7 +165,7 @@ # When turned on, the following cmake variables will be toggled as well: # USE_SYSTEM_CPUINFO=ON USE_SYSTEM_SLEEF=ON BUILD_CUSTOM_PROTOBUF=OFF -from __future__ import print_function + import sys if sys.version_info < (3,): print("Python 2 has reached end-of-life and is no longer supported by PyTorch.") @@ -340,7 +343,11 @@ def check_file(f): ################################################################################ # the list of runtime dependencies required by this built package -install_requires = ['future', 'typing_extensions', 'dataclasses'] +install_requires = [ + 'future', + 'typing_extensions', + 'dataclasses; python_version < "3.7"' +] missing_pydep = ''' Missing build dependency: Unable to `import {importname}`. @@ -776,6 +783,10 @@ def print_box(msg): 'include/ATen/detail/*.h', 'include/ATen/native/*.h', 'include/ATen/native/cpu/*.h', + 'include/ATen/native/cuda/*.h', + 'include/ATen/native/cuda/*.cuh', + 'include/ATen/native/hip/*.h', + 'include/ATen/native/hip/*.cuh', 'include/ATen/native/quantized/*.h', 'include/ATen/native/quantized/cpu/*.h', 'include/ATen/quantized/*.h', diff --git a/test/backward_compatibility/check_backward_compatibility.py b/test/backward_compatibility/check_backward_compatibility.py index d5cbe5a884a9..a2f843d78f72 100644 --- a/test/backward_compatibility/check_backward_compatibility.py +++ b/test/backward_compatibility/check_backward_compatibility.py @@ -1,4 +1,3 @@ - import argparse import datetime import re @@ -28,6 +27,8 @@ # NB: function name DOES NOT include overload name! 
allow_list = [ ("c10_experimental", datetime.date(2222, 1, 1)), + # Internal + ("static", datetime.date(9999, 1, 1)), # Internal, profiler-specific ops ("profiler::_call_end_callbacks_on_jit_fut*", datetime.date(9999, 1, 1)), ("profiler::_record_function_enter", datetime.date(9999, 1, 1)), @@ -58,16 +59,16 @@ ("aten::atan2", datetime.date(2020, 7, 30)), ("aten::copy_", datetime.date(2020, 7, 30)), ("aten::sort", datetime.date(2020, 7, 30)), - ('aten::_convolution', datetime.date(2020, 10, 15)), - ('aten::cudnn_convolution', datetime.date(2020, 10, 15)), - ('aten::cudnn_convolution_transpose', datetime.date(2020, 10, 15)), - ('aten::_convolution_double_backward', datetime.date(2020, 10, 15)), - ('aten::cudnn_convolution_backward_input', datetime.date(2020, 10, 15)), - ('aten::cudnn_convolution_backward', datetime.date(2020, 10, 15)), - ('aten::cudnn_convolution_backward_weight', datetime.date(2020, 10, 15)), - ('aten::cudnn_convolution_transpose_backward', datetime.date(2020, 10, 15)), - ('aten::cudnn_convolution_transpose_backward_input', datetime.date(2020, 10, 15)), - ('aten::cudnn_convolution_transpose_backward_weight', datetime.date(2020, 10, 15)), + ("aten::_convolution", datetime.date(2020, 10, 15)), + ("aten::cudnn_convolution", datetime.date(2020, 10, 15)), + ("aten::cudnn_convolution_transpose", datetime.date(2020, 10, 15)), + ("aten::_convolution_double_backward", datetime.date(2020, 10, 15)), + ("aten::cudnn_convolution_backward_input", datetime.date(2020, 10, 15)), + ("aten::cudnn_convolution_backward", datetime.date(2020, 10, 15)), + ("aten::cudnn_convolution_backward_weight", datetime.date(2020, 10, 15)), + ("aten::cudnn_convolution_transpose_backward", datetime.date(2020, 10, 15)), + ("aten::cudnn_convolution_transpose_backward_input", datetime.date(2020, 10, 15)), + ("aten::cudnn_convolution_transpose_backward_weight", datetime.date(2020, 10, 15)), ("aten::_cudnn_init_dropout_state", datetime.date(2020, 7, 30)), ("aten::sparse_coo_tensor", datetime.date(2020, 7, 30)), ("aten::_sparse_coo_tensor_with_dims", datetime.date(2020, 7, 30)), @@ -90,6 +91,7 @@ ("aten::logspace", datetime.date(2020, 9, 30)), ("aten::logspace.out", datetime.date(2020, 9, 30)), ("__getstate__", datetime.date(2020, 9, 11), "Conv[23]dPackedParams"), + ("_caffe2::LearningRate", datetime.date(2020, 10, 1)), ("aten::_var", datetime.date(2020, 10, 1)), ("aten::_std", datetime.date(2020, 10, 1)), ("aten::_foreach_add_", datetime.date(2020, 10, 1)), @@ -99,6 +101,16 @@ ("preprocess", datetime.date(2020, 10, 1)), ("compile", datetime.date(2020, 10, 1)), ("execute", datetime.date(2020, 10, 1)), + ("aten::_addr", datetime.date(2020, 10, 31)), + ("aten::_addr_", datetime.date(2020, 10, 31)), + ("aten::_addr.out", datetime.date(2020, 10, 31)), + ("aten::_foreach_add", datetime.date(2020, 10, 1)), + ("aten::_foreach_sub_", datetime.date(2020, 10, 1)), + ("aten::_foreach_div", datetime.date(2020, 10, 1)), + ("aten::_foreach_sub", datetime.date(2020, 10, 1)), + ("aten::_amp_non_finite_check_and_unscale_", datetime.date(9999, 1, 1)), + ("aten::choose_qparams_optimized", datetime.date(2020, 10, 5)), + ("aten::smooth_l1_loss_backward", datetime.date(2020, 10, 15)), ] @@ -115,6 +127,7 @@ def allow_listed(schema, allow_list): return True return False + # The nightly will fail to parse newly added syntax to schema declarations # Add new schemas that will fail the nightly here dont_parse_list = [ @@ -122,6 +135,7 @@ def allow_listed(schema, allow_list): ("test_backend", datetime.date(2099, 9, 17)), ] + def 
dont_parse(schema_line): for item in dont_parse_list: if item[1] < datetime.date.today(): diff --git a/test/cpp/api/functional.cpp b/test/cpp/api/functional.cpp index 4efdb122efc8..707c1bfd7ac0 100644 --- a/test/cpp/api/functional.cpp +++ b/test/cpp/api/functional.cpp @@ -246,6 +246,18 @@ TEST_F(FunctionalTest, SmoothL1LossDefaultOptions) { ASSERT_TRUE(input.sizes() == input.grad().sizes()); } +TEST_F(FunctionalTest, SmoothL1LossBeta) { + auto input = torch::tensor({0.1, 1.5, 10.0}, torch::dtype(torch::kFloat).requires_grad(true)); + auto target = torch::tensor({0., 1., 5.}, torch::kFloat); + auto output = + F::smooth_l1_loss(input, target, /*reduction=*/torch::kMean, /*beta=*/0.5); + auto expected = torch::tensor(1.67, torch::kFloat); + auto s = output.sum(); + s.backward(); + ASSERT_TRUE(output.allclose(expected)); + ASSERT_TRUE(input.sizes() == input.grad().sizes()); +} + TEST_F(FunctionalTest, SmoothL1LossNoReduction) { auto input = torch::tensor({0.1, 1.2, 4.7}, torch::dtype(torch::kFloat).requires_grad(true)); auto target = torch::tensor({0., 1., 5.}, torch::kFloat); @@ -670,6 +682,56 @@ TEST_F(FunctionalTest, TripletMarginLoss) { ASSERT_TRUE(output.allclose(expected, 1e-04)); } +TEST_F(FunctionalTest, TripletMarginWithDistanceLossDefaultParity) { + // Check that if we use torch::pairwise_distance with the default + // TripletMarginLoss options as our distance function, the outputs + // are equal (i.e., equal under defaults). + + std::vector + reductions = {torch::kSum, torch::kMean, torch::kNone}; + std::vector margins = {0.5, 1.0, 1.5}; + std::vector swaps = {true, false}; + + for (auto& reduction : reductions) { + for (auto& margin : margins) { + for (const auto& swap : swaps) { + auto anchor = + torch::randn({100, 128}, torch::dtype(torch::kFloat).requires_grad(true)); + auto positive = + torch::randn({100, 128}, torch::dtype(torch::kFloat).requires_grad(true)); + auto negative = + torch::randn({100, 128}, torch::dtype(torch::kFloat).requires_grad(true)); + + auto basicOptions = F::TripletMarginLossFuncOptions() + .reduction(reduction) + .margin(margin) + .swap(swap); + auto distanceOptions = + F::TripletMarginWithDistanceLossFuncOptions() + .reduction(reduction) + .margin(margin) + .swap(swap); + TripletMarginLoss basicLoss(basicOptions); + TripletMarginWithDistanceLoss distanceLoss(distanceOptions); + + auto basicOutput = + F::triplet_margin_loss(anchor, positive, negative, basicOptions); + auto distanceOutput = F::triplet_margin_with_distance_loss( + anchor, positive, negative, distanceOptions); + + ASSERT_TRUE(distanceOutput.allclose(basicOutput, 1e-6, 1e-6)); + + // handle for torch::kNone reduction + auto sum = distanceOutput.sum(); + sum.backward(); + ASSERT_EQ(anchor.sizes(), anchor.grad().sizes()); + ASSERT_EQ(positive.sizes(), positive.grad().sizes()); + ASSERT_EQ(negative.sizes(), negative.grad().sizes()); + } + } + } +} + TEST_F(FunctionalTest, NLLLoss) { auto input = torch::tensor({{-0.1315, -3.1315, -2.5315}, {-3.7038, -0.1038, -2.6038}, diff --git a/test/cpp/api/modules.cpp b/test/cpp/api/modules.cpp index 4777cf0b54bc..ef0fc2765551 100644 --- a/test/cpp/api/modules.cpp +++ b/test/cpp/api/modules.cpp @@ -2085,6 +2085,115 @@ TEST_F(ModulesTest, TripletMarginLoss) { ASSERT_EQ(anchor.sizes(), anchor.grad().sizes()); } +TEST_F(ModulesTest, TripletMarginWithDistanceLossDefaultParity) { + // Check that if we use torch::pairwise_distance with the default + // TripletMarginLoss options as our distance function, the outputs + // are equal (i.e., equal under defaults). 
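+  // With the default distance function, both losses compute
+  // max(d(a, p) - d(a, n) + margin, 0) (modulo the optional distance swap),
+  // where d is the L2 pairwise distance, so the module and the
+  // TripletMarginLoss baseline should agree for every reduction/margin/swap
+  // combination exercised below.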
+ + std::vector + reductions = {torch::kSum, torch::kMean, torch::kNone}; + std::vector margins = {0.5, 1.0, 1.5}; + std::vector swaps = {true, false}; + + for (auto& reduction : reductions) { + for (auto& margin : margins) { + for (const auto& swap : swaps) { + auto anchor = + torch::randn({100, 128}, torch::dtype(torch::kFloat).requires_grad(true)); + auto positive = + torch::randn({100, 128}, torch::dtype(torch::kFloat).requires_grad(true)); + auto negative = + torch::randn({100, 128}, torch::dtype(torch::kFloat).requires_grad(true)); + + auto basicOptions = TripletMarginLossOptions() + .reduction(reduction) + .margin(margin) + .swap(swap); + auto distanceOptions = + TripletMarginWithDistanceLossOptions() + .reduction(reduction) + .margin(margin) + .swap(swap); + TripletMarginLoss basicLoss(basicOptions); + TripletMarginWithDistanceLoss distanceLoss(distanceOptions); + + auto basicOutput = basicLoss->forward(anchor, positive, negative); + auto distanceOutput = distanceLoss->forward(anchor, positive, negative); + auto basicOperatorOutput = basicLoss(anchor, positive, negative); + auto distanceOperatorOutput = distanceLoss(anchor, positive, negative); + + ASSERT_TRUE(distanceOutput.allclose(basicOutput, 1e-6, 1e-6)); + ASSERT_TRUE(distanceOperatorOutput.allclose(distanceOutput, 1e-6, 1e-6)); + ASSERT_TRUE(distanceOperatorOutput.allclose(basicOperatorOutput, 1e-6, 1e-6)); + + // handle for torch::kNone reduction + auto sum = distanceOutput.sum(); + sum.backward(); + ASSERT_EQ(anchor.sizes(), anchor.grad().sizes()); + ASSERT_EQ(positive.sizes(), positive.grad().sizes()); + ASSERT_EQ(negative.sizes(), negative.grad().sizes()); + } + } + } +} + +TEST_F(ModulesTest, TripletMarginWithDistanceLossFunctionalParity) { + // Check for parity between F::triplet_margin_with_distance_loss and + // TripletMarginWithDistanceLoss. 
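+  // Parity is expected because the module and the functional are configured
+  // with identical options here; each distance function (L2 pairwise distance
+  // and 1 - cosine_similarity) is exercised with every reduction, margin,
+  // and swap combination.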
+ auto pairwise_distance = [&](const torch::Tensor& x, const torch::Tensor& y) { + return torch::pairwise_distance(x, y); + }; + auto cosine_distance = [&](const torch::Tensor& x, + const torch::Tensor& y) { + return 1.0 - torch::cosine_similarity(x, y); + }; + std::vector + distance_functions = {pairwise_distance, cosine_distance}; + + std::vector + reductions = {torch::kSum, torch::kMean, torch::kNone}; + std::vector margins = {0.5, 1.0, 1.5}; + std::vector swaps = {true, false}; + + for (auto& function : distance_functions) { + for (auto& reduction : reductions) { + for (auto& margin : margins) { + for (const auto& swap : swaps) { + auto moduleOptions = + TripletMarginWithDistanceLossOptions() + .distance_function(function) + .reduction(reduction) + .margin(margin) + .swap(swap); + auto functionOptions = + torch::nn::functional::TripletMarginWithDistanceLossFuncOptions() + .distance_function(function) + .reduction(reduction) + .margin(margin) + .swap(swap); + + auto anchor = torch::randn( + {100, 128}, torch::dtype(torch::kFloat).requires_grad(true)); + auto positive = torch::randn( + {100, 128}, torch::dtype(torch::kFloat).requires_grad(true)); + auto negative = torch::randn( + {100, 128}, torch::dtype(torch::kFloat).requires_grad(true)); + + TripletMarginWithDistanceLoss distanceLoss(moduleOptions); + + auto moduleOutput = distanceLoss->forward(anchor, positive, negative); + auto moduleOperatorOutput = distanceLoss(anchor, positive, negative); + auto functionOutput = torch::nn::functional::triplet_margin_with_distance_loss( + anchor, positive, negative, functionOptions); + + ASSERT_TRUE(moduleOutput.allclose(functionOutput, 1e-6, 1e-6)); + ASSERT_TRUE(moduleOperatorOutput.allclose(functionOutput, 1e-6, 1e-6)); + } + } + } + } +} + TEST_F(ModulesTest, NLLLoss) { NLLLoss loss; auto input = torch::tensor({{-0.1315, -3.1315, -2.5315}, @@ -3529,9 +3638,9 @@ TEST_F(ModulesTest, PrettyPrintIdentity) { } TEST_F(ModulesTest, PrettyPrintFlatten) { - ASSERT_EQ(c10::str(Flatten()), + ASSERT_EQ(c10::str(Flatten()), "torch::nn::Flatten(start_dim=1, end_dim=-1)"); - ASSERT_EQ(c10::str(Flatten(FlattenOptions().start_dim(2).end_dim(4))), + ASSERT_EQ(c10::str(Flatten(FlattenOptions().start_dim(2).end_dim(4))), "torch::nn::Flatten(start_dim=2, end_dim=4)"); } @@ -4394,6 +4503,20 @@ TEST_F(ModulesTest, PrettyPrintTripletMarginLoss) { "torch::nn::TripletMarginLoss(margin=3, p=2, eps=1e-06, swap=false)"); } +TEST_F(ModulesTest, PrettyPrintTripletMarginWithDistanceLoss) { + auto distanceOptions = TripletMarginWithDistanceLossOptions() + .distance_function([&](const torch::Tensor& x, + const torch::Tensor& y) { + return torch::pairwise_distance(x, y, 2.0, 1e-6); + }) + .margin(1.5) + .swap(true) + .reduction(torch::kMean); + ASSERT_EQ( + c10::str(TripletMarginWithDistanceLoss(distanceOptions)), + "torch::nn::TripletMarginWithDistanceLoss(margin=1.5, swap=true)"); +} + TEST_F(ModulesTest, PrettyPrintNLLLoss) { ASSERT_EQ( c10::str(NLLLoss()), "torch::nn::NLLLoss()"); diff --git a/test/cpp/dist_autograd/CMakeLists.txt b/test/cpp/dist_autograd/CMakeLists.txt index 5d23602881f0..9969c63e16d5 100644 --- a/test/cpp/dist_autograd/CMakeLists.txt +++ b/test/cpp/dist_autograd/CMakeLists.txt @@ -1,4 +1,4 @@ -if(USE_DISTRIBUTED) +if(USE_DISTRIBUTED AND NOT WIN32) set(DIST_AUTOGRAD_TEST_DIR "${TORCH_ROOT}/test/cpp/dist_autograd") set(DIST_AUTOGRAD_TEST_SOURCES ${TORCH_ROOT}/test/cpp/common/main.cpp diff --git a/test/cpp/jit/CMakeLists.txt b/test/cpp/jit/CMakeLists.txt index 84f7193ad8c0..2e22cd646813 100644 --- 
a/test/cpp/jit/CMakeLists.txt +++ b/test/cpp/jit/CMakeLists.txt @@ -2,7 +2,10 @@ set(JIT_TEST_ROOT ${TORCH_ROOT}/test/cpp/jit) # Build separate libraries the define custom classes/operators used from our Python tests. # These are intended to be used with torch.ops.load_library() in our Python test suite. -add_library(torchbind_test SHARED ${JIT_TEST_ROOT}/test_custom_class.cpp) +add_library(torchbind_test SHARED + ${JIT_TEST_ROOT}/test_custom_class_registrations.h + ${JIT_TEST_ROOT}/test_custom_class_registrations.cpp +) target_link_libraries(torchbind_test torch) add_library(jitbackend_test SHARED ${JIT_TEST_ROOT}/test_backend.cpp) @@ -16,12 +19,9 @@ endif() # Build the cpp gtest binary containing the cpp-only tests. set(JIT_TEST_SRCS - ${JIT_TEST_ROOT}/gtest.cpp ${JIT_TEST_ROOT}/test_alias_analysis.cpp ${JIT_TEST_ROOT}/test_argument_spec.cpp ${JIT_TEST_ROOT}/test_autodiff.cpp - ${JIT_TEST_ROOT}/test_base.cpp - ${JIT_TEST_ROOT}/test_base.h ${JIT_TEST_ROOT}/test_class_import.cpp ${JIT_TEST_ROOT}/test_class_parser.cpp ${JIT_TEST_ROOT}/test_class_type.cpp @@ -30,6 +30,8 @@ set(JIT_TEST_SRCS ${JIT_TEST_ROOT}/test_cleanup_passes.cpp ${JIT_TEST_ROOT}/test_create_autodiff_subgraphs.cpp ${JIT_TEST_ROOT}/test_custom_class.cpp + ${JIT_TEST_ROOT}/test_custom_class_registrations.h + ${JIT_TEST_ROOT}/test_custom_class_registrations.cpp ${JIT_TEST_ROOT}/test_custom_operators.cpp ${JIT_TEST_ROOT}/test_dce.cpp ${JIT_TEST_ROOT}/test_fuser.cpp @@ -95,8 +97,6 @@ elseif(USE_ROCM) ${PYTORCH_HIP_HCC_LIBRARIES} ${TORCH_CUDA_LIBRARIES}) - target_link_libraries(test_jit PRIVATE caffe2_gpu) - target_compile_definitions(test_jit PRIVATE USE_ROCM) endif() diff --git a/test/cpp/jit/README.md b/test/cpp/jit/README.md index a3e92403201f..ef5ea2d910be 100644 --- a/test/cpp/jit/README.md +++ b/test/cpp/jit/README.md @@ -1,69 +1,44 @@ # JIT C++ Tests -## How to add a new test +## Adding a new test First, create a new test file. Test files should have be placed in this directory, with a name that starts with `test_`, like `test_foo.cpp`. -Here is an example test file you can copy-paste. +In general a single test suite + +Add your test file to the `JIT_TEST_SRCS` list in `test/cpp/jit/CMakeLists.txt`. + +A test file may look like: ```cpp -#include +#include -// Tests go in torch::jit -namespace torch { -namespace jit { +using namespace ::torch::jit -// 1. Test cases are void() functions. -// 2. They start with the prefix `test` -void testCaseOne() { - // ... +TEST(FooTest, BarBaz) { + // ... } -void testCaseTwo() { - // ... -} -} +// Append '_CUDA' to the test case name will automatically filter it out if CUDA +// is not compiled. +TEST(FooTest, NeedsAGpu_CUDA) { + // ... } -``` -Then, register your test in `tests.h`: -```cpp -// Add to TH_FORALL_TESTS_CUDA instead for CUDA-requiring tests -#define TH_FORALL_TESTS(_) \ - _(ADFormulas) \ - _(Attributes) \ - ... - _(CaseOne) // note that the `test` prefix is omitted. - _(CaseTwo) -``` - -We glob all the test files together in `CMakeLists.txt` so that you don't -have to edit it every time you add a test. Unfortunately, this means that in -order to get the build to pick up your new test file, you need to re-run -cmake: -``` -python setup.py build --cmake +// Similarly, if only one GPU is detected, tests with `_MultiCUDA` at the end +// will not be run. +TEST(FooTest, NeedsMultipleGpus_MultiCUDA) { + // ... +} ``` -## Why do we have two different test runners? -We have two different ways of running our cpp tests: -1. With `gtest`, from a standalone binary. -2. 
With Python, from `TestJit.test_cpp` and `TestJit.test_cpp_cuda` (in - `test/test_jit.py`) - -We want both because we need to test things from a pure-C++ environment and -with all our various Python patch-points enabled. - -## How do I run the tests? +## Building and running the tests The following commands assume you are in PyTorch root. -1. With `gtest`: - ```bash - # (re)build the test binary - ninja build/bin/test_jit - # run - build/bin/test_jit --gtest_filter='glob_style_filter*' - ``` -2. With Python: - ``` - python test/test_jit.py TestJit.test_cpp TestJit.test_cpp_cuda - ``` +```bash +# ... Build PyTorch from source, e.g. +python setup.py develop +# (re)build just the binary +ninja -C build bin/test_jit +# run tests +build/bin/test_jit --gtest_filter='glob_style_filter*' +``` diff --git a/test/cpp/jit/gtest.cpp b/test/cpp/jit/gtest.cpp deleted file mode 100644 index e0e512be4352..000000000000 --- a/test/cpp/jit/gtest.cpp +++ /dev/null @@ -1,23 +0,0 @@ -#include - -#include - -namespace torch { -namespace jit { - -#define JIT_GTEST(name) \ - TEST(JitTest, name) { \ - test##name(); \ - } -TH_FORALL_TESTS(JIT_GTEST) -#undef JIT_TEST - -#define JIT_GTEST_CUDA(name) \ - TEST(JitTest, name##_CUDA) { \ - test##name(); \ - } -TH_FORALL_TESTS_CUDA(JIT_GTEST_CUDA) -#undef JIT_TEST_CUDA - -} // namespace jit -} // namespace torch diff --git a/test/cpp/jit/test_alias_analysis.cpp b/test/cpp/jit/test_alias_analysis.cpp index e854113a7a87..e700ee540616 100644 --- a/test/cpp/jit/test_alias_analysis.cpp +++ b/test/cpp/jit/test_alias_analysis.cpp @@ -1238,6 +1238,32 @@ TEST(AliasRegistrationTest, PureWithAnnotationsShouldError) { "Tried to register operator foo::rand11(Tensor(a) arg1) -> (Tensor(a)) with aliasing information in the schema but without AliasAnalysisKind::FROM_SCHEMA"); } +TEST(AliasRegistrationTest, AliasMoveAtenListOp) { + auto graph = std::make_shared(); + std::unordered_map vmap; + auto graph_string = R"IR( + graph(): + %x : Tensor = prim::MakeTestTensor() + %8 : int = prim::Constant[value=0]() + %5 : int = prim::Constant[value=1]() + %4 : int = prim::Constant[value=2]() + %y : Tensor[] = prim::ListConstruct(%x) + %6 : Tensor = aten::add_(%x, %4, %5) + %9 : Tensor = aten::cat(%y, %8) + return (%9))IR"; + + torch::jit::parseIR(graph_string, graph.get(), vmap); + AliasDb aliasDb(graph); + + // bc y.1 has a single used in a single non-aliasing aten op, + // x is added to y.1 contained elements instead of wildcard set + EXPECT_TRUE(!aliasDb.mayAlias(vmap["x"], vmap["9"])); + + // write to contained element should prevent move + EXPECT_TRUE(!aliasDb.moveBeforeTopologicallyValid( + vmap["y"]->node(), vmap["9"]->node())); +} + TEST(AliasRegistrationTest, PureWithAnnotationsShouldError2) { auto registry = torch::RegisterOperators().op( "foo::rand12(Tensor(a) arg1) -> Tensor(b)", diff --git a/test/cpp/jit/test_argument_spec.cpp b/test/cpp/jit/test_argument_spec.cpp index 01e27caac05f..bf40761fc468 100644 --- a/test/cpp/jit/test_argument_spec.cpp +++ b/test/cpp/jit/test_argument_spec.cpp @@ -1,3 +1,5 @@ +#include + #include #include "test/cpp/jit/test_utils.h" #include "torch/csrc/jit/runtime/argument_spec.h" @@ -5,6 +7,8 @@ namespace torch { namespace jit { +namespace { + int device(const autograd::Variable& v) { return v.device().is_cuda() ? 
v.get_device() : -1; } @@ -38,8 +42,9 @@ autograd::Variable var( autograd::Variable undef() { return autograd::Variable(); } +} // namespace -void testCompleteArgumentSpec() { +TEST(ArgumentSpecTest, CompleteArgumentSpec_CUDA) { auto const CF = at::CPU(at::kFloat); auto const CD = at::CPU(at::kDouble); auto const GF = at::CUDA(at::kFloat); @@ -94,34 +99,35 @@ void testCompleteArgumentSpec() { ASSERT_EQ(with_const.at(2).sizes().size(), 2); } -size_t hashCode(const TensorTypePtr& ptr) { - return std::hash()(*ptr.get()); -} +// TODO: this test was disabled for unknown reasons and doesn't run. +// static size_t hashCode(const TensorTypePtr& ptr) { +// return std::hash()(*ptr.get()); +// } -void testProfiledTensorTypeHashing() { - c10::VaryingShape vs(c10::optional{}); - auto ptt_empty1 = TensorType::create({}, {}, vs, vs, false); - auto ptt_empty2 = TensorType::create({}, {}, vs, vs, false); - ASSERT_EQ(hashCode(ptt_empty1), hashCode(ptt_empty2)); +// TEST(ArgumentSpecTest, VaryingShape) { +// c10::VaryingShape vs(c10::optional{}); +// auto ptt_empty1 = TensorType::create({}, {}, vs, vs, false); +// auto ptt_empty2 = TensorType::create({}, {}, vs, vs, false); +// ASSERT_EQ(hashCode(ptt_empty1), hashCode(ptt_empty2)); - c10::VaryingShape vs22(std::vector{2, 2}); - auto ptt_vs22_vs22_1 = TensorType::create({}, {}, vs22, vs22, false); - auto ptt_vs22_vs22_2 = TensorType::create({}, {}, vs22, vs22, false); - ASSERT_EQ(hashCode(ptt_vs22_vs22_1), hashCode(ptt_vs22_vs22_2)); +// c10::VaryingShape vs22(std::vector{2, 2}); +// auto ptt_vs22_vs22_1 = TensorType::create({}, {}, vs22, vs22, false); +// auto ptt_vs22_vs22_2 = TensorType::create({}, {}, vs22, vs22, false); +// ASSERT_EQ(hashCode(ptt_vs22_vs22_1), hashCode(ptt_vs22_vs22_2)); - c10::VaryingShape vs23(std::vector{2, 3}); - auto ptt_vs22_vs23_2 = TensorType::create({}, {}, vs22, vs23, false); - ASSERT_NE(hashCode(ptt_vs22_vs22_1), hashCode(ptt_vs22_vs23_2)); +// c10::VaryingShape vs23(std::vector{2, 3}); +// auto ptt_vs22_vs23_2 = TensorType::create({}, {}, vs22, vs23, false); +// ASSERT_NE(hashCode(ptt_vs22_vs22_1), hashCode(ptt_vs22_vs23_2)); - auto ptt_vs22_vs22_1_true = TensorType::create({}, {}, vs22, vs22, true); - auto ptt_vs22_vs22_2_true = TensorType::create({}, {}, vs22, vs22, true); - ASSERT_EQ(hashCode(ptt_vs22_vs22_1_true), hashCode(ptt_vs22_vs22_2_true)); +// auto ptt_vs22_vs22_1_true = TensorType::create({}, {}, vs22, vs22, true); +// auto ptt_vs22_vs22_2_true = TensorType::create({}, {}, vs22, vs22, true); +// ASSERT_EQ(hashCode(ptt_vs22_vs22_1_true), hashCode(ptt_vs22_vs22_2_true)); - auto ptt_vs22_vs22_1_false = TensorType::create({}, {}, vs22, vs22, false); - ASSERT_NE(hashCode(ptt_vs22_vs22_1_true), hashCode(ptt_vs22_vs22_1_false)); -} +// auto ptt_vs22_vs22_1_false = TensorType::create({}, {}, vs22, vs22, false); +// ASSERT_NE(hashCode(ptt_vs22_vs22_1_true), hashCode(ptt_vs22_vs22_1_false)); +// } -void testArgumentSpec() { +TEST(ArgumentSpecTest, Basic_CUDA) { auto& CF = at::CPU(at::kFloat); auto& CD = at::CPU(at::kDouble); auto& GF = at::CUDA(at::kFloat); diff --git a/test/cpp/jit/test_autodiff.cpp b/test/cpp/jit/test_autodiff.cpp index 7d431776a971..3993c63b1708 100644 --- a/test/cpp/jit/test_autodiff.cpp +++ b/test/cpp/jit/test_autodiff.cpp @@ -1,4 +1,5 @@ -#include "test/cpp/jit/test_base.h" +#include + #include "test/cpp/jit/test_utils.h" #include "torch/csrc/jit/frontend/tracer.h" #include "torch/csrc/jit/passes/common_subexpression_elimination.h" @@ -83,7 +84,7 @@ variable_list grad( fmap(inputs, get_edge)); } -void 
testADFormulas() { +TEST(AutodiffTest, ADFormulas) { const auto cast = [](const Variable& v) { return static_cast(v); }; @@ -174,7 +175,7 @@ void testADFormulas() { } } -void testDifferentiate() { +TEST(AutodiffTest, Differentiate) { // Note: can't use IRParser for this test due to issue #23989 auto graph = std::make_shared(); std::vector sizes{2, 3, 4}; @@ -229,7 +230,7 @@ void testDifferentiate() { ->run(*grad_spec.df); } -void testDifferentiateWithRequiresGrad() { +TEST(AutodiffTest, DifferentiateWithRequiresGrad) { const auto graph_string = R"IR( graph(%0 : Tensor, %1 : Tensor): diff --git a/test/cpp/jit/test_base.cpp b/test/cpp/jit/test_base.cpp deleted file mode 100644 index 338577fbd833..000000000000 --- a/test/cpp/jit/test_base.cpp +++ /dev/null @@ -1,26 +0,0 @@ -#include -#include - -#include "torch/csrc/jit/runtime/custom_operator.h" - -namespace torch { -namespace jit { -inline c10::AliasAnalysisKind aliasAnalysisFromSchema() { - return c10::AliasAnalysisKind::FROM_SCHEMA; -} - -namespace { -RegisterOperators reg({ - // This operator is intended to be used in JIT analysis and transformation - // pass unit tests in which Values with type Tensor are often required. It - // should not be used in situations in which the graph is actually executed - // because it always produces empty Tensors. - Operator( - "prim::MakeTestTensor() -> Tensor", - [](Stack* stack) { push(stack, at::Tensor()); }, - aliasAnalysisFromSchema()), -}); -} - -} // namespace jit -} // namespace torch diff --git a/test/cpp/jit/test_base.h b/test/cpp/jit/test_base.h deleted file mode 100644 index 54a59e445e95..000000000000 --- a/test/cpp/jit/test_base.h +++ /dev/null @@ -1,42 +0,0 @@ -#pragma once - -// This file defines assertion macros that work in both gtest and non-gtest -// builds, and has some common includes. 
-#include "torch/csrc/jit/ir/ir.h" -#include "torch/csrc/jit/runtime/operator.h" - -#if defined(USE_GTEST) -#include -#include -#else -#include "c10/util/Exception.h" -#define ASSERT_EQ(x, y) TORCH_INTERNAL_ASSERT((x) == (y)) -#define ASSERT_NE(x, y) TORCH_INTERNAL_ASSERT((x) != (y)) -#define ASSERT_TRUE TORCH_INTERNAL_ASSERT -#define ASSERT_FALSE(x) ASSERT_TRUE(!(x)) -#define ASSERT_THROWS_WITH(statement, substring) \ - try { \ - (void)statement; \ - ASSERT_TRUE(false); \ - } catch (const std::exception& e) { \ - ASSERT_NE(std::string(e.what()).find(substring), std::string::npos); \ - } -#define ASSERT_ANY_THROW(statement) \ - { \ - bool threw = false; \ - try { \ - (void)statement; \ - } catch (const std::exception& e) { \ - threw = true; \ - } \ - ASSERT_TRUE(threw); \ - } - -#endif // defined(USE_GTEST) - -static inline bool isSandcastle() { - return ( - (std::getenv("SANDCASTLE")) || - (std::getenv("TW_JOB_USER") && - std::string(std::getenv("TW_JOB_USER")) == "sandcastle")); -} diff --git a/test/cpp/jit/test_class_import.cpp b/test/cpp/jit/test_class_import.cpp index 82bc0cf3bccc..ffa845b3e2a8 100644 --- a/test/cpp/jit/test_class_import.cpp +++ b/test/cpp/jit/test_class_import.cpp @@ -1,7 +1,7 @@ -#include -#include +#include #include +#include #include #include #include @@ -45,7 +45,7 @@ static void import_libs( si.loadType(QualifiedName(class_name)); } -void testClassImport() { +TEST(ClassImportTest, Basic) { auto cu1 = std::make_shared(); auto cu2 = std::make_shared(); std::vector constantTable; @@ -80,7 +80,7 @@ void testClassImport() { ASSERT_FALSE(c); } -void testScriptObject() { +TEST(ClassImportTest, ScriptObject) { Module m1("m1"); Module m2("m2"); std::vector constantTable; @@ -114,7 +114,7 @@ def __init__(self, x): return x )JIT"; -void testClassDerive() { +TEST(ClassImportTest, ClassDerive) { auto cu = std::make_shared(); auto cls = ClassType::create("foo.bar", cu); const auto self = SimpleSelf(cls); @@ -142,7 +142,7 @@ class FooBar1234(Module): return (self.f).top() )JIT"; -void testSaveLoadTorchbind() { +TEST(ClassImportTest, CustomClass) { auto cu1 = std::make_shared(); std::vector constantTable; // Import different versions of FooTest into two namespaces. 
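The class-import and custom-class tests above operate on TorchBind classes like the
ones registered in test_custom_class_registrations.cpp later in this patch (for
example ``_TorchScriptTesting._StackString``). As a rough sketch, not part of this
patch, such a class is typically exercised from Python as follows, assuming the
``torchbind_test`` shared library has been built (the library path below is
illustrative):

```python
import torch

# Load the shared library containing the TORCH_LIBRARY registration block
# (illustrative path; adjust to the local build tree).
torch.classes.load_library("build/lib/libtorchbind_test.so")

# _StackString is registered under the _TorchScriptTesting namespace with
# push/pop/top methods and def_pickle hooks.
s = torch.classes._TorchScriptTesting._StackString(["hi", "mom"])
s.push("foo")
assert s.pop() == "foo"
assert s.top() == "mom"

# The same object can be used from TorchScript and round-tripped through
# torch.jit.save / torch.jit.load via its pickle hooks.
@torch.jit.script
def top_of(stack: torch.classes._TorchScriptTesting._StackString) -> str:
    return stack.top()

assert top_of(s) == "mom"
```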
diff --git a/test/cpp/jit/test_class_parser.cpp b/test/cpp/jit/test_class_parser.cpp index 45e37103bb5a..2f7f06d3802b 100644 --- a/test/cpp/jit/test_class_parser.cpp +++ b/test/cpp/jit/test_class_parser.cpp @@ -1,4 +1,5 @@ -#include +#include + #include #include @@ -15,7 +16,7 @@ const auto testSource = R"JIT( an_attribute : Tensor )JIT"; -void testClassParser() { +TEST(ClassParserTest, Basic) { Parser p(std::make_shared(testSource)); std::vector definitions; std::vector resolvers; diff --git a/test/cpp/jit/test_class_type.cpp b/test/cpp/jit/test_class_type.cpp index c00aafcc526b..21229594d56d 100644 --- a/test/cpp/jit/test_class_type.cpp +++ b/test/cpp/jit/test_class_type.cpp @@ -1,11 +1,12 @@ -#include +#include + #include #include namespace torch { namespace jit { -void testClassTypeAddRemoveAttr() { +TEST(ClassTypeTest, AddRemoveAttr) { auto cu = std::make_shared(); auto cls = ClassType::create("foo.bar", cu, true); cls->addAttribute("attr1", TensorType::get(), true); @@ -32,12 +33,12 @@ void testClassTypeAddRemoveAttr() { cls->addAttribute("attr1", IntType::get()); } -void testClassTypeAddRemoveConstant() { +TEST(ClassTypeTest, AddRemoveConstant) { auto cu = std::make_shared(); auto cls = ClassType::create("foo.bar", cu); cls->addConstant("const1", IValue(1)); cls->addConstant("const2", IValue(2)); - cls->addConstant("const3", IValue(2)); + cls->addConstant("const3", IValue(3)); ASSERT_EQ(cls->numConstants(), 3); ASSERT_TRUE(cls->hasConstant("const1")); ASSERT_TRUE(cls->hasConstant("const2")); @@ -46,7 +47,7 @@ void testClassTypeAddRemoveConstant() { ASSERT_EQ(cls->getConstant("const1").toInt(), 1); ASSERT_EQ(cls->getConstant("const2").toInt(), 2); - ASSERT_EQ(cls->getConstant("const2").toInt(), 3); + ASSERT_EQ(cls->getConstant("const3").toInt(), 3); cls->unsafeRemoveConstant("const2"); ASSERT_TRUE(cls->hasConstant("const1")); diff --git a/test/cpp/jit/test_cleanup_passes.cpp b/test/cpp/jit/test_cleanup_passes.cpp index 2f2ca4e0a19b..38ceef932eb0 100644 --- a/test/cpp/jit/test_cleanup_passes.cpp +++ b/test/cpp/jit/test_cleanup_passes.cpp @@ -1,19 +1,19 @@ +#include + #include #include #include #include -#include "test/cpp/jit/test_base.h" namespace torch { namespace jit { -void testCleanUpPasses() { +TEST(CleanupPassTest, Basic) { // Tests stability of clean up passes when dealing with constant pooling // and constant propagation. 
- { - auto graph = std::make_shared(); - parseIR( - R"IR( + auto graph = std::make_shared(); + parseIR( + R"IR( graph(%cond.1 : Tensor, %suffix.1 : str): %3 : bool = aten::Bool(%cond.1) # o.py:6:7 @@ -31,20 +31,19 @@ graph(%cond.1 : Tensor, -> (%12) return (%25) )IR", - &*graph); - runCleanupPasses(graph); - testing::FileCheck() - .check_count( - "prim::Constant[value=\"same string with a twist\"]", - 1, - /*exactly=*/true) - ->run(*graph); + &*graph); + runCleanupPasses(graph); + testing::FileCheck() + .check_count( + "prim::Constant[value=\"same string with a twist\"]", + 1, + /*exactly=*/true) + ->run(*graph); - auto graph_after_pass_once = graph->toString(); - runCleanupPasses(graph); - auto graph_after_pass_twice = graph->toString(); - ASSERT_EQ(graph_after_pass_once, graph_after_pass_twice); - } + auto graph_after_pass_once = graph->toString(); + runCleanupPasses(graph); + auto graph_after_pass_twice = graph->toString(); + ASSERT_EQ(graph_after_pass_once, graph_after_pass_twice); } } // namespace jit } // namespace torch diff --git a/test/cpp/jit/test_code_template.cpp b/test/cpp/jit/test_code_template.cpp index e4d7d1ef856e..bf539e3d169f 100644 --- a/test/cpp/jit/test_code_template.cpp +++ b/test/cpp/jit/test_code_template.cpp @@ -1,6 +1,6 @@ -#include "test/cpp/jit/test_base.h" -#include "test/cpp/jit/test_utils.h" +#include +#include "test/cpp/jit/test_utils.h" #include "torch/csrc/jit/frontend/code_template.h" namespace torch { @@ -33,31 +33,29 @@ static const auto ct_expect = R"( int notest(int a) )"; -void testCodeTemplate() { - { - TemplateEnv e; - e.s("hi", "foo"); - e.v("what", {"is", "this"}); - TemplateEnv c(e); - c.s("hi", "foo2"); - ASSERT_EQ(e.s("hi"), "foo"); - ASSERT_EQ(c.s("hi"), "foo2"); - ASSERT_EQ(e.v("what")[0], "is"); - } +TEST(TestCodeTemplate, Copying) { + TemplateEnv e; + e.s("hi", "foo"); + e.v("what", {"is", "this"}); + TemplateEnv c(e); + c.s("hi", "foo2"); + ASSERT_EQ(e.s("hi"), "foo"); + ASSERT_EQ(c.s("hi"), "foo2"); + ASSERT_EQ(e.v("what")[0], "is"); +} - { - TemplateEnv e; - e.v("args", {"hi", "8"}); - e.v("bar", {"what\non many\nlines...", "7"}); - e.s("a", "3"); - e.s("b", "4"); - e.v("stuff", {"things...", "others"}); - e.v("empty", {}); - auto s = ct.format(e); - // std::cout << "'" << s << "'\n"; - // std::cout << "'" << ct_expect << "'\n"; - ASSERT_EQ(s, ct_expect); - } +TEST(TestCodeTemplate, Formatting) { + TemplateEnv e; + e.v("args", {"hi", "8"}); + e.v("bar", {"what\non many\nlines...", "7"}); + e.s("a", "3"); + e.s("b", "4"); + e.v("stuff", {"things...", "others"}); + e.v("empty", {}); + auto s = ct.format(e); + // std::cout << "'" << s << "'\n"; + // std::cout << "'" << ct_expect << "'\n"; + ASSERT_EQ(s, ct_expect); } } // namespace jit diff --git a/test/cpp/jit/test_constant_pooling.cpp b/test/cpp/jit/test_constant_pooling.cpp index b949c9a45b25..c8cb58e1886a 100644 --- a/test/cpp/jit/test_constant_pooling.cpp +++ b/test/cpp/jit/test_constant_pooling.cpp @@ -1,9 +1,10 @@ +#include + #include #include #include #include #include -#include "test/cpp/jit/test_base.h" #include #include @@ -11,26 +12,26 @@ namespace torch { namespace jit { -void testConstantPooling() { - { - auto graph = std::make_shared(); - parseIR( - R"IR( +TEST(ConstantPoolingTest, Int) { + auto graph = std::make_shared(); + parseIR( + R"IR( graph(): %8 : int = prim::Constant[value=1]() %10 : int = prim::Constant[value=1]() return (%8, %10) )IR", - &*graph); - ConstantPooling(graph); - testing::FileCheck() - .check_count("prim::Constant", 1, /*exactly*/ true) - 
->run(*graph); - } - { - auto graph = std::make_shared(); - parseIR( - R"IR( + &*graph); + ConstantPooling(graph); + testing::FileCheck() + .check_count("prim::Constant", 1, /*exactly*/ true) + ->run(*graph); +} + +TEST(ConstantPoolingTest, PoolingAcrossBlocks) { + auto graph = std::make_shared(); + parseIR( + R"IR( graph(%cond : Tensor): %a : str = prim::Constant[value="bcd"]() %3 : bool = aten::Bool(%cond) @@ -44,17 +45,18 @@ graph(%cond : Tensor): %7 : (str, str) = prim::TupleConstruct(%a, %b) return (%7) )IR", - &*graph); - ConstantPooling(graph); - testing::FileCheck() - .check_count("prim::Constant[value=\"abc\"]", 1, /*exactly*/ true) - ->check_count("prim::Constant[value=\"bcd\"]", 1, /*exactly*/ true) - ->run(*graph); - } - { - auto graph = std::make_shared(); - parseIR( - R"IR( + &*graph); + ConstantPooling(graph); + testing::FileCheck() + .check_count("prim::Constant[value=\"abc\"]", 1, /*exactly*/ true) + ->check_count("prim::Constant[value=\"bcd\"]", 1, /*exactly*/ true) + ->run(*graph); +} + +TEST(ConstantPoolingTest, PoolingDifferentDevices) { + auto graph = std::make_shared(); + parseIR( + R"IR( graph(): %2 : int = prim::Constant[value=2]() %1 : int = prim::Constant[value=1]() @@ -70,22 +72,21 @@ graph(): prim::Print(%x, %y, %z) return (%1) )IR", - &*graph); - // three tensors created - two different devices among the three - // don't have good support for parsing tensor constants - ConstantPropagation(graph); - ConstantPooling(graph); - testing::FileCheck() - .check_count( - "Float(2:1, requires_grad=0, device=cpu) = prim::Constant", - 1, - /*exactly*/ true) - ->check_count( - "Long(2:1, requires_grad=0, device=cpu) = prim::Constant", - 1, - /*exactly*/ true) - ->run(*graph); - } + &*graph); + // three tensors created - two different devices among the three + // don't have good support for parsing tensor constants + ConstantPropagation(graph); + ConstantPooling(graph); + testing::FileCheck() + .check_count( + "Float(2:1, requires_grad=0, device=cpu) = prim::Constant", + 1, + /*exactly*/ true) + ->check_count( + "Long(2:1, requires_grad=0, device=cpu) = prim::Constant", + 1, + /*exactly*/ true) + ->run(*graph); } } // namespace jit } // namespace torch diff --git a/test/cpp/jit/test_create_autodiff_subgraphs.cpp b/test/cpp/jit/test_create_autodiff_subgraphs.cpp index 8da6d9d6a1b2..e97043f84d24 100644 --- a/test/cpp/jit/test_create_autodiff_subgraphs.cpp +++ b/test/cpp/jit/test_create_autodiff_subgraphs.cpp @@ -1,4 +1,5 @@ -#include "test/cpp/jit/test_base.h" +#include + #include "test/cpp/jit/test_utils.h" #include "torch/csrc/jit/passes/create_autodiff_subgraphs.h" @@ -6,7 +7,7 @@ namespace torch { namespace jit { -void testCreateAutodiffSubgraphs() { +TEST(CreateAutodiffSubgraphsTest, Basic) { auto graph = build_lstm(); CreateAutodiffSubgraphs(graph, /*threshold=*/2); // all of the ops are within the DifferentiableGraph diff --git a/test/cpp/jit/test_custom_class.cpp b/test/cpp/jit/test_custom_class.cpp index 543fbc20eb3d..a96a3b4a5635 100644 --- a/test/cpp/jit/test_custom_class.cpp +++ b/test/cpp/jit/test_custom_class.cpp @@ -1,3 +1,6 @@ +#include + +#include #include #include @@ -8,317 +11,7 @@ namespace torch { namespace jit { -namespace { - -struct Foo : torch::CustomClassHolder { - int x, y; - Foo() : x(0), y(0) {} - Foo(int x_, int y_) : x(x_), y(y_) {} - int64_t info() { - return this->x * this->y; - } - int64_t add(int64_t z) { - return (x + y) * z; - } - void increment(int64_t z) { - this->x += z; - this->y += z; - } - int64_t combine(c10::intrusive_ptr b) { - 
return this->info() + b->info(); - } - ~Foo() { - // std::cout<<"Destroying object with values: "< -struct MyStackClass : torch::CustomClassHolder { - std::vector stack_; - MyStackClass(std::vector init) : stack_(init.begin(), init.end()) {} - - void push(T x) { - stack_.push_back(x); - } - T pop() { - auto val = stack_.back(); - stack_.pop_back(); - return val; - } - - c10::intrusive_ptr clone() const { - return c10::make_intrusive(stack_); - } - - void merge(const c10::intrusive_ptr& c) { - for (auto& elem : c->stack_) { - push(elem); - } - } - - std::tuple return_a_tuple() const { - return std::make_tuple(1337.0f, 123); - } -}; - -struct PickleTester : torch::CustomClassHolder { - PickleTester(std::vector vals) : vals(std::move(vals)) {} - std::vector vals; -}; - -at::Tensor take_an_instance(const c10::intrusive_ptr& instance) { - return torch::zeros({instance->vals.back(), 4}); -} - -struct ElementwiseInterpreter : torch::CustomClassHolder { - using InstructionType = std::tuple< - std::string /*op*/, - std::vector /*inputs*/, - std::string /*output*/>; - - ElementwiseInterpreter() {} - - // Load a list of instructions into the interpreter. As specified above, - // instructions specify the operation (currently support "add" and "mul"), - // the names of the input values, and the name of the single output value - // from this instruction - void setInstructions(std::vector instructions) { - instructions_ = std::move(instructions); - } - - // Add a constant. The interpreter maintains a set of constants across - // calls. They are keyed by name, and constants can be referenced in - // Instructions by the name specified - void addConstant(const std::string& name, at::Tensor value) { - constants_.insert_or_assign(name, std::move(value)); - } - - // Set the string names for the positional inputs to the function this - // interpreter represents. When invoked, the interpreter will assign - // the positional inputs to the names in the corresponding position in - // input_names. - void setInputNames(std::vector input_names) { - input_names_ = std::move(input_names); - } - - // Specify the output name for the function this interpreter represents. This - // should match the "output" field of one of the instructions in the - // instruction list, typically the last instruction. - void setOutputName(std::string output_name) { - output_name_ = std::move(output_name); - } - - // Invoke this interpreter. This takes a list of positional inputs and returns - // a single output. Currently, inputs and outputs must all be Tensors. - at::Tensor __call__(std::vector inputs) { - // Environment to hold local variables - std::unordered_map environment; - - // Load inputs according to the specified names - if (inputs.size() != input_names_.size()) { - std::stringstream err; - err << "Expected " << input_names_.size() << " inputs, but got " - << inputs.size() << "!"; - throw std::runtime_error(err.str()); - } - for (size_t i = 0; i < inputs.size(); ++i) { - environment[input_names_[i]] = inputs[i]; - } - - for (InstructionType& instr : instructions_) { - // Retrieve all input values for this op - std::vector inputs; - for (const auto& input_name : std::get<1>(instr)) { - // Operator output values shadow constants. - // Imagine all constants are defined in statements at the beginning - // of a function (a la K&R C). Any definition of an output value must - // necessarily come after constant definition in textual order. 
Thus, - // We look up values in the environment first then the constant table - // second to implement this shadowing behavior - if (environment.find(input_name) != environment.end()) { - inputs.push_back(environment.at(input_name)); - } else if (constants_.find(input_name) != constants_.end()) { - inputs.push_back(constants_.at(input_name)); - } else { - std::stringstream err; - err << "Instruction referenced unknown value " << input_name << "!"; - throw std::runtime_error(err.str()); - } - } - - // Run the specified operation - at::Tensor result; - const auto& op = std::get<0>(instr); - if (op == "add") { - if (inputs.size() != 2) { - throw std::runtime_error("Unexpected number of inputs for add op!"); - } - result = inputs[0] + inputs[1]; - } else if (op == "mul") { - if (inputs.size() != 2) { - throw std::runtime_error("Unexpected number of inputs for mul op!"); - } - result = inputs[0] * inputs[1]; - } else { - std::stringstream err; - err << "Unknown operator " << op << "!"; - throw std::runtime_error(err.str()); - } - - // Write back result into environment - const auto& output_name = std::get<2>(instr); - environment[output_name] = std::move(result); - } - - if (!output_name_) { - throw std::runtime_error("Output name not specififed!"); - } - - return environment.at(*output_name_); - } - - // Ser/De infrastructure. See - // https://pytorch.org/tutorials/advanced/torch_script_custom_classes.html#defining-serialization-deserialization-methods-for-custom-c-classes - // for more info. - - // This is the type we will use to marshall information on disk during - // ser/de. It is a simple tuple composed of primitive types and simple - // collection types like vector, optional, and dict. - using SerializationType = std::tuple< - std::vector /*input_names_*/, - c10::optional /*output_name_*/, - c10::Dict /*constants_*/, - std::vector /*instructions_*/ - >; - - // This function yields the SerializationType instance for `this`. - SerializationType __getstate__() const { - return SerializationType{ - input_names_, output_name_, constants_, instructions_}; - } - - // This function will create an instance of `ElementwiseInterpreter` given - // an instance of `SerializationType`. 
- static c10::intrusive_ptr __setstate__( - SerializationType state) { - auto instance = c10::make_intrusive(); - std::tie( - instance->input_names_, - instance->output_name_, - instance->constants_, - instance->instructions_) = std::move(state); - return instance; - } - - // Class members - std::vector input_names_; - c10::optional output_name_; - c10::Dict constants_; - std::vector instructions_; -}; - -TORCH_LIBRARY(_TorchScriptTesting, m) { - m.class_("_Foo") - .def(torch::init()) - // .def(torch::init<>()) - .def("info", &Foo::info) - .def("increment", &Foo::increment) - .def("add", &Foo::add) - .def("combine", &Foo::combine); - - m.class_("_NoInit").def( - "get_x", [](const c10::intrusive_ptr& self) { return self->x; }); - - m.class_>("_StackString") - .def(torch::init>()) - .def("push", &MyStackClass::push) - .def("pop", &MyStackClass::pop) - .def("clone", &MyStackClass::clone) - .def("merge", &MyStackClass::merge) - .def_pickle( - [](const c10::intrusive_ptr>& self) { - return self->stack_; - }, - [](std::vector state) { // __setstate__ - return c10::make_intrusive>( - std::vector{"i", "was", "deserialized"}); - }) - .def("return_a_tuple", &MyStackClass::return_a_tuple) - .def( - "top", - [](const c10::intrusive_ptr>& self) - -> std::string { return self->stack_.back(); }) - .def( - "__str__", - [](const c10::intrusive_ptr>& self) { - std::stringstream ss; - ss << "["; - for (size_t i = 0; i < self->stack_.size(); ++i) { - ss << self->stack_[i]; - if (i != self->stack_.size() - 1) { - ss << ", "; - } - } - ss << "]"; - return ss.str(); - }); - // clang-format off - // The following will fail with a static assert telling you you have to - // take an intrusive_ptr as the first argument. - // .def("foo", [](int64_t a) -> int64_t{ return 3;}); - // clang-format on - - m.class_("_PickleTester") - .def(torch::init>()) - .def_pickle( - [](c10::intrusive_ptr self) { // __getstate__ - return std::vector{1, 3, 3, 7}; - }, - [](std::vector state) { // __setstate__ - return c10::make_intrusive(std::move(state)); - }) - .def( - "top", - [](const c10::intrusive_ptr& self) { - return self->vals.back(); - }) - .def("pop", [](const c10::intrusive_ptr& self) { - auto val = self->vals.back(); - self->vals.pop_back(); - return val; - }); - - m.def( - "take_an_instance(__torch__.torch.classes._TorchScriptTesting._PickleTester x) -> Tensor Y", - take_an_instance); - // test that schema inference is ok too - m.def("take_an_instance_inferred", take_an_instance); - - m.class_("_ElementwiseInterpreter") - .def(torch::init<>()) - .def("set_instructions", &ElementwiseInterpreter::setInstructions) - .def("add_constant", &ElementwiseInterpreter::addConstant) - .def("set_input_names", &ElementwiseInterpreter::setInputNames) - .def("set_output_name", &ElementwiseInterpreter::setOutputName) - .def("__call__", &ElementwiseInterpreter::__call__) - .def_pickle( - /* __getstate__ */ - [](const c10::intrusive_ptr& self) { - return self->__getstate__(); - }, - /* __setstate__ */ - [](ElementwiseInterpreter::SerializationType state) { - return ElementwiseInterpreter::__setstate__(std::move(state)); - }); -} - -} // namespace - -void testTorchbindIValueAPI() { +TEST(CustomClassTest, TorchbindIValueAPI) { script::Module m("m"); // test make_custom_class API diff --git a/test/cpp/jit/test_custom_class_registrations.cpp b/test/cpp/jit/test_custom_class_registrations.cpp new file mode 100644 index 000000000000..f563120bbc6c --- /dev/null +++ b/test/cpp/jit/test_custom_class_registrations.cpp @@ -0,0 +1,291 @@ +#include + 
+#include +#include + +#include +#include +#include + +using namespace torch::jit; + +namespace { + +struct Foo : torch::CustomClassHolder { + int x, y; + Foo() : x(0), y(0) {} + Foo(int x_, int y_) : x(x_), y(y_) {} + int64_t info() { + return this->x * this->y; + } + int64_t add(int64_t z) { + return (x + y) * z; + } + void increment(int64_t z) { + this->x += z; + this->y += z; + } + int64_t combine(c10::intrusive_ptr b) { + return this->info() + b->info(); + } + ~Foo() { + // std::cout<<"Destroying object with values: "< vals) : vals(std::move(vals)) {} + std::vector vals; +}; + +at::Tensor take_an_instance(const c10::intrusive_ptr& instance) { + return torch::zeros({instance->vals.back(), 4}); +} + +struct ElementwiseInterpreter : torch::CustomClassHolder { + using InstructionType = std::tuple< + std::string /*op*/, + std::vector /*inputs*/, + std::string /*output*/>; + + ElementwiseInterpreter() {} + + // Load a list of instructions into the interpreter. As specified above, + // instructions specify the operation (currently support "add" and "mul"), + // the names of the input values, and the name of the single output value + // from this instruction + void setInstructions(std::vector instructions) { + instructions_ = std::move(instructions); + } + + // Add a constant. The interpreter maintains a set of constants across + // calls. They are keyed by name, and constants can be referenced in + // Instructions by the name specified + void addConstant(const std::string& name, at::Tensor value) { + constants_.insert_or_assign(name, std::move(value)); + } + + // Set the string names for the positional inputs to the function this + // interpreter represents. When invoked, the interpreter will assign + // the positional inputs to the names in the corresponding position in + // input_names. + void setInputNames(std::vector input_names) { + input_names_ = std::move(input_names); + } + + // Specify the output name for the function this interpreter represents. This + // should match the "output" field of one of the instructions in the + // instruction list, typically the last instruction. + void setOutputName(std::string output_name) { + output_name_ = std::move(output_name); + } + + // Invoke this interpreter. This takes a list of positional inputs and returns + // a single output. Currently, inputs and outputs must all be Tensors. + at::Tensor __call__(std::vector inputs) { + // Environment to hold local variables + std::unordered_map environment; + + // Load inputs according to the specified names + if (inputs.size() != input_names_.size()) { + std::stringstream err; + err << "Expected " << input_names_.size() << " inputs, but got " + << inputs.size() << "!"; + throw std::runtime_error(err.str()); + } + for (size_t i = 0; i < inputs.size(); ++i) { + environment[input_names_[i]] = inputs[i]; + } + + for (InstructionType& instr : instructions_) { + // Retrieve all input values for this op + std::vector inputs; + for (const auto& input_name : std::get<1>(instr)) { + // Operator output values shadow constants. + // Imagine all constants are defined in statements at the beginning + // of a function (a la K&R C). Any definition of an output value must + // necessarily come after constant definition in textual order. 
Thus,
+        // we look up values in the environment first, then the constant table
+        // second, to implement this shadowing behavior
+        if (environment.find(input_name) != environment.end()) {
+          inputs.push_back(environment.at(input_name));
+        } else if (constants_.find(input_name) != constants_.end()) {
+          inputs.push_back(constants_.at(input_name));
+        } else {
+          std::stringstream err;
+          err << "Instruction referenced unknown value " << input_name << "!";
+          throw std::runtime_error(err.str());
+        }
+      }
+
+      // Run the specified operation
+      at::Tensor result;
+      const auto& op = std::get<0>(instr);
+      if (op == "add") {
+        if (inputs.size() != 2) {
+          throw std::runtime_error("Unexpected number of inputs for add op!");
+        }
+        result = inputs[0] + inputs[1];
+      } else if (op == "mul") {
+        if (inputs.size() != 2) {
+          throw std::runtime_error("Unexpected number of inputs for mul op!");
+        }
+        result = inputs[0] * inputs[1];
+      } else {
+        std::stringstream err;
+        err << "Unknown operator " << op << "!";
+        throw std::runtime_error(err.str());
+      }
+
+      // Write back result into environment
+      const auto& output_name = std::get<2>(instr);
+      environment[output_name] = std::move(result);
+    }
+
+    if (!output_name_) {
+      throw std::runtime_error("Output name not specified!");
+    }
+
+    return environment.at(*output_name_);
+  }
+
+  // Ser/De infrastructure. See
+  // https://pytorch.org/tutorials/advanced/torch_script_custom_classes.html#defining-serialization-deserialization-methods-for-custom-c-classes
+  // for more info.
+
+  // This is the type we will use to marshal information on disk during
+  // ser/de. It is a simple tuple composed of primitive types and simple
+  // collection types like vector, optional, and dict.
+  using SerializationType = std::tuple<
+      std::vector<std::string> /*input_names_*/,
+      c10::optional<std::string> /*output_name_*/,
+      c10::Dict<std::string, at::Tensor> /*constants_*/,
+      std::vector<InstructionType> /*instructions_*/
+      >;
+
+  // This function yields the SerializationType instance for `this`.
+  SerializationType __getstate__() const {
+    return SerializationType{
+        input_names_, output_name_, constants_, instructions_};
+  }
+
+  // This function will create an instance of `ElementwiseInterpreter` given
+  // an instance of `SerializationType`.
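Editor's sketch, not part of this diff: a minimal end-to-end use of the interpreter defined above, driving the same methods the TORCH_LIBRARY block later in this file exposes and exercising the __getstate__/__setstate__ pair directly (mirroring what the def_pickle registration below wires up). The function name, tensor shapes, and program are illustrative assumptions.

// Illustrative only; assumes this sketch lives in the same TU as
// ElementwiseInterpreter.
at::Tensor exercise_elementwise_interpreter() {
  auto interp = c10::make_intrusive<ElementwiseInterpreter>();
  interp->setInputNames({"x", "y"});
  interp->addConstant("c", torch::ones({2, 2}));

  std::vector<ElementwiseInterpreter::InstructionType> program;
  program.emplace_back("add", std::vector<std::string>{"x", "y"}, "t0");   // t0 = x + y
  program.emplace_back("mul", std::vector<std::string>{"t0", "c"}, "out"); // out = t0 * c
  interp->setInstructions(std::move(program));
  interp->setOutputName("out");

  // Round-trip through the plain-data SerializationType, the same calls the
  // def_pickle lambdas forward to.
  auto restored = ElementwiseInterpreter::__setstate__(interp->__getstate__());

  return restored->__call__({torch::rand({2, 2}), torch::rand({2, 2})});
}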
+ static c10::intrusive_ptr __setstate__( + SerializationType state) { + auto instance = c10::make_intrusive(); + std::tie( + instance->input_names_, + instance->output_name_, + instance->constants_, + instance->instructions_) = std::move(state); + return instance; + } + + // Class members + std::vector input_names_; + c10::optional output_name_; + c10::Dict constants_; + std::vector instructions_; +}; + +TORCH_LIBRARY(_TorchScriptTesting, m) { + m.class_("_Foo") + .def(torch::init()) + // .def(torch::init<>()) + .def("info", &Foo::info) + .def("increment", &Foo::increment) + .def("add", &Foo::add) + .def("combine", &Foo::combine); + + m.class_("_NoInit").def( + "get_x", [](const c10::intrusive_ptr& self) { return self->x; }); + + m.class_>("_StackString") + .def(torch::init>()) + .def("push", &MyStackClass::push) + .def("pop", &MyStackClass::pop) + .def("clone", &MyStackClass::clone) + .def("merge", &MyStackClass::merge) + .def_pickle( + [](const c10::intrusive_ptr>& self) { + return self->stack_; + }, + [](std::vector state) { // __setstate__ + return c10::make_intrusive>( + std::vector{"i", "was", "deserialized"}); + }) + .def("return_a_tuple", &MyStackClass::return_a_tuple) + .def( + "top", + [](const c10::intrusive_ptr>& self) + -> std::string { return self->stack_.back(); }) + .def( + "__str__", + [](const c10::intrusive_ptr>& self) { + std::stringstream ss; + ss << "["; + for (size_t i = 0; i < self->stack_.size(); ++i) { + ss << self->stack_[i]; + if (i != self->stack_.size() - 1) { + ss << ", "; + } + } + ss << "]"; + return ss.str(); + }); + // clang-format off + // The following will fail with a static assert telling you you have to + // take an intrusive_ptr as the first argument. + // .def("foo", [](int64_t a) -> int64_t{ return 3;}); + // clang-format on + + m.class_("_PickleTester") + .def(torch::init>()) + .def_pickle( + [](c10::intrusive_ptr self) { // __getstate__ + return std::vector{1, 3, 3, 7}; + }, + [](std::vector state) { // __setstate__ + return c10::make_intrusive(std::move(state)); + }) + .def( + "top", + [](const c10::intrusive_ptr& self) { + return self->vals.back(); + }) + .def("pop", [](const c10::intrusive_ptr& self) { + auto val = self->vals.back(); + self->vals.pop_back(); + return val; + }); + + m.def( + "take_an_instance(__torch__.torch.classes._TorchScriptTesting._PickleTester x) -> Tensor Y", + take_an_instance); + // test that schema inference is ok too + m.def("take_an_instance_inferred", take_an_instance); + + m.class_("_ElementwiseInterpreter") + .def(torch::init<>()) + .def("set_instructions", &ElementwiseInterpreter::setInstructions) + .def("add_constant", &ElementwiseInterpreter::addConstant) + .def("set_input_names", &ElementwiseInterpreter::setInputNames) + .def("set_output_name", &ElementwiseInterpreter::setOutputName) + .def("__call__", &ElementwiseInterpreter::__call__) + .def_pickle( + /* __getstate__ */ + [](const c10::intrusive_ptr& self) { + return self->__getstate__(); + }, + /* __setstate__ */ + [](ElementwiseInterpreter::SerializationType state) { + return ElementwiseInterpreter::__setstate__(std::move(state)); + }); +} + +} // namespace diff --git a/test/cpp/jit/test_custom_class_registrations.h b/test/cpp/jit/test_custom_class_registrations.h new file mode 100644 index 000000000000..4e6b7bd43883 --- /dev/null +++ b/test/cpp/jit/test_custom_class_registrations.h @@ -0,0 +1,36 @@ +#include +#include + +namespace torch { +namespace jit { + +template +struct MyStackClass : torch::CustomClassHolder { + std::vector stack_; + 
MyStackClass(std::vector init) : stack_(init.begin(), init.end()) {} + + void push(T x) { + stack_.push_back(x); + } + T pop() { + auto val = stack_.back(); + stack_.pop_back(); + return val; + } + + c10::intrusive_ptr clone() const { + return c10::make_intrusive(stack_); + } + + void merge(const c10::intrusive_ptr& c) { + for (auto& elem : c->stack_) { + push(elem); + } + } + + std::tuple return_a_tuple() const { + return std::make_tuple(1337.0f, 123); + } +}; +} // namespace jit +} // namespace torch diff --git a/test/cpp/jit/test_custom_operators.cpp b/test/cpp/jit/test_custom_operators.cpp index 529b36385bd4..d3f61268e8f1 100644 --- a/test/cpp/jit/test_custom_operators.cpp +++ b/test/cpp/jit/test_custom_operators.cpp @@ -1,4 +1,5 @@ -#include "test/cpp/jit/test_base.h" +#include + #include "test/cpp/jit/test_utils.h" #include "torch/csrc/jit/ir/alias_analysis.h" @@ -11,134 +12,135 @@ namespace torch { namespace jit { -void testCustomOperators() { - { - torch::RegisterOperators reg( - "foo::bar", [](double a, at::Tensor b) { return a + b; }); - auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::bar")); - ASSERT_EQ(ops.size(), 1); +TEST(CustomOperatorTest, InferredSchema) { + torch::RegisterOperators reg( + "foo::bar", [](double a, at::Tensor b) { return a + b; }); + auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::bar")); + ASSERT_EQ(ops.size(), 1); - auto& op = ops.front(); - ASSERT_EQ(op->schema().name(), "foo::bar"); + auto& op = ops.front(); + ASSERT_EQ(op->schema().name(), "foo::bar"); - ASSERT_EQ(op->schema().arguments().size(), 2); - ASSERT_EQ(op->schema().arguments()[0].name(), "_0"); - ASSERT_EQ(op->schema().arguments()[0].type()->kind(), TypeKind::FloatType); - ASSERT_EQ(op->schema().arguments()[1].name(), "_1"); - ASSERT_EQ(op->schema().arguments()[1].type()->kind(), TypeKind::TensorType); + ASSERT_EQ(op->schema().arguments().size(), 2); + ASSERT_EQ(op->schema().arguments()[0].name(), "_0"); + ASSERT_EQ(op->schema().arguments()[0].type()->kind(), TypeKind::FloatType); + ASSERT_EQ(op->schema().arguments()[1].name(), "_1"); + ASSERT_EQ(op->schema().arguments()[1].type()->kind(), TypeKind::TensorType); - ASSERT_EQ(op->schema().returns()[0].type()->kind(), TypeKind::TensorType); + ASSERT_EQ(op->schema().returns()[0].type()->kind(), TypeKind::TensorType); - Stack stack; - push(stack, 2.0f, at::ones(5)); - op->getOperation()(&stack); - at::Tensor output; - pop(stack, output); + Stack stack; + push(stack, 2.0f, at::ones(5)); + op->getOperation()(&stack); + at::Tensor output; + pop(stack, output); - ASSERT_TRUE(output.allclose(at::full(5, 3.0f))); - } - { - torch::RegisterOperators reg( - "foo::bar_with_schema(float a, Tensor b) -> Tensor", - [](double a, at::Tensor b) { return a + b; }); + ASSERT_TRUE(output.allclose(at::full(5, 3.0f))); +} - auto& ops = - getAllOperatorsFor(Symbol::fromQualString("foo::bar_with_schema")); - ASSERT_EQ(ops.size(), 1); +TEST(CustomOperatorTest, ExplicitSchema) { + torch::RegisterOperators reg( + "foo::bar_with_schema(float a, Tensor b) -> Tensor", + [](double a, at::Tensor b) { return a + b; }); - auto& op = ops.front(); - ASSERT_EQ(op->schema().name(), "foo::bar_with_schema"); + auto& ops = + getAllOperatorsFor(Symbol::fromQualString("foo::bar_with_schema")); + ASSERT_EQ(ops.size(), 1); - ASSERT_EQ(op->schema().arguments().size(), 2); - ASSERT_EQ(op->schema().arguments()[0].name(), "a"); - ASSERT_EQ(op->schema().arguments()[0].type()->kind(), TypeKind::FloatType); - ASSERT_EQ(op->schema().arguments()[1].name(), "b"); - 
ASSERT_EQ(op->schema().arguments()[1].type()->kind(), TypeKind::TensorType); + auto& op = ops.front(); + ASSERT_EQ(op->schema().name(), "foo::bar_with_schema"); - ASSERT_EQ(op->schema().returns().size(), 1); - ASSERT_EQ(op->schema().returns()[0].type()->kind(), TypeKind::TensorType); + ASSERT_EQ(op->schema().arguments().size(), 2); + ASSERT_EQ(op->schema().arguments()[0].name(), "a"); + ASSERT_EQ(op->schema().arguments()[0].type()->kind(), TypeKind::FloatType); + ASSERT_EQ(op->schema().arguments()[1].name(), "b"); + ASSERT_EQ(op->schema().arguments()[1].type()->kind(), TypeKind::TensorType); - Stack stack; - push(stack, 2.0f, at::ones(5)); - op->getOperation()(&stack); - at::Tensor output; - pop(stack, output); + ASSERT_EQ(op->schema().returns().size(), 1); + ASSERT_EQ(op->schema().returns()[0].type()->kind(), TypeKind::TensorType); - ASSERT_TRUE(output.allclose(at::full(5, 3.0f))); - } - { - // Check that lists work well. - torch::RegisterOperators reg( - "foo::lists(int[] ints, float[] floats, Tensor[] tensors) -> float[]", - [](torch::List ints, - torch::List floats, - torch::List tensors) { return floats; }); - - auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::lists")); - ASSERT_EQ(ops.size(), 1); - - auto& op = ops.front(); - ASSERT_EQ(op->schema().name(), "foo::lists"); - - ASSERT_EQ(op->schema().arguments().size(), 3); - ASSERT_EQ(op->schema().arguments()[0].name(), "ints"); - ASSERT_TRUE( - op->schema().arguments()[0].type()->isSubtypeOf(ListType::ofInts())); - ASSERT_EQ(op->schema().arguments()[1].name(), "floats"); - ASSERT_TRUE( - op->schema().arguments()[1].type()->isSubtypeOf(ListType::ofFloats())); - ASSERT_EQ(op->schema().arguments()[2].name(), "tensors"); - ASSERT_TRUE( - op->schema().arguments()[2].type()->isSubtypeOf(ListType::ofTensors())); - - ASSERT_EQ(op->schema().returns().size(), 1); - ASSERT_TRUE( - op->schema().returns()[0].type()->isSubtypeOf(ListType::ofFloats())); - - Stack stack; - push(stack, c10::List({1, 2})); - push(stack, c10::List({1.0, 2.0})); - push(stack, c10::List({at::ones(5)})); - op->getOperation()(&stack); - c10::List output; - pop(stack, output); - - ASSERT_EQ(output.size(), 2); - ASSERT_EQ(output.get(0), 1.0); - ASSERT_EQ(output.get(1), 2.0); - } - { - torch::RegisterOperators reg( - "foo::lists2(Tensor[] tensors) -> Tensor[]", - [](torch::List tensors) { return tensors; }); + Stack stack; + push(stack, 2.0f, at::ones(5)); + op->getOperation()(&stack); + at::Tensor output; + pop(stack, output); + + ASSERT_TRUE(output.allclose(at::full(5, 3.0f))); +} - auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::lists2")); - ASSERT_EQ(ops.size(), 1); +TEST(CustomOperatorTest, ListParameters) { + // Check that lists work well. 
+ torch::RegisterOperators reg( + "foo::lists(int[] ints, float[] floats, Tensor[] tensors) -> float[]", + [](torch::List ints, + torch::List floats, + torch::List tensors) { return floats; }); + + auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::lists")); + ASSERT_EQ(ops.size(), 1); + + auto& op = ops.front(); + ASSERT_EQ(op->schema().name(), "foo::lists"); + + ASSERT_EQ(op->schema().arguments().size(), 3); + ASSERT_EQ(op->schema().arguments()[0].name(), "ints"); + ASSERT_TRUE( + op->schema().arguments()[0].type()->isSubtypeOf(ListType::ofInts())); + ASSERT_EQ(op->schema().arguments()[1].name(), "floats"); + ASSERT_TRUE( + op->schema().arguments()[1].type()->isSubtypeOf(ListType::ofFloats())); + ASSERT_EQ(op->schema().arguments()[2].name(), "tensors"); + ASSERT_TRUE( + op->schema().arguments()[2].type()->isSubtypeOf(ListType::ofTensors())); + + ASSERT_EQ(op->schema().returns().size(), 1); + ASSERT_TRUE( + op->schema().returns()[0].type()->isSubtypeOf(ListType::ofFloats())); + + Stack stack; + push(stack, c10::List({1, 2})); + push(stack, c10::List({1.0, 2.0})); + push(stack, c10::List({at::ones(5)})); + op->getOperation()(&stack); + c10::List output; + pop(stack, output); + + ASSERT_EQ(output.size(), 2); + ASSERT_EQ(output.get(0), 1.0); + ASSERT_EQ(output.get(1), 2.0); +} - auto& op = ops.front(); - ASSERT_EQ(op->schema().name(), "foo::lists2"); +TEST(CustomOperatorTest, ListParameters2) { + torch::RegisterOperators reg( + "foo::lists2(Tensor[] tensors) -> Tensor[]", + [](torch::List tensors) { return tensors; }); - ASSERT_EQ(op->schema().arguments().size(), 1); - ASSERT_EQ(op->schema().arguments()[0].name(), "tensors"); - ASSERT_TRUE( - op->schema().arguments()[0].type()->isSubtypeOf(ListType::ofTensors())); + auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::lists2")); + ASSERT_EQ(ops.size(), 1); - ASSERT_EQ(op->schema().returns().size(), 1); - ASSERT_TRUE( - op->schema().returns()[0].type()->isSubtypeOf(ListType::ofTensors())); + auto& op = ops.front(); + ASSERT_EQ(op->schema().name(), "foo::lists2"); - Stack stack; - push(stack, c10::List({at::ones(5)})); - op->getOperation()(&stack); - c10::List output; - pop(stack, output); + ASSERT_EQ(op->schema().arguments().size(), 1); + ASSERT_EQ(op->schema().arguments()[0].name(), "tensors"); + ASSERT_TRUE( + op->schema().arguments()[0].type()->isSubtypeOf(ListType::ofTensors())); - ASSERT_EQ(output.size(), 1); - ASSERT_TRUE(output.get(0).allclose(at::ones(5))); - } + ASSERT_EQ(op->schema().returns().size(), 1); + ASSERT_TRUE( + op->schema().returns()[0].type()->isSubtypeOf(ListType::ofTensors())); + + Stack stack; + push(stack, c10::List({at::ones(5)})); + op->getOperation()(&stack); + c10::List output; + pop(stack, output); + + ASSERT_EQ(output.size(), 1); + ASSERT_TRUE(output.get(0).allclose(at::ones(5))); } -void testCustomOperatorAliasing() { +TEST(CustomOperatorTest, Aliasing) { torch::RegisterOperators reg( "foo::aliasing", [](at::Tensor a, at::Tensor b) -> at::Tensor { a.add_(b); @@ -182,77 +184,65 @@ graph(%x: Tensor, %y: Tensor): } } -void testIValueKWargs() { - const auto text = R"( - def foo(a : int, b : int, c : int = 4): - return a + 2*b + 3*c - )"; - auto cu = compile(text); - auto result = cu->get_function("foo")({1}, {{"b", 3}}); - ASSERT_EQ(result.toInt(), 19); -} - -void testTemplatedOperatorCreator() { - constexpr char op_list[] = "foofoo::bar.template;foo::another"; +static constexpr char op_list[] = "foofoo::bar.template;foo::another"; #define TORCH_SELECTIVE_NAME_IN_SCHEMA(l, n) \ 
torch::detail::SelectiveStr(n) - { - // Try to register an op name that does not exist in op_list. - // Expected: the op name is not registered. - torch::jit::RegisterOperators reg({OperatorGenerator( - TORCH_SELECTIVE_NAME_IN_SCHEMA( - op_list, "foofoo::not_exist(float a, Tensor b) -> Tensor"), - [](Stack* stack) { - double a; - at::Tensor b; - pop(stack, a, b); - push(stack, a + b); - }, - aliasAnalysisFromSchema())}); - - auto& ops = getAllOperatorsFor(Symbol::fromQualString("foofoo::not_exist")); - ASSERT_EQ(ops.size(), 0); - } +TEST(TestCustomOperator, OperatorGeneratorUndeclared) { + // Try to register an op name that does not exist in op_list. + // Expected: the op name is not registered. + torch::jit::RegisterOperators reg({OperatorGenerator( + TORCH_SELECTIVE_NAME_IN_SCHEMA( + op_list, "foofoo::not_exist(float a, Tensor b) -> Tensor"), + [](Stack* stack) { + double a; + at::Tensor b; + pop(stack, a, b); + push(stack, a + b); + }, + aliasAnalysisFromSchema())}); + + auto& ops = getAllOperatorsFor(Symbol::fromQualString("foofoo::not_exist")); + ASSERT_EQ(ops.size(), 0); +} - { - // The operator should be successfully registered since its name is in the - // whitelist. - torch::jit::RegisterOperators reg({OperatorGenerator( - TORCH_SELECTIVE_NAME_IN_SCHEMA( - op_list, "foofoo::bar.template(float a, Tensor b) -> Tensor"), - [](Stack* stack) { - double a; - at::Tensor b; - pop(stack, a, b); - push(stack, a + b); - }, - aliasAnalysisFromSchema())}); - - auto& ops = getAllOperatorsFor(Symbol::fromQualString("foofoo::bar")); - ASSERT_EQ(ops.size(), 1); - - auto& op = ops.front(); - ASSERT_EQ(op->schema().name(), "foofoo::bar"); - - ASSERT_EQ(op->schema().arguments().size(), 2); - ASSERT_EQ(op->schema().arguments()[0].name(), "a"); - ASSERT_EQ(op->schema().arguments()[0].type()->kind(), TypeKind::FloatType); - ASSERT_EQ(op->schema().arguments()[1].name(), "b"); - ASSERT_EQ(op->schema().arguments()[1].type()->kind(), TypeKind::TensorType); - - ASSERT_EQ(op->schema().returns().size(), 1); - ASSERT_EQ(op->schema().returns()[0].type()->kind(), TypeKind::TensorType); - - Stack stack; - push(stack, 2.0f, at::ones(5)); - op->getOperation()(&stack); - at::Tensor output; - pop(stack, output); - - ASSERT_TRUE(output.allclose(at::full(5, 3.0f))); - } +TEST(TestCustomOperator, OperatorGeneratorBasic) { + // The operator should be successfully registered since its name is in the + // whitelist. 
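Editor's sketch, not part of this diff: conceptually, the gating these two tests exercise boils down to a compile-time check of whether the operator name occurs in the semicolon-separated op_list. The naive check below is a hedged stand-in for illustration only, not PyTorch's actual torch::detail::SelectiveStr machinery; it assumes op_list from this file is in scope.

// Naive compile-time substring check (C++14), illustrative only.
constexpr bool name_in_list(const char* list, const char* name) {
  for (const char* p = list; *p != '\0'; ++p) {
    const char* a = p;
    const char* b = name;
    while (*a != '\0' && *b != '\0' && *a == *b) {
      ++a;
      ++b;
    }
    if (*b == '\0') {
      return true; // every character of `name` matched at this offset
    }
  }
  return false;
}
static_assert(name_in_list(op_list, "foofoo::bar"), "in the allow-list: registered");
static_assert(!name_in_list(op_list, "foofoo::not_exist"), "not listed: skipped");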
+ torch::jit::RegisterOperators reg({OperatorGenerator( + TORCH_SELECTIVE_NAME_IN_SCHEMA( + op_list, "foofoo::bar.template(float a, Tensor b) -> Tensor"), + [](Stack* stack) { + double a; + at::Tensor b; + pop(stack, a, b); + push(stack, a + b); + }, + aliasAnalysisFromSchema())}); + + auto& ops = getAllOperatorsFor(Symbol::fromQualString("foofoo::bar")); + ASSERT_EQ(ops.size(), 1); + + auto& op = ops.front(); + ASSERT_EQ(op->schema().name(), "foofoo::bar"); + + ASSERT_EQ(op->schema().arguments().size(), 2); + ASSERT_EQ(op->schema().arguments()[0].name(), "a"); + ASSERT_EQ(op->schema().arguments()[0].type()->kind(), TypeKind::FloatType); + ASSERT_EQ(op->schema().arguments()[1].name(), "b"); + ASSERT_EQ(op->schema().arguments()[1].type()->kind(), TypeKind::TensorType); + + ASSERT_EQ(op->schema().returns().size(), 1); + ASSERT_EQ(op->schema().returns()[0].type()->kind(), TypeKind::TensorType); + + Stack stack; + push(stack, 2.0f, at::ones(5)); + op->getOperation()(&stack); + at::Tensor output; + pop(stack, output); + + ASSERT_TRUE(output.allclose(at::full(5, 3.0f))); } } // namespace jit diff --git a/test/cpp/jit/test_dce.cpp b/test/cpp/jit/test_dce.cpp index 5799913c316a..6f9161d0d9ae 100644 --- a/test/cpp/jit/test_dce.cpp +++ b/test/cpp/jit/test_dce.cpp @@ -1,12 +1,12 @@ -#include -#include +#include +#include #include #include namespace torch { namespace jit { -void testDCE() { +TEST(EliminateDeadCodeTest, Basic) { auto graph = std::make_shared(); // Consider the following loop: diff --git a/test/cpp/jit/test_fuser.cpp b/test/cpp/jit/test_fuser.cpp index ee0ea060f02f..ef595215b882 100644 --- a/test/cpp/jit/test_fuser.cpp +++ b/test/cpp/jit/test_fuser.cpp @@ -1,4 +1,4 @@ -#include "test/cpp/jit/test_base.h" +#include #include #include "ATen/core/interned_strings.h" @@ -56,28 +56,27 @@ namespace torch { namespace jit { -void testFusion() { - auto testSimple = [&] { - const auto graph_string = R"IR( +TEST(FuserTest, TestSimple_CUDA) { + const auto graph_string = R"IR( graph(%0 : Tensor, %1 : Tensor): %2 : Tensor = aten::mul(%0, %1) return (%2))IR"; - Graph graph; - torch::jit::parseIR(graph_string, &graph); - - auto a = at::rand({3, 4}, at::kCUDA); - auto b = at::rand({4, 3}, at::kCUDA).transpose(0, 1); - auto o = at::zeros({3, 4}, at::kCUDA); - auto outputs = debugLaunchGraph(graph, {a, b}); - ASSERT_EQ(outputs.size(), 1); - auto o2 = a * b; - float max_diff = (o2 - outputs[0]).abs().max().item(); - // std::cout << "max diff: " << max_diff << "\n"; - ASSERT_EQ(max_diff, 0); - }; - testSimple(); + Graph graph; + torch::jit::parseIR(graph_string, &graph); + + auto a = at::rand({3, 4}, at::kCUDA); + auto b = at::rand({4, 3}, at::kCUDA).transpose(0, 1); + auto o = at::zeros({3, 4}, at::kCUDA); + auto outputs = debugLaunchGraph(graph, {a, b}); + ASSERT_EQ(outputs.size(), 1); + auto o2 = a * b; + float max_diff = (o2 - outputs[0]).abs().max().item(); + // std::cout << "max diff: " << max_diff << "\n"; + ASSERT_EQ(max_diff, 0); +} +TEST(FuserTest, TestOne_CUDA) { auto testOne = [&](int ti, int tj) { const auto graph_string = R"IR( graph(%0 : Tensor, @@ -132,7 +131,9 @@ void testFusion() { testOne(0, 1); testOne(1, 2); testOne(0, 2); +} +TEST(FuserTest, FusedConcat_CUDA) { const auto graph_string0 = R"IR( graph(%0 : Tensor, %1 : Tensor): @@ -175,7 +176,7 @@ void testFusion() { }; } -void testFusionAliasing() { +TEST(FuserTest, FusionAliasing) { const auto graph_string = R"IR( graph(%0 : Tensor, %1 : Tensor): @@ -200,7 +201,7 @@ void testFusionAliasing() { ->run(*g); } -void 
testRegisterFusionCachesKernel() { +TEST(FuserTest, KernelCaching) { // Constructs two functionally equivalent graphs const auto graph0_string = R"IR( graph(%0 : Float(2, 3, 4), diff --git a/test/cpp/jit/test_gpu.cpp b/test/cpp/jit/test_gpu.cpp index 80fa318d653a..38008d417256 100644 --- a/test/cpp/jit/test_gpu.cpp +++ b/test/cpp/jit/test_gpu.cpp @@ -1,8 +1,8 @@ #if defined(USE_CUDA) - -#include +#include #include +#include #include #include #include @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -73,11 +74,11 @@ TensorView* makeTensorWithContig( } void checkIntValue( - const EvaluationContext* eval_context, + StatefulExpressionEvaluator& evaluator, Val* val, Int::ScalarType expected_value) { TORCH_CHECK(val->isAnInt()); - const auto actual_value = ExpressionEvaluator::evaluate(val, eval_context); + const auto actual_value = evaluator.inferValue(val); TORCH_CHECK(actual_value.has_value()); TORCH_CHECK(actual_value.value() == expected_value); } @@ -91,7 +92,7 @@ void checkIntValue( // (These tests exercise IrGraphGenerator through a non-trivial IR, // to make sure that it runs w/o crashing. The actual output is not // validated) -void testGPU_IrGraphGenerator() { +TEST(NVFuserTest, IrGraphGenerator_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -143,7 +144,7 @@ void testGPU_IrGraphGenerator() { .empty()); } -void testGPU_FusionDispatch() { +TEST(NVFuserTest, FusionDispatch_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -158,28 +159,28 @@ void testGPU_FusionDispatch() { } // Evaluate basic scalar operations with constant values -void testGPU_FusionExprEvalConstants() { +TEST(NVFuserTest, FusionExprEvalConstants_CUDA) { Fusion fusion; FusionGuard fg(&fusion); - EvaluationContext eval_context(&fusion); + StatefulExpressionEvaluator evaluator(&fusion); auto* a = new Int(7); auto* b = new Int(3); - checkIntValue(&eval_context, neg(a), -7); - checkIntValue(&eval_context, add(a, b), 10); - checkIntValue(&eval_context, neg(mul(sub(a, b), div(a, b))), -8); - checkIntValue(&eval_context, mod(a, b), 1); - checkIntValue(&eval_context, ceilDiv(a, b), 3); + checkIntValue(evaluator, neg(a), -7); + checkIntValue(evaluator, add(a, b), 10); + checkIntValue(evaluator, neg(mul(sub(a, b), div(a, b))), -8); + checkIntValue(evaluator, mod(a, b), 1); + checkIntValue(evaluator, ceilDiv(a, b), 3); } // Evaluate basic scalar operations with bound values -void testGPU_FusionExprEvalBindings() { +TEST(NVFuserTest, FusionExprEvalBindings_CUDA) { Fusion fusion; FusionGuard fg(&fusion); - EvaluationContext eval_context(&fusion); + StatefulExpressionEvaluator evaluator(&fusion); auto* a = new Int(); auto* b = new Int(); @@ -188,39 +189,39 @@ void testGPU_FusionExprEvalBindings() { auto* e = new Int(0); // trying to evaluate before binding should give empty results - TORCH_CHECK(!ExpressionEvaluator::evaluate(a, &eval_context).has_value()); - TORCH_CHECK(!ExpressionEvaluator::evaluate(d, &eval_context).has_value()); + TORCH_CHECK(!evaluator.inferValue(a).has_value()); + TORCH_CHECK(!evaluator.inferValue(d).has_value()); - eval_context.bind(a, 7); - eval_context.bind(b, 3); + evaluator.safeBind(a, 7); + evaluator.safeBind(b, 3); // can't bind to the results of expressions - ASSERT_ANY_THROW(eval_context.bind(c, 100)); + ASSERT_ANY_THROW(evaluator.safeBind(c, 100)); // can't bind to concrete values - ASSERT_ANY_THROW(eval_context.bind(e, 100)); + ASSERT_ANY_THROW(evaluator.safeBind(e, 100)); - checkIntValue(&eval_context, c, 10); - checkIntValue(&eval_context, sub(a, b), 4); - 
checkIntValue(&eval_context, mod(a, b), 1); - checkIntValue(&eval_context, ceilDiv(a, b), 3); - checkIntValue(&eval_context, d, -4); + checkIntValue(evaluator, c, 10); + checkIntValue(evaluator, sub(a, b), 4); + checkIntValue(evaluator, mod(a, b), 1); + checkIntValue(evaluator, ceilDiv(a, b), 3); + checkIntValue(evaluator, d, -4); // Reset evaluation context - eval_context = EvaluationContext(&fusion); + evaluator = StatefulExpressionEvaluator(&fusion); - eval_context.bind(a, 2); - eval_context.bind(b, 5); + evaluator.safeBind(a, 2); + evaluator.safeBind(b, 5); - checkIntValue(&eval_context, c, 7); - checkIntValue(&eval_context, sub(a, b), -3); - checkIntValue(&eval_context, mod(a, b), 2); - checkIntValue(&eval_context, ceilDiv(a, b), 1); - checkIntValue(&eval_context, d, -2); + checkIntValue(evaluator, c, 7); + checkIntValue(evaluator, sub(a, b), -3); + checkIntValue(evaluator, mod(a, b), 2); + checkIntValue(evaluator, ceilDiv(a, b), 1); + checkIntValue(evaluator, d, -2); } // Evaluate expressions in a simple IR -void testGPU_FusionExprEvalBasic() { +TEST(NVFuserTest, FusionExprEvalBasic_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -247,8 +248,8 @@ void testGPU_FusionExprEvalBasic() { tv2->axis(-1)->parallelize(ParallelType::TIDx); tv3->axis(-1)->parallelize(ParallelType::TIDx); - // 1. Create an evaluation context - EvaluationContext eval_context(&fusion); + // 1. Create an evaluator + StatefulExpressionEvaluator evaluator(&fusion); // 2. Bind values // @@ -258,25 +259,25 @@ void testGPU_FusionExprEvalBasic() { // (ex. `tv0->getRootDomain()[0]->extent()` // instead of `tv0->axis(0)->extent()`) // - eval_context.bind(tv0->getRootDomain()[0]->extent(), 6); - eval_context.bind(tv0->getRootDomain()[1]->extent(), 128); - eval_context.bind(tv1->getRootDomain()[0]->extent(), 6); - eval_context.bind(tv1->getRootDomain()[1]->extent(), 128); + evaluator.safeBind(tv0->getRootDomain()[0]->extent(), 6); + evaluator.safeBind(tv0->getRootDomain()[1]->extent(), 128); + evaluator.safeBind(tv1->getRootDomain()[0]->extent(), 6); + evaluator.safeBind(tv1->getRootDomain()[1]->extent(), 128); // 3. Evaluate and check result values TORCH_CHECK(tv2->domain()->nDims() == 3); - checkIntValue(&eval_context, tv2->axis(0)->rawExtent(), 2); - checkIntValue(&eval_context, tv2->axis(1)->rawExtent(), 4); - checkIntValue(&eval_context, tv2->axis(2)->rawExtent(), 128); + checkIntValue(evaluator, tv2->axis(0)->rawExtent(), 2); + checkIntValue(evaluator, tv2->axis(1)->rawExtent(), 4); + checkIntValue(evaluator, tv2->axis(2)->rawExtent(), 128); TORCH_CHECK(tv3->domain()->nDims() == 3); - checkIntValue(&eval_context, tv3->axis(0)->rawExtent(), 2); - checkIntValue(&eval_context, tv3->axis(1)->rawExtent(), 4); - checkIntValue(&eval_context, tv3->axis(2)->rawExtent(), 128); + checkIntValue(evaluator, tv3->axis(0)->rawExtent(), 2); + checkIntValue(evaluator, tv3->axis(1)->rawExtent(), 4); + checkIntValue(evaluator, tv3->axis(2)->rawExtent(), 128); } // Evaluate expressions in a more complex IR -void testGPU_FusionExprEvalComplex() { +TEST(NVFuserTest, FusionExprEvalComplex_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -298,37 +299,37 @@ void testGPU_FusionExprEvalComplex() { tv6->split(0, 5); tv5->merge(0); - // 1. Create an evaluation context - EvaluationContext eval_context(&fusion); + // 1. Create an evaluator + StatefulExpressionEvaluator evaluator(&fusion); // 2. 
Bind values - eval_context.bind(tv0->getRootDomain()[0]->extent(), 129); - eval_context.bind(tv0->getRootDomain()[1]->extent(), 127); + evaluator.safeBind(tv0->getRootDomain()[0]->extent(), 129); + evaluator.safeBind(tv0->getRootDomain()[1]->extent(), 127); // Evaluate and check extent values TORCH_CHECK(tv0->domain()->nDims() == 2); - checkIntValue(&eval_context, tv0->axis(0)->rawExtent(), 129); - checkIntValue(&eval_context, tv0->axis(1)->rawExtent(), 127); + checkIntValue(evaluator, tv0->axis(0)->rawExtent(), 129); + checkIntValue(evaluator, tv0->axis(1)->rawExtent(), 127); TORCH_CHECK(tv3->domain()->nDims() == 2); - checkIntValue(&eval_context, tv3->axis(0)->rawExtent(), 129); - checkIntValue(&eval_context, tv3->axis(1)->rawExtent(), 127); + checkIntValue(evaluator, tv3->axis(0)->rawExtent(), 129); + checkIntValue(evaluator, tv3->axis(1)->rawExtent(), 127); TORCH_CHECK(tv4->domain()->nDims() == 2); - checkIntValue(&eval_context, tv4->axis(0)->rawExtent(), 129); - checkIntValue(&eval_context, tv4->axis(1)->rawExtent(), 127); + checkIntValue(evaluator, tv4->axis(0)->rawExtent(), 129); + checkIntValue(evaluator, tv4->axis(1)->rawExtent(), 127); TORCH_CHECK(tv5->domain()->nDims() == 1); - checkIntValue(&eval_context, tv5->axis(0)->rawExtent(), 16383); + checkIntValue(evaluator, tv5->axis(0)->rawExtent(), 16383); TORCH_CHECK(tv6->domain()->nDims() == 3); - checkIntValue(&eval_context, tv6->axis(0)->rawExtent(), 26); - checkIntValue(&eval_context, tv6->axis(1)->rawExtent(), 5); - checkIntValue(&eval_context, tv6->axis(2)->rawExtent(), 127); + checkIntValue(evaluator, tv6->axis(0)->rawExtent(), 26); + checkIntValue(evaluator, tv6->axis(1)->rawExtent(), 5); + checkIntValue(evaluator, tv6->axis(2)->rawExtent(), 127); } // Evaluate expressions post lowering -void testGPU_FusionExprEvalPostLower() { +TEST(NVFuserTest, FusionExprEvalPostLower_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -360,34 +361,32 @@ void testGPU_FusionExprEvalPostLower() { // Lower GpuLower gpulw(&fusion); - std::stringstream kernel; - gpulw.printKernel(kernel); // 1. Create an evaluation context - EvaluationContext eval_context(&fusion); + StatefulExpressionEvaluator evaluator(&fusion); // 2. Bind values - eval_context.bind(tv0->getRootDomain()[0]->extent(), 6); - eval_context.bind(tv0->getRootDomain()[1]->extent(), 128); - eval_context.bind(tv1->getRootDomain()[0]->extent(), 6); - eval_context.bind(tv1->getRootDomain()[1]->extent(), 128); + evaluator.safeBind(tv0->getRootDomain()[0]->extent(), 6); + evaluator.safeBind(tv0->getRootDomain()[1]->extent(), 128); + evaluator.safeBind(tv1->getRootDomain()[0]->extent(), 6); + evaluator.safeBind(tv1->getRootDomain()[1]->extent(), 128); // 3. 
Evaluate and check result values TORCH_CHECK(tv2->domain()->nDims() == 3); - checkIntValue(&eval_context, tv2->axis(0)->rawExtent(), 2); - checkIntValue(&eval_context, tv2->axis(1)->rawExtent(), 4); - checkIntValue(&eval_context, tv2->axis(2)->rawExtent(), 128); + checkIntValue(evaluator, tv2->axis(0)->rawExtent(), 2); + checkIntValue(evaluator, tv2->axis(1)->rawExtent(), 4); + checkIntValue(evaluator, tv2->axis(2)->rawExtent(), 128); TORCH_CHECK(tv3->domain()->nDims() == 3); - checkIntValue(&eval_context, tv3->axis(0)->rawExtent(), 2); - checkIntValue(&eval_context, tv3->axis(1)->rawExtent(), 4); - checkIntValue(&eval_context, tv3->axis(2)->rawExtent(), 128); + checkIntValue(evaluator, tv3->axis(0)->rawExtent(), 2); + checkIntValue(evaluator, tv3->axis(1)->rawExtent(), 4); + checkIntValue(evaluator, tv3->axis(2)->rawExtent(), 128); - checkIntValue(&eval_context, bid_x, 2); - checkIntValue(&eval_context, tid_x, 128); + checkIntValue(evaluator, bid_x, 2); + checkIntValue(evaluator, tid_x, 128); } -void testGPU_FusionClear() { +TEST(NVFuserTest, FusionClear_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -467,7 +466,7 @@ void testGPU_FusionClear() { TORCH_CHECK(output_ref.equal(outputs[0])); } -void testGPU_FusionCopy() { +TEST(NVFuserTest, FusionCopy_CUDA) { Fusion original_fusion; // Create the test IR @@ -505,10 +504,12 @@ void testGPU_FusionCopy() { ASSERT_EQ(original_ir.str(), clone_ir.str()); // Lower original fusion - std::stringstream original_kernel; + std::string original_kernel; { - GpuLower lower(&original_fusion); - lower.printKernel(original_kernel); + // TODO(kir): remove this guard once we implement the cuda codegen visitor + FusionGuard fg(&original_fusion); + original_kernel = + codegen::generateCudaKernel(GpuLower(&original_fusion).kernel()); } // Make sure the "before lowering" clone was not mutated @@ -529,15 +530,17 @@ void testGPU_FusionCopy() { ASSERT_EQ(original_lowered_ir.str(), clone_lowered_ir.str()); // Lower the "before lowering" and compare kernels - std::stringstream clone_kernel; + std::string clone_kernel; { - GpuLower lower(&before_lowering); - lower.printKernel(clone_kernel); + // TODO(kir): remove this guard once we implement the cuda codegen visitor + FusionGuard fg(&before_lowering); + clone_kernel = + codegen::generateCudaKernel(GpuLower(&before_lowering).kernel()); } - ASSERT_EQ(original_kernel.str(), clone_kernel.str()); + ASSERT_EQ(original_kernel, clone_kernel); } -void testGPU_FusionMove() { +TEST(NVFuserTest, FusionMove_CUDA) { Fusion fusion; // Create the test IR @@ -593,9 +596,7 @@ void testGPU_FusionMove() { ASSERT_EQ(original_ir.str(), another_ir.str()); // Lower the fusion IR - std::stringstream kernel; GpuLower lower(&another_fusion); - lower.printKernel(kernel); std::stringstream lowered_ir; lowered_ir << another_fusion; @@ -609,7 +610,7 @@ void testGPU_FusionMove() { ASSERT_EQ(lowered_ir.str(), moved_lowered_ir.str()); } -void testGPU_FusionSimpleArith() { +TEST(NVFuserTest, FusionSimpleArith_CUDA) { std::stringstream ss1, ss2; Fusion fusion; @@ -638,7 +639,7 @@ void testGPU_FusionSimpleArith() { "Error where explicit add nodes don't match implicit add nodes."); } -void testGPU_FusionSimpleTypePromote() { +TEST(NVFuserTest, FusionSimpleTypePromote_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -661,7 +662,7 @@ class ZeroMutator : public OptOutMutator { } }; -void testGPU_FusionMutator() { +TEST(NVFuserTest, FusionMutator_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -679,7 +680,7 @@ void testGPU_FusionMutator() { 
TORCH_CHECK(flhs->value().value() == 0.f); } -void testGPU_FusionRegister() { +TEST(NVFuserTest, FusionRegister_CUDA) { Fusion fusion; FusionGuard fg(&fusion); Float* v1 = new Float{1.f}; @@ -710,7 +711,7 @@ struct DummyExpr : public Expr { DummyExpr& operator=(DummyExpr&& other) = delete; }; -void testGPU_FusionTopoSort() { +TEST(NVFuserTest, FusionTopoSort_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -777,7 +778,7 @@ void testGPU_FusionTopoSort() { TORCH_CHECK(fusion.origin(v6)->name() == 3); } -void testGPU_FusionTensor() { +TEST(NVFuserTest, FusionTensor_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); Fusion fusion; @@ -799,48 +800,6 @@ void testGPU_FusionTensor() { } } - { - auto tensor = at::randn({2, 1, 4}, options); - auto tensor_type = TensorType::create(tensor); - auto fuser_tensor = new TensorView(tensor_type); - TORCH_CHECK((int64_t)fuser_tensor->nDims() == tensor.dim()); - TORCH_CHECK(fuser_tensor->getDataType().value() == DataType::Float); - TORCH_CHECK(fuser_tensor->domain() != nullptr); - for (int i = 0; i < static_cast(fuser_tensor->nDims()); i++) { - // size 1 dimension are makred as broadcast - TORCH_CHECK( - fuser_tensor->axis(i)->isBroadcast() == (tensor.sizes()[i] == 1)); - } - TORCH_CHECK(fuser_tensor->domain()->contiguity()[2]); - - // temporary WAR to disable contig & bcast; issue # 230 - // TODO: insert the check where broadcast & contiguous cannot be marked - // together - TORCH_CHECK(!fuser_tensor->domain()->contiguity()[0]); - TORCH_CHECK(!fuser_tensor->domain()->contiguity()[1]); - } - - { - auto tensor = at::randn({2, 3, 1}, options); - auto tensor_type = TensorType::create(tensor); - auto fuser_tensor = new TensorView(tensor_type); - TORCH_CHECK((int64_t)fuser_tensor->nDims() == tensor.dim()); - TORCH_CHECK(fuser_tensor->getDataType().value() == DataType::Float); - TORCH_CHECK(fuser_tensor->domain() != nullptr); - for (int i = 0; i < static_cast(fuser_tensor->nDims()); i++) { - // size 1 dimension are makred as broadcast - TORCH_CHECK( - fuser_tensor->axis(i)->isBroadcast() == (tensor.sizes()[i] == 1)); - } - TORCH_CHECK(fuser_tensor->domain()->contiguity()[0]); - - // temporary WAR to disable contig & bcast; issue # 230 - // TODO: insert the check where broadcast & contiguous cannot be marked - // together - TORCH_CHECK(!fuser_tensor->domain()->contiguity()[1]); - TORCH_CHECK(!fuser_tensor->domain()->contiguity()[2]); - } - // TensorType::create fills stride_properties, which helps us to mark // IterDomain properly // Note: implementation could change, depending on how much we want to invest @@ -883,7 +842,7 @@ void testGPU_FusionTensor() { } } -void testGPU_FusionFilterVals() { +TEST(NVFuserTest, FusionFilterVals_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -921,7 +880,7 @@ void testGPU_FusionFilterVals() { "Not expecting any results"); } -void testGPU_FusionTVSplit() { +TEST(NVFuserTest, FusionTVSplit_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -947,7 +906,7 @@ void testGPU_FusionTVSplit() { static_cast(inner->extent())->value().value() == 2); } -void testGPU_FusionTVMerge() { +TEST(NVFuserTest, FusionTVMerge_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -965,7 +924,7 @@ void testGPU_FusionTVMerge() { tv->getRootDomain()[2]->extent()); } -void testGPU_FusionTVReorder() { +TEST(NVFuserTest, FusionTVReorder_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -1012,7 +971,7 @@ void testGPU_FusionTVReorder() { TORCH_CHECK(ref[1]->sameAs(tv->axis(1))); } -void testGPU_FusionEquality() { 
+TEST(NVFuserTest, FusionEquality_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -1053,7 +1012,7 @@ void testGPU_FusionEquality() { TORCH_CHECK(!neg1->sameAs(neg2)); } -void testGPU_FusionDependency() { +TEST(NVFuserTest, FusionDependency_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -1123,7 +1082,7 @@ void testGPU_FusionDependency() { TORCH_CHECK(dep_chain.empty()); } -void testGPU_FusionParser() { +TEST(NVFuserTest, FusionParser_CUDA) { auto g = std::make_shared(); const auto graph0_string = R"IR( graph(%0 : Float(2:1), @@ -1156,43 +1115,36 @@ void testGPU_FusionParser() { // 1. this can be moved to a dedicated "golden" file // 2. use a fuzzy compare (ignore non-significant whitespaces for example) const std::string expected_kernel = R"( -__global__ void CUDAGeneratedKernel(Tensor T0, Tensor T1, Tensor T3){ - float T2[4]; - if ( ( ( ( ( ( blockIdx.x * 4 ) + ( 4 - 1 ) ) * 128 ) + threadIdx.x ) < T0.size[0] ) ) { - for(size_t i6 = 0; i6 < 4; ++i6 ) { - T2[ i6 ] - = T0[ ( ( ( ( blockIdx.x * 4 ) + i6 ) * 128 ) + threadIdx.x ) ] - * T1[ ( ( ( ( blockIdx.x * 4 ) + i6 ) * 128 ) + threadIdx.x ) ]; +__global__ void CUDAGeneratedKernel(Tensor T0, Tensor T1, Tensor T3) { + float T2[1]; + if ((((((blockIdx.x * 1) + (1 - 1)) * 128) + threadIdx.x) < T0.size[0])) { + for(size_t i6 = 0; i6 < 1; ++i6) { + T2[i6] + = T0[((((blockIdx.x * 1) + i6) * 128) + threadIdx.x)] + * T1[((((blockIdx.x * 1) + i6) * 128) + threadIdx.x)]; + T3[((((blockIdx.x * 1) + i6) * 128) + threadIdx.x)] + = T2[i6] + * T0[((((blockIdx.x * 1) + i6) * 128) + threadIdx.x)]; } } else { - for(size_t i6 = 0; i6 < 4; ++i6 ) { - if ( ( ( ( ( ( blockIdx.x * 4 ) + i6 ) * 128 ) + threadIdx.x ) < T0.size[0] ) ) { - T2[ i6 ] - = T0[ ( ( ( ( blockIdx.x * 4 ) + i6 ) * 128 ) + threadIdx.x ) ] - * T1[ ( ( ( ( blockIdx.x * 4 ) + i6 ) * 128 ) + threadIdx.x ) ]; + for(size_t i6 = 0; i6 < 1; ++i6) { + if ((((((blockIdx.x * 1) + i6) * 128) + threadIdx.x) < T0.size[0])) { + T2[i6] + = T0[((((blockIdx.x * 1) + i6) * 128) + threadIdx.x)] + * T1[((((blockIdx.x * 1) + i6) * 128) + threadIdx.x)]; } - } - } - if ( ( ( ( ( ( blockIdx.x * 4 ) + ( 4 - 1 ) ) * 128 ) + threadIdx.x ) < T0.size[0] ) ) { - for(size_t i13 = 0; i13 < 4; ++i13 ) { - T3[ ( ( ( ( blockIdx.x * 4 ) + i13 ) * 128 ) + threadIdx.x ) ] - = T2[ i13 ] - * T0[ ( ( ( ( blockIdx.x * 4 ) + i13 ) * 128 ) + threadIdx.x ) ]; - } - } else { - for(size_t i13 = 0; i13 < 4; ++i13 ) { - if ( ( ( ( ( ( blockIdx.x * 4 ) + i13 ) * 128 ) + threadIdx.x ) < T0.size[0] ) ) { - T3[ ( ( ( ( blockIdx.x * 4 ) + i13 ) * 128 ) + threadIdx.x ) ] - = T2[ i13 ] - * T0[ ( ( ( ( blockIdx.x * 4 ) + i13 ) * 128 ) + threadIdx.x ) ]; + if ((((((blockIdx.x * 1) + i6) * 128) + threadIdx.x) < T0.size[0])) { + T3[((((blockIdx.x * 1) + i6) * 128) + threadIdx.x)] + = T2[i6] + * T0[((((blockIdx.x * 1) + i6) * 128) + threadIdx.x)]; } } } } )"; - std::string actual_kernel = GpuLower(fusion.get()).getKernel(); - actual_kernel = "\n" + actual_kernel; + const std::string actual_kernel = + "\n" + codegen::generateCudaKernel(GpuLower(fusion.get()).kernel()); if (expected_kernel.size() != actual_kernel.size() || expected_kernel.compare(actual_kernel) != 0) { std::cerr @@ -1210,7 +1162,7 @@ __global__ void CUDAGeneratedKernel(Tensor T0, Tensor T1, Te TORCH_CHECK(output_ref.equal(outputs[0])); } -void testGPU_FusionForLoop() { +TEST(NVFuserTest, FusionForLoop_CUDA) { // TODO(kir): re-enable this test // due to the current "GpuLower guard" approach, we can only create // kernel IR during GpuLower::lower() @@ -1251,7 +1203,7 @@ void 
testGPU_FusionForLoop() { #endif } -void testGPU_FusionCodeGen() { +TEST(NVFuserTest, FusionCodeGen_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -1288,7 +1240,7 @@ void testGPU_FusionCodeGen() { TORCH_CHECK(output_ref.equal(output)); } -void testGPU_FusionCodeGen2() { +TEST(NVFuserTest, FusionCodeGen2_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -1330,7 +1282,7 @@ void testGPU_FusionCodeGen2() { TORCH_CHECK(output_ref.equal(outputs[0])); } -void testGPU_FusionSimplePWise() { +TEST(NVFuserTest, FusionSimplePWise_CUDA) { Fusion fusion; FusionGuard fg(&fusion); // dimensionality of the problem @@ -1387,7 +1339,7 @@ void testGPU_FusionSimplePWise() { TORCH_CHECK(output_ref.equal(output)); } -void testGPU_FusionExecKernel() { +TEST(NVFuserTest, FusionExecKernel_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -1441,7 +1393,7 @@ int ceilDiv_(int a, int b) { return (a + b - 1) / b; } -void testGPU_FusionAdvancedComputeAt() { +TEST(NVFuserTest, FusionAdvancedComputeAt_CUDA) { // Case 1 // tv1 = tv0 * 0.5 // tv2 = tv1 * -1 @@ -1576,11 +1528,7 @@ void testGPU_FusionAdvancedComputeAt() { fe.compileFusion(&fusion); auto outputs = fe.runFusion({t0}); - GpuLower gpulw(&fusion); - std::stringstream actual_kernel; - gpulw.printKernel(actual_kernel); - - TORCH_CHECK(at::allclose(outputs[0], t5), actual_kernel.str()); + TORCH_CHECK(at::allclose(outputs[0], t5)); TORCH_CHECK(at::allclose(outputs[1], t6)); } @@ -1636,11 +1584,7 @@ void testGPU_FusionAdvancedComputeAt() { fe.compileFusion(&fusion); fe.runFusion({t0, t1}, {kernel_tv3}); - GpuLower gpulw(&fusion); - std::stringstream actual_kernel; - gpulw.printKernel(actual_kernel); - - TORCH_CHECK(at::allclose(kernel_tv3, t3), actual_kernel.str()); + TORCH_CHECK(at::allclose(kernel_tv3, t3)); } // Case 4 @@ -1706,11 +1650,7 @@ void testGPU_FusionAdvancedComputeAt() { fe.compileFusion(&fusion); auto outputs = fe.runFusion({t0, t1, t2, t3}); - GpuLower gpulw(&fusion); - std::stringstream actual_kernel; - gpulw.printKernel(actual_kernel); - - TORCH_CHECK(at::allclose(outputs[0], t6), actual_kernel.str()); + TORCH_CHECK(at::allclose(outputs[0], t6)); } // Case 5 @@ -1752,177 +1692,716 @@ void testGPU_FusionAdvancedComputeAt() { } } -void testGPU_FusionScalarInputs() { +TEST(NVFuserTest, FusionComputeAtMultiConsumers_CUDA) { + // tv1 = tv0 * 0.5 + // tv2 = tv1 * -1 + // tv3 = tv2 * -2 Fusion fusion; FusionGuard fg(&fusion); - TensorView* tv0 = makeDummyTensor(2); + TensorView* tv0 = makeDummyTensor(1); fusion.addInput(tv0); - TensorView* tv1 = makeDummyTensor(2); - fusion.addInput(tv1); - Float* f0 = new Float(); - fusion.addInput(f0); - Float* f1 = new Float(); - fusion.addInput(f1); - Float* f2 = new Float(); - fusion.addInput(f2); - Float* f3 = new Float(); - fusion.addInput(f3); - Val* f4 = mul(f0, f1); - Val* f5 = sub(f2, f3); + TensorView* tv1 = mul(tv0, new Float(0.5)); + TensorView* tv2 = mul(tv1, new Float(-1.0)); + TensorView* tv3 = mul(tv1, new Float(-2.0)); + fusion.addOutput(tv2); + fusion.addOutput(tv3); - TensorView* tv2 = sub(tv1, f4); - TensorView* tv3 = add(tv0, f5); - TensorView* tv4 = mul(tv3, tv2); + // This computeAt will affect tv2 as well, even though tv2 is not in + // the data-flow path between tv1 and tv3. The reason is that tv1 is + // now computed at tv3, so tv2 must also be computed at the same + // location. Overall, what will happen is basically we merge + // expressions of all tensors and compute them in a single loop + // nest. 
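Editor's sketch, not part of this diff: the "single loop nest" the comment above describes, written as plain C++ over flat buffers. Names and the 1-D size are assumptions; the real kernel additionally splits this loop across BIDx/TIDx as scheduled below.

#include <cstddef>

// One pass over the data: tv1 is just a scalar temporary, and tv2/tv3 are
// both produced in the same loop body because tv1 is computed at tv3.
void fused_multi_consumer(const float* t0, float* t2, float* t3, std::size_t n) {
  for (std::size_t i = 0; i < n; ++i) {
    const float t1 = t0[i] * 0.5f; // tv1 = tv0 * 0.5
    t2[i] = t1 * -1.0f;            // tv2 = tv1 * -1
    t3[i] = t1 * -2.0f;            // tv3 = tv1 * -2
  }
}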
+ TensorView* computeAtTarget = tv3; + computeAtTarget->split(0, 128); + tv1->computeAt(computeAtTarget, 1); + + TensorView* affected_tensors[] = {tv1, tv2, tv3}; + for (auto tv : affected_tensors) { + TORCH_CHECK(tv->nDims() == computeAtTarget->nDims()); + } + + // Note that tv2 is also computed at tv3. + TORCH_CHECK(tv1->getComputeAtView() == computeAtTarget); + TORCH_CHECK(tv2->getComputeAtView() == tv3); + TORCH_CHECK(!tv3->hasComputeAt()); + + computeAtTarget->axis(0)->parallelize(ParallelType::BIDx); + for (auto tv : affected_tensors) { + tv->axis(-1)->parallelize(ParallelType::TIDx); + } + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + + at::Tensor t0 = at::randn({1000}, options); + + auto t1 = t0 * 0.5; + auto t2 = t1 * -1.0; + auto t3 = t1 * -2.0; + + at::Tensor kernel_tv2 = at::empty_like(t0, options); + at::Tensor kernel_tv3 = at::empty_like(t0, options); + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + fe.runFusion({t0}, {kernel_tv2, kernel_tv3}); + + TORCH_CHECK(at::allclose(kernel_tv2, t2)); + TORCH_CHECK(at::allclose(kernel_tv3, t3)); +} +// Similar to ComputeAtMultiConsumers, but with a common consumer. +TEST(NVFuserTest, FusionComputeAtCommonConsumer1_CUDA) { + // tv1 = tv0 * 0.5 + // tv2 = tv1 * -1 + // tv3 = tv2 * -2 + // tv4 = tv2 + tv3 + // tv5 = tv4 * 5 + Fusion fusion; + FusionGuard fg(&fusion); + + TensorView* tv0 = makeDummyTensor(1); + fusion.addInput(tv0); + + TensorView* tv1 = mul(tv0, new Float(0.5)); + TensorView* tv2 = mul(tv1, new Float(-1.0)); + TensorView* tv3 = mul(tv1, new Float(-2.0)); + TensorView* tv4 = add(tv2, tv3); + TensorView* tv5 = mul(tv4, new Float(5.0)); + fusion.addOutput(tv3); fusion.addOutput(tv4); + fusion.addOutput(tv5); - // Lets setup to actually run - while (tv4->nDims() > 1) - tv4->merge(0); - tv4->split(0, 128); - tv4->split(0, 4); + // Computing tv1 at tv3. This will affect tv2 as discussed in + // ComplexComputeAt1. Additionally, in this case, notice that tv4 is + // the common consumer of tv2 and tv3, so they are computed at + // tv4. The indirect propagation of the computeAt should stop at the + // common consumer, and no further change should occur. More + // specifically, tv4 and tv5 should not have a computeAt tensor. 
+ TensorView* computeAtTarget = tv3; + computeAtTarget->split(0, 128); + tv1->computeAt(computeAtTarget, 1); + + TensorView* affected_tensors[] = {tv1, tv2, tv3, tv4}; + for (auto tv : affected_tensors) { + TORCH_CHECK(tv->nDims() == computeAtTarget->nDims()); + } - tv0->computeAt(tv4, 1); - tv1->computeAt(tv4, 1); + TORCH_CHECK(tv1->getComputeAtView() == computeAtTarget); + TORCH_CHECK(tv2->getComputeAtView() == tv4); + TORCH_CHECK(tv3->getComputeAtView() == tv4); + TORCH_CHECK(!tv4->hasComputeAt()); + TORCH_CHECK(!tv5->hasComputeAt()); - tv4->axis(0)->parallelize(ParallelType::BIDx); + computeAtTarget->axis(0)->parallelize(ParallelType::BIDx); + + for (auto tv : affected_tensors) { + tv->axis(-1)->parallelize(ParallelType::TIDx); + } + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + + at::Tensor t0 = at::randn({1000}, options); + + auto t1 = t0 * 0.5; + auto t2 = t1 * -1.0; + auto t3 = t1 * -2.0; + auto t4 = t2 + t3; + auto t5 = t4 * 5.0; + + at::Tensor kernel_tv3 = at::empty_like(t0, options); + at::Tensor kernel_tv4 = at::empty_like(t0, options); + at::Tensor kernel_tv5 = at::empty_like(t0, options); + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + fe.runFusion({t0}, {kernel_tv3, kernel_tv4, kernel_tv5}); + + TORCH_CHECK(at::allclose(kernel_tv3, t3)); + TORCH_CHECK(at::allclose(kernel_tv4, t4)); + TORCH_CHECK(at::allclose(kernel_tv5, t5)); +} + +TEST(NVFuserTest, FusionComputeAtCommonConsumer2_CUDA) { + // tv1 = tv0 * 0.5 + // tv2 = tv1 * -1 + // tv3 = tv2 * -1 + // tv4 = tv1 + 4 + // tv5 = tv3 + tv4 + Fusion fusion; + FusionGuard fg(&fusion); + + TensorView* tv0 = makeDummyTensor(2); + fusion.addInput(tv0); + + TensorView* tv1 = mul(tv0, new Float(0.5)); + TensorView* tv2 = mul(tv1, new Float(-1.0)); + TensorView* tv3 = mul(tv2, new Float(-1.0)); + TensorView* tv4 = add(tv1, new Float(4.0)); + TensorView* tv5 = add(tv3, tv4); + + fusion.addOutput(tv5); + + TensorView* computeAtTarget = tv3; + + computeAtTarget->merge(0); + computeAtTarget->split(0, 128); + computeAtTarget->split(0, 4); + + computeAtTarget->axis(0)->parallelize(ParallelType::BIDx); + + // This computeAt will affect all tensors including tv3, tv4 and + // tv5, even though it appears to impact only tv1 and tv2. The + // reason is that tv1 is now computed at tv3, so tv4 must also be + // computed at the same location. Similarly, the consumer of tv4, + // tv5, must also be computed at the same location. Overall, what + // will happen is basically we merge expressions of all tensors and + // compute them in a single loop nest. Internally, this will be + // realized by making all tensors, except for those in the path + // between tv1 and tv3, computed at tv5, which we call the common + // consumer. 
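Editor's sketch, not part of this diff: the merged-then-split schedule this test sets up, as a plain C++ loop nest over the flattened extent. The 4x128 split and the element-wise expressions come from the code above; the buffer layout, bounds handling, and function name are illustrative assumptions.

#include <cstddef>

void fused_common_consumer_2d(const float* t0, float* t5, std::size_t numel) {
  const std::size_t kUnroll = 4;
  const std::size_t kTidx = 128;
  // merge(0) then split(0, 128) then split(0, 4) yields loop levels that map
  // to [BIDx, Unroll=4, TIDx=128] over the flattened 2-D tensor.
  for (std::size_t bidx = 0; bidx * kUnroll * kTidx < numel; ++bidx) {
    for (std::size_t u = 0; u < kUnroll; ++u) {
      for (std::size_t t = 0; t < kTidx; ++t) {
        const std::size_t i = (bidx * kUnroll + u) * kTidx + t;
        if (i >= numel) {
          continue; // boundary predicate for the tail block
        }
        const float t1 = t0[i] * 0.5f; // tv1 = tv0 * 0.5
        const float t2 = t1 * -1.0f;   // tv2 = tv1 * -1
        const float t3 = t2 * -1.0f;   // tv3 = tv2 * -1
        const float t4 = t1 + 4.0f;    // tv4 = tv1 + 4
        t5[i] = t3 + t4;               // tv5 = tv3 + tv4, all in one fused pass
      }
    }
  }
}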
+ tv1->computeAt(computeAtTarget, 1); + + // All tensors should have the same dimenionality as the target + for (Val* val : fusion.vals()) { + if (fusion.hasInput(val) || + val->getValType().value() != ValType::TensorView) { + continue; + } + TensorView* tv = val->as(); + TORCH_CHECK(tv->nDims() == computeAtTarget->nDims()); + } + + TORCH_CHECK(tv1->getComputeAtView() == tv2); + TORCH_CHECK(tv2->getComputeAtView() == tv3); + // tv3 and tv4 are computed at tv5 + TORCH_CHECK(tv3->getComputeAtView() == tv5); + TORCH_CHECK(tv4->getComputeAtView() == tv5); + TORCH_CHECK(!tv5->hasComputeAt()); for (Val* val : fusion.vals()) { if (!fusion.hasInput(val) && val->getValType().value() == ValType::TensorView) { - TensorView* tv = static_cast(val); - + TensorView* tv = val->as(); tv->axis(1)->parallelize(ParallelType::Unroll); tv->axis(-1)->parallelize(ParallelType::TIDx); } } - // f4 = f0 * f1 - // f5 = f2 - f3 - // t2 = t1 - f4 - // t3 = t0 + f5 - // t4 = t3 * t2 - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - float fl0 = 0.1; - float fl1 = -0.2; - float fl2 = 0.3; - float fl3 = -0.4; - float fl4 = fl0 * fl1; - float fl5 = fl2 - fl3; - at::Tensor t0 = at::randn({129, 127}, options); - at::Tensor t1 = at::rand_like(t0, options); - - auto t2 = t1.sub(fl4); - auto t3 = t0.add(fl5); - auto t4 = t3.mul(t2); - at::Tensor kernel_tv4 = at::empty_like(t0, options); + auto t1 = t0.mul({0.5}); + auto t2 = t1.mul({-1.0}); + auto t3 = t2.mul({-1.0}); + auto t4 = t1.add({4.0}); + auto t5 = t3 + t4; - at::Scalar test(fl0); + at::Tensor kernel_tv5 = at::empty_like(t0, options); torch::jit::fuser::cuda::FusionExecutor fe; fe.compileFusion(&fusion); - fe.runFusion( - {t0, - t1, - at::Scalar(fl0), - at::Scalar(fl1), - at::Scalar(fl2), - at::Scalar(fl3)}, - {kernel_tv4}); - - GpuLower gpulw(&fusion); - std::stringstream actual_kernel; - gpulw.printKernel(actual_kernel); + fe.runFusion({t0}, {kernel_tv5}); - TORCH_CHECK(at::allclose(kernel_tv4, t4), actual_kernel.str()); + TORCH_CHECK(at::allclose(kernel_tv5, t5)); } -void testGPU_FusionLoopUnroll() { +// Similar to the above common consumer test but adds an additional +// tensor that has no common consumer with the other tensors. 
+TEST(NVFuserTest, FusionComputeAtCommonConsumer3_CUDA) { + // tv1 = tv0 * 0.5 + // tv2 = tv1 * -1 + // tv3 = tv2 * -1 + // tv4 = tv1 + 4 + // tv5 = tv2 + tv3 + // tv6 = tv1 + 6 Fusion fusion; FusionGuard fg(&fusion); - // Set up your input tensor views - TensorView* tv0 = makeDummyTensor(3); - TensorView* tv1 = makeDummyTensor(3); - - // Register your inputs + TensorView* tv0 = makeDummyTensor(2); fusion.addInput(tv0); - fusion.addInput(tv1); - // Do math with it, it returns a `Val*` but can be static_casted back to - // TensorView - TensorView* tv2 = add(tv1, new Float(2.0)); - TensorView* tv3 = add(tv0, tv2); + TensorView* tv1 = mul(tv0, new Float(0.5)); + TensorView* tv2 = mul(tv1, new Float(-1.0)); + TensorView* tv3 = mul(tv2, new Float(-1.0)); + TensorView* tv4 = add(tv1, new Float(4.0)); + TensorView* tv5 = add(tv3, tv4); + TensorView* tv6 = add(tv1, new Float(6.0)); - // Register your outputs - fusion.addOutput(tv3); + fusion.addOutput(tv5); + fusion.addOutput(tv6); - int block_size = 16; + TensorView* computeAtTarget = tv3; - tv3->merge(0, 1); - tv3->merge(0, 1); + computeAtTarget->merge(0); + computeAtTarget->split(0, 128); + computeAtTarget->split(0, 4); - tv3->split(0, block_size); - tv3->split(0, 4); + computeAtTarget->axis(0)->parallelize(ParallelType::BIDx); - // For all inputs, computeAt the output inline, temporaries should be squeezed - // between them - tv0->computeAt(tv3, 1); - tv1->computeAt(tv3, 1); + // This will have the same impact on the tensors except for tv5 and + // tv6. tv6 does not have any common consumer with the computeAt + // target, but since it uses tv1, it must be also computed at the + // same location as the other impacted tensors. We can either make + // tv5 computed at tv6 or tv6 computed at tv5. In this case, tv5 + // should be computed at tv6 just because the current implementation + // orders the computeAt relationship based on the order in which + // tensors are specified as outputs. - // Parallelize - tv2->axis(1)->parallelize(ParallelType::Unroll); - tv3->axis(1)->parallelize(ParallelType::Unroll); - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(0)->parallelize(ParallelType::BIDx); + tv1->computeAt(computeAtTarget, 1); + + // All tensors should have the same dimenionality as the target + for (Val* val : fusion.vals()) { + if (fusion.hasInput(val) || + val->getValType().value() != ValType::TensorView) { + continue; + } + TensorView* tv = val->as(); + TORCH_CHECK(tv->nDims() == computeAtTarget->nDims()); + } + + TORCH_CHECK(tv1->getComputeAtView() == tv2); + TORCH_CHECK(tv2->getComputeAtView() == tv3); + + // tv3 and tv4 are computed at tv5 + TORCH_CHECK(tv3->getComputeAtView() == tv5); + TORCH_CHECK(tv4->getComputeAtView() == tv5); + + // tv5 should be computed at tv6 since tv5 is added as an output + // before tv6. If we call fusion.addOutput(tv6) first, tv6 should be + // computed at tv5. 
+ TORCH_CHECK(tv5->getComputeAtView() == tv6); + TORCH_CHECK(!tv6->hasComputeAt()); + + for (Val* val : fusion.vals()) { + if (!fusion.hasInput(val) && + val->getValType().value() == ValType::TensorView) { + TensorView* tv = val->as<TensorView>(); + tv->axis(1)->parallelize(ParallelType::Unroll); + tv->axis(-1)->parallelize(ParallelType::TIDx); + } + } auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input0 = at::rand({129, 13, 3}, options); - at::Tensor input1 = at::rand({129, 13, 3}, options); + at::Tensor t0 = at::randn({129, 127}, options); + + auto t1 = t0.mul({0.5}); + auto t2 = t1.mul({-1.0}); + auto t3 = t2.mul({-1.0}); + auto t4 = t1.add({4.0}); + auto t5 = t3 + t4; + auto t6 = t1.add({6.0}); + + at::Tensor kernel_tv5 = at::empty_like(t0, options); + at::Tensor kernel_tv6 = at::empty_like(t0, options); torch::jit::fuser::cuda::FusionExecutor fe; fe.compileFusion(&fusion); - auto outputs = fe.runFusion({input0, input1}); + fe.runFusion({t0}, {kernel_tv5, kernel_tv6}); - TORCH_CHECK(outputs[0].equal(input0.add(input1.add(2.0)))); + TORCH_CHECK(at::allclose(kernel_tv5, t5)); + TORCH_CHECK(at::allclose(kernel_tv6, t6)); } -/* - * Helper function for single op testing that generates a codegen operand - */ +// Similar to ComputeAtCommonConsumer1 but with an additional tensor +// that does not have a data dependency with the consumer. +TEST(NVFuserTest, FusionComputeAtNoCommonConsumer_CUDA) { + // tv1 = tv0 * 0.5 + // tv2 = tv1 * -1 + // tv3 = tv1 * -2 + // tv4 = tv2 + tv3 + // tv5 = tv4 * 5 + // tv6 = tv1 * 6 + Fusion fusion; + FusionGuard fg(&fusion); -Val* gen_jit_operand(std::pair<ValType, DataType> desc) { - if (desc.first == ValType::TensorView) { - return makeDummyTensor(2, desc.second); - } else if (desc.first == ValType::Scalar) { - if (desc.second == DataType::Float) - return new Float(); - else if (desc.second == DataType::Int) - return new Int(); - else - TORCH_CHECK("Not currently supported type", desc.first); - } else { - TORCH_CHECK("Not currently supported type", desc.first); + TensorView* tv0 = makeDummyTensor(1); + fusion.addInput(tv0); + + TensorView* tv1 = mul(tv0, new Float(0.5)); + TensorView* tv2 = mul(tv1, new Float(-1.0)); + TensorView* tv3 = mul(tv1, new Float(-2.0)); + TensorView* tv4 = add(tv2, tv3); + TensorView* tv5 = mul(tv4, new Float(5.0)); + // Notice that tv6 is not a consumer of tv4. 
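+ // It only consumes tv1, so it is still transformed by the computeAt below + // even though it sits outside tv4's consumer chain.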
+ TensorView* tv6 = mul(tv1, new Float(6.0)); + fusion.addOutput(tv3); + fusion.addOutput(tv4); + fusion.addOutput(tv5); + fusion.addOutput(tv6); + + TensorView* computeAtTarget = tv3; + computeAtTarget->split(0, 128); + tv1->computeAt(computeAtTarget, 1); + + TensorView* affected_tensors[] = {tv1, tv2, tv3, tv4, tv6}; + for (auto tv : affected_tensors) { + TORCH_CHECK(tv->nDims() == computeAtTarget->nDims()); } - return nullptr; -} -/* - * Helper function for single op testing that generates an ATen operand - */ + TORCH_CHECK(tv1->getComputeAtView() == computeAtTarget); + TORCH_CHECK(tv2->getComputeAtView() == tv4); + TORCH_CHECK(tv3->getComputeAtView() == tv4); + TORCH_CHECK(tv4->getComputeAtView() == tv5); + TORCH_CHECK(tv5->getComputeAtView() == tv6); + TORCH_CHECK(!tv6->hasComputeAt()); -IValue gen_aten_operand( - std::pair desc, - int blocks, - int threads, + computeAtTarget->axis(0)->parallelize(ParallelType::BIDx); + + for (auto tv : affected_tensors) { + tv->axis(-1)->parallelize(ParallelType::TIDx); + } + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + + at::Tensor t0 = at::randn({1000}, options); + + auto t1 = t0 * 0.5; + auto t2 = t1 * -1.0; + auto t3 = t1 * -2.0; + auto t4 = t2 + t3; + auto t5 = t4 * 5.0; + auto t6 = t1 * 6.0; + + at::Tensor kernel_tv3 = at::empty_like(t0, options); + at::Tensor kernel_tv4 = at::empty_like(t0, options); + at::Tensor kernel_tv5 = at::empty_like(t0, options); + at::Tensor kernel_tv6 = at::empty_like(t0, options); + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + fe.runFusion({t0}, {kernel_tv3, kernel_tv4, kernel_tv5, kernel_tv6}); + + TORCH_CHECK(at::allclose(kernel_tv3, t3)); + TORCH_CHECK(at::allclose(kernel_tv4, t4)); + TORCH_CHECK(at::allclose(kernel_tv5, t5)); + TORCH_CHECK(at::allclose(kernel_tv6, t6)); +} + +namespace { + +void checkConcretized( + TensorView* v0, + int a0, + TensorView* v1, + int a1, + bool should_concretize) { + if (should_concretize) { + TORCH_CHECK( + IterDomain::concretizeDomain(v0->axis(a0))->sameAs(v1->axis(a1))); + } else { + TORCH_CHECK( + !IterDomain::concretizeDomain(v0->axis(a0))->sameAs(v1->axis(a1))); + } +} + +} // namespace + +TEST(NVFuserTest, FusionBCastConcretizeBasic_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + // tv0: [I I] + TensorView* tv0 = makeDummyTensor(2); + + // tv1: [I I I] + TensorView* tv1 = makeDummyTensor(3); + + fusion.addInput(tv0); + fusion.addInput(tv1); + + // tv2*: [B I I] + auto tv2_0 = broadcast(tv0, {true, false, false}); + auto tv2_1 = broadcast(tv0, {true, false, false}); + auto tv2 = add(tv2_0, tv2_1); + + // tv3: [I I I] + auto tv3 = add(tv2, tv1); + + fusion.addOutput(tv3); + + checkConcretized(tv2, 0, tv1, 0, true); + checkConcretized(tv2_0, 0, tv1, 0, true); + checkConcretized(tv2_1, 0, tv1, 0, true); + checkConcretized(tv2_0, 1, tv1, 0, false); + checkConcretized(tv2_0, 0, tv1, 1, false); +} + +TEST(NVFuserTest, FusionBCastConcretizeRfactor_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + // both tv0 and tv1 = [I, I] + TensorView* tv0 = makeDummyTensor(2); + TensorView* tv1 = makeDummyTensor(2); + + //[B,I,I] + auto tv2 = broadcast(tv1, {true, false, false}); + + //[B,I,R] + auto tv3 = sum(tv2, {2}); + + auto tv5 = add(tv3, tv1); + + fusion.addInput(tv0); + fusion.addInput(tv1); + fusion.addOutput(tv5); + + // scheduling: + //[B,I,R0,R1=128], root = [B,I,R] + tv3->split(2, 128); + + // root=[B,I,Irf], rfactor=[B,I,Irf,Rrf] + auto tv4 = tv3->rFactor({3}); + + checkConcretized(tv2, 0, tv5, 0, 
true); + checkConcretized(tv4, 0, tv5, 0, true); + checkConcretized(tv3, 0, tv5, 0, true); +} + +namespace { + +void checkIdProvedEquivalent( + TensorView* v0, + int a0, + TensorView* v1, + int a1, + bool should_prove) { + if (should_prove) { + TORCH_CHECK(IterDomain::proveEquivalent(v0->axis(a0), v1->axis(a1))); + } else { + TORCH_CHECK(!IterDomain::proveEquivalent(v0->axis(a0), v1->axis(a1))); + } +} + +} // namespace + +TEST(NVFuserTest, FusionProveIdEqBasic_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + TensorView* tv0 = makeDummyTensor(2); + TensorView* tv1 = makeDummyTensor(2); + TensorView* tv2 = makeDummyTensor(3); + + fusion.addInput(tv0); + fusion.addInput(tv1); + auto tv3 = broadcast(tv0, {true, false, false}); + auto tv4 = broadcast(tv1, {false, true, false}); + auto tv5 = add(tv3, tv4); + fusion.addOutput(tv5); + + checkIdProvedEquivalent(tv0, 0, tv4, 1, true); + checkIdProvedEquivalent(tv1, 0, tv4, 0, true); + checkIdProvedEquivalent(tv1, 1, tv0, 1, true); + checkIdProvedEquivalent(tv0, 0, tv5, 1, true); + checkIdProvedEquivalent(tv1, 1, tv5, 2, true); + checkIdProvedEquivalent(tv0, 0, tv1, 0, false); + checkIdProvedEquivalent(tv0, 1, tv1, 0, false); + checkIdProvedEquivalent(tv0, 0, tv1, 1, false); +} + +TEST(NVFuserTest, FusionProveIdEqRfactor_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + // [I,I] + TensorView* tv0 = makeDummyTensor(2); + // [I,I,I] + TensorView* tv1 = makeDummyTensor(3); + + //[I,I,R] + auto tv2 = sum(tv1, {2}); + + auto tv5 = add(tv2, tv0); + + fusion.addInput(tv0); + fusion.addInput(tv1); + fusion.addOutput(tv5); + + // scheduling: + //[B,I,R0,R1=128], root = [B,I,R] + tv2->split(2, 128); + + // root=[B,I,Irf], rfactor=[B,I,Irf,Rrf] + auto tv3 = tv2->rFactor({3}); + + checkIdProvedEquivalent(tv1, 0, tv0, 0, true); + checkIdProvedEquivalent(tv2, 0, tv0, 0, true); + checkIdProvedEquivalent(tv3, 0, tv0, 0, true); +} + +TEST(NVFuserTest, FusionScalarInputs_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + TensorView* tv0 = makeDummyTensor(2); + fusion.addInput(tv0); + TensorView* tv1 = makeDummyTensor(2); + fusion.addInput(tv1); + + Float* f0 = new Float(); + fusion.addInput(f0); + Float* f1 = new Float(); + fusion.addInput(f1); + Float* f2 = new Float(); + fusion.addInput(f2); + Float* f3 = new Float(); + fusion.addInput(f3); + Val* f4 = mul(f0, f1); + Val* f5 = sub(f2, f3); + + TensorView* tv2 = sub(tv1, f4); + TensorView* tv3 = add(tv0, f5); + TensorView* tv4 = mul(tv3, tv2); + + fusion.addOutput(tv4); + + // Lets setup to actually run + while (tv4->nDims() > 1) + tv4->merge(0); + tv4->split(0, 128); + tv4->split(0, 4); + + tv0->computeAt(tv4, 1); + tv1->computeAt(tv4, 1); + + tv4->axis(0)->parallelize(ParallelType::BIDx); + + for (Val* val : fusion.vals()) { + if (!fusion.hasInput(val) && + val->getValType().value() == ValType::TensorView) { + TensorView* tv = static_cast(val); + + tv->axis(1)->parallelize(ParallelType::Unroll); + tv->axis(-1)->parallelize(ParallelType::TIDx); + } + } + + // f4 = f0 * f1 + // f5 = f2 - f3 + // t2 = t1 - f4 + // t3 = t0 + f5 + // t4 = t3 * t2 + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + + float fl0 = 0.1; + float fl1 = -0.2; + float fl2 = 0.3; + float fl3 = -0.4; + float fl4 = fl0 * fl1; + float fl5 = fl2 - fl3; + + at::Tensor t0 = at::randn({129, 127}, options); + at::Tensor t1 = at::rand_like(t0, options); + + auto t2 = t1.sub(fl4); + auto t3 = t0.add(fl5); + auto t4 = t3.mul(t2); + + at::Tensor kernel_tv4 = at::empty_like(t0, options); + + at::Scalar 
test(fl0); + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + fe.runFusion( + {t0, + t1, + at::Scalar(fl0), + at::Scalar(fl1), + at::Scalar(fl2), + at::Scalar(fl3)}, + {kernel_tv4}); + + TORCH_CHECK(at::allclose(kernel_tv4, t4)); +} + +TEST(NVFuserTest, FusionLoopUnroll_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + // Set up your input tensor views + TensorView* tv0 = makeDummyTensor(3); + TensorView* tv1 = makeDummyTensor(3); + + // Register your inputs + fusion.addInput(tv0); + fusion.addInput(tv1); + + // Do math with it, it returns a `Val*` but can be static_casted back to + // TensorView + TensorView* tv2 = add(tv1, new Float(2.0)); + TensorView* tv3 = add(tv0, tv2); + + // Register your outputs + fusion.addOutput(tv3); + + int block_size = 16; + + tv3->merge(0, 1); + tv3->merge(0, 1); + + tv3->split(0, block_size); + tv3->split(0, 4); + + // For all inputs, computeAt the output inline, temporaries should be squeezed + // between them + tv0->computeAt(tv3, 1); + tv1->computeAt(tv3, 1); + + // Parallelize + tv2->axis(1)->parallelize(ParallelType::Unroll); + tv3->axis(1)->parallelize(ParallelType::Unroll); + tv2->axis(-1)->parallelize(ParallelType::TIDx); + tv3->axis(-1)->parallelize(ParallelType::TIDx); + tv3->axis(0)->parallelize(ParallelType::BIDx); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + + at::Tensor input0 = at::rand({129, 13, 3}, options); + at::Tensor input1 = at::rand({129, 13, 3}, options); + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + auto outputs = fe.runFusion({input0, input1}); + + TORCH_CHECK(outputs[0].equal(input0.add(input1.add(2.0)))); +} + +/* + * Helper function for single op testing that generates a codegen operand + */ + +Val* gen_jit_operand(std::pair desc) { + if (desc.first == ValType::TensorView) { + return makeDummyTensor(2, desc.second); + } else if (desc.first == ValType::Scalar) { + if (desc.second == DataType::Float) + return new Float(); + else if (desc.second == DataType::Int) + return new Int(); + else + TORCH_CHECK("Not currently supported type", desc.first); + } else { + TORCH_CHECK("Not currently supported type", desc.first); + } + return nullptr; +} + +/* + * Helper function for single op testing that generates an ATen operand + */ + +IValue gen_aten_operand( + std::pair desc, + int blocks, + int threads, bool rand) { if (desc.first == ValType::TensorView) { if (desc.second == DataType::Float) { @@ -2012,7 +2491,7 @@ void test_op( gen_aten_operand(op, blocks, threads, /*rand*/ false).toTensor(); std::vector output_vect = {output}; cudaDeviceSynchronize(); - if (fusion.hasRNG()) + if (fusion.isStochastic()) at::manual_seed(0); torch::jit::fuser::cuda::FusionExecutor fe; @@ -2020,7 +2499,7 @@ void test_op( fe.runFusion(aten_inputs_ivalues, output_vect); cudaDeviceSynchronize(); - if (fusion.hasRNG()) + if (fusion.isStochastic()) at::manual_seed(0); at::Tensor ref_output = af(aten_inputs); cudaDeviceSynchronize(); // This sync shouldn't be necessary; @@ -2054,12 +2533,8 @@ void test_op( op_str, " -- had a mismatch.", aten_inputs_to_str(), - "\nJIT: ", - output, - "\nREF: ", - ref_output, - "\nDIFF: ", - diff, + "\nABS MAX DIFF: ", + output.sub(ref_output).abs().max(), "\n"); } @@ -2088,7 +2563,7 @@ void test_op( std::make_index_sequence{}); } -void testGPU_FusionUnaryOps() { +TEST(NVFuserTest, FusionUnaryOps_CUDA) { using OpTuple = std::tuple; @@ -2162,7 +2637,7 @@ void testGPU_FusionUnaryOps() { 
std::make_tuple(std::make_pair(ValType::TensorView, DataType::Float))); } -void testGPU_FusionBinaryOps() { +TEST(NVFuserTest, FusionBinaryOps_CUDA) { using AtenFuncSig = at::Tensor (*)(const at::Tensor&, const at::Tensor&); using OpTuple = std::tuple; @@ -2262,7 +2737,7 @@ void testGPU_FusionBinaryOps() { std::make_pair(ValType::Scalar, DataType::Float))); } -void testGPU_FusionTernaryOps() { +TEST(NVFuserTest, FusionTernaryOps_CUDA) { test_op( /*blocks*/ 640, /*threads*/ 64, @@ -2311,7 +2786,7 @@ void testGPU_FusionTernaryOps() { std::make_pair(ValType::TensorView, DataType::Float))); } -void testGPU_FusionCompoundOps() { +TEST(NVFuserTest, FusionCompoundOps_CUDA) { test_op( /*blocks*/ 640, /*threads*/ 64, @@ -2350,7 +2825,7 @@ void testGPU_FusionCompoundOps() { std::make_pair(ValType::Scalar, DataType::Float))); } -void testGPU_FusionCastOps() { +TEST(NVFuserTest, FusionCastOps_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -2385,20 +2860,14 @@ void testGPU_FusionCastOps() { "\nOp Type: -- ", "cast FP16->FP32->FP16", " -- had a mismatch.\n", - "IN1 : ", - input1, - "\n", - "JIT: ", - outputs[0], - "\n", - "REF: ", - ref_output, + "\nABS MAX DIFF: ", + outputs[0].sub(ref_output).abs().max(), "\n"); } // We want split/merge/reorder all tested both on and off rfactor domains, also // want compute at into the rfactor domain, and into its consumer -void testGPU_FusionRFactorReplay() { +TEST(NVFuserTest, FusionRFactorReplay_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -2491,7 +2960,7 @@ void testGPU_FusionRFactorReplay() { // Start off simple, block on the outer dim // block stride + thread all reduce + unrolling on inner dim -void testGPU_FusionReduction() { +TEST(NVFuserTest, FusionReduction_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -2549,7 +3018,7 @@ void testGPU_FusionReduction() { TORCH_CHECK(aten_output.allclose(cg_output)); } -void testGPU_FusionReduction2() { +TEST(NVFuserTest, FusionReduction2_CUDA) { { Fusion fusion; FusionGuard fg(&fusion); @@ -2676,7 +3145,7 @@ void testGPU_FusionReduction2() { } } -void testGPU_FusionReduction3() { +TEST(NVFuserTest, FusionReduction3_CUDA) { { Fusion fusion; FusionGuard fg(&fusion); @@ -2747,7 +3216,7 @@ void testGPU_FusionReduction3() { } } -void testGPU_FusionReduction4() { +TEST(NVFuserTest, FusionReduction4_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -2799,7 +3268,7 @@ void testGPU_FusionReduction4() { aten_output.sub(cg_output).abs().max()); } -void testGPU_FusionReduction5() { +TEST(NVFuserTest, FusionReduction5_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -2860,7 +3329,7 @@ void testGPU_FusionReduction5() { TORCH_CHECK(aten_output.allclose(outputs[0])); } -void testGPU_FusionReductionTFT() { +TEST(NVFuserTest, FusionReductionTFT_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -2917,7 +3386,7 @@ void testGPU_FusionReductionTFT() { TORCH_CHECK(aten_output.allclose(cg_output)); } -void testGPU_FusionBranches() { +TEST(NVFuserTest, FusionBranches_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -2974,7 +3443,7 @@ void testGPU_FusionBranches() { TORCH_CHECK(t6.allclose(outputs[0])); } -void testGPU_FusionSimpleBCast() { +TEST(NVFuserTest, FusionSimpleBCast_CUDA) { { Fusion fusion; FusionGuard fg(&fusion); @@ -3238,7 +3707,7 @@ void testGPU_FusionSimpleBCast() { } } -void testGPU_FusionComplexBCast() { +TEST(NVFuserTest, FusionComplexBCast_CUDA) { { Fusion fusion; FusionGuard fg(&fusion); @@ -3341,7 +3810,7 @@ void testGPU_FusionComplexBCast() { } } -void testGPU_FusionAdvancedIndexing() { +TEST(NVFuserTest, 
FusionAdvancedIndexing_CUDA) { // Merging left to right is still broken in some instances. Indexing can't // complete because we assume we can simply traverse consumer->producer in the // index/extent map, but this case breaks this assumption. @@ -3453,10 +3922,6 @@ void testGPU_FusionAdvancedIndexing() { FusionGuard fg(&fusion); int w = 3, x = 4, y = 7, z = 8; - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - - at::Tensor t0 = at::randn({x, y, z}, options); - at::Tensor t1 = at::randn({w, x, y, z}, options); auto tv0 = makeDummyTensor(3); auto tv1 = makeDummyTensor(4); @@ -3465,10 +3930,42 @@ void testGPU_FusionAdvancedIndexing() { auto tv2 = add(tv0, new Float(1.0)); auto tv3 = add(tv2, tv1); - fusion.addOutput(tv3); - fuser::cuda::scheduleFusion(&fusion, {t0, t1}); + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor t0 = at::randn({x, y, z}, options); + at::Tensor t1 = at::randn({w, x, y, z}, options); + + fuser::cuda::scheduleFusion(&fusion, {t0, t1}); + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + auto outputs = fe.runFusion({t0, t1}); + + auto t2 = t0.add(1.0); + auto t3 = t2.add(t1); + + TORCH_CHECK(t3.allclose(outputs[0])); + } + + { + Fusion fusion; + FusionGuard fg(&fusion); + + // Set up your input tensor views + TensorView* tv0 = makeConcreteTensor({10, 20}); + fusion.addInput(tv0); + TensorView* tv1 = makeConcreteTensor({10, 10, 20}); + fusion.addInput(tv1); + + TensorView* tv2 = add(tv0, new Float(1)); + TensorView* tv3 = broadcast(tv2, {true, false, false}); + TensorView* tv4 = add(tv3, tv1); + fusion.addOutput(tv4); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor t0 = at::randn({10, 20}, options); + at::Tensor t1 = at::randn({10, 10, 20}, options); torch::jit::fuser::cuda::FusionExecutor fe; fe.compileFusion(&fusion); @@ -3482,7 +3979,7 @@ void testGPU_FusionAdvancedIndexing() { } // Test a simple Gemm but also play around with fusion executor features -void testGPU_FusionSimpleGemm() { +TEST(NVFuserTest, FusionSimpleGemm_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -3569,7 +4066,7 @@ void testGPU_FusionSimpleGemm() { } // Softmax with a 1D tensor. Parallelized only with a single thread block. -void testGPU_FusionSoftmax1D() { +TEST(NVFuserTest, FusionSoftmax1D_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -3626,7 +4123,7 @@ void testGPU_FusionSoftmax1D() { } // Softmax with a 1D tensor with input normalization. -void testGPU_FusionSoftmax1DNormalized() { +TEST(NVFuserTest, FusionSoftmax1DNormalized_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -3697,7 +4194,7 @@ void testGPU_FusionSoftmax1DNormalized() { // Softmax with a 3D tensor, where the inner-most 3rd dimension is // normalized. Pallelized with multiple thread blocks. -void testGPU_FusionSoftmax3D() { +TEST(NVFuserTest, FusionSoftmax3D_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -3757,7 +4254,7 @@ void testGPU_FusionSoftmax3D() { } // Softmax with a 3D tensor with input normalization. 
-void testGPU_FusionSoftmax3DNormalized() { +TEST(NVFuserTest, FusionSoftmax3DNormalized_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -3831,7 +4328,7 @@ void testGPU_FusionSoftmax3DNormalized() { t2.sub(outputs[0]).abs().max()); } -void testGPU_FusionSoftmaxComputeAt() { +TEST(NVFuserTest, FusionSoftmaxComputeAt_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -3857,7 +4354,7 @@ void testGPU_FusionSoftmaxComputeAt() { } // Similar to FusionReduction but uses grid reduction -void testGPU_FusionGridReduction1() { +TEST(NVFuserTest, FusionGridReduction1_CUDA) { const int gdimx = 32; const int bdimx = 128; @@ -3915,7 +4412,7 @@ void testGPU_FusionGridReduction1() { } // Same test as the above but uses BIDy and TIDx for reduction -void testGPU_FusionGridReduction2() { +TEST(NVFuserTest, FusionGridReduction2_CUDA) { const int gdimy = 32; const int bdimx = 128; @@ -3970,7 +4467,7 @@ void testGPU_FusionGridReduction2() { } // Same test but uses BIDy and BIDz for reduction. No TID used. -void testGPU_FusionGridReduction3dim1() { +TEST(NVFuserTest, FusionGridReduction3dim1_CUDA) { const int gdimz = 32; const int gdimy = 128; @@ -4026,7 +4523,7 @@ void testGPU_FusionGridReduction3dim1() { } // Same as testGPU_FusionGridReduction3dim1 but reduces dimension 0 -void testGPU_FusionGridReduction3dim0() { +TEST(NVFuserTest, FusionGridReduction3dim0_CUDA) { const int rdim = 0; const int gdimy = 128; const int gdimz = 32; @@ -4079,7 +4576,7 @@ void testGPU_FusionGridReduction3dim0() { } // This is similar to the FusionReduction, but swaps BIDx and TIDx -void testGPU_FusionGridReduction4() { +TEST(NVFuserTest, FusionGridReduction4_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -4142,7 +4639,7 @@ void testGPU_FusionGridReduction4() { // Grid reduction with 2D thread blocks but only TIDx and BIDx are // mapped to a reduction dim -void testGPU_FusionGridReduction5() { +TEST(NVFuserTest, FusionGridReduction5_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -4194,7 +4691,7 @@ void testGPU_FusionGridReduction5() { } // Similar to FusionGridReduction1 but with 3D tensors -void testGPU_FusionGridReduction6() { +TEST(NVFuserTest, FusionGridReduction6_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -4255,7 +4752,7 @@ void testGPU_FusionGridReduction6() { TORCH_CHECK(aten_output.allclose(cg_output)); } -void testGPU_FusionNonRedAxisBind() { +TEST(NVFuserTest, FusionNonRedAxisBind_CUDA) { int bid_x = 3; int tid_x = 2; int red_dim = 0; @@ -4290,7 +4787,7 @@ void testGPU_FusionNonRedAxisBind() { aten_output.sub(outputs[0]).abs().max()); } -void testGPU_FusionSplitBCast() { +TEST(NVFuserTest, FusionSplitBCast_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -4338,7 +4835,7 @@ void testGPU_FusionSplitBCast() { fe.runFusion({t0, t1}, {cg_output}); } -void testGPU_FusionBCastInnerDim() { +TEST(NVFuserTest, FusionBCastInnerDim_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -4352,7 +4849,7 @@ void testGPU_FusionBCastInnerDim() { TORCH_CHECK(!tv2->axis(0)->isReduction() && tv2->axis(1)->isBroadcast()); } -void testGPU_FusionBCastReduce() { +TEST(NVFuserTest, FusionBCastReduce_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -4368,7 +4865,7 @@ void testGPU_FusionBCastReduce() { // Multiple consumer reduction with computeAt // https://github.com/csarofeen/pytorch/issues/110 -void testGPU_FusionReductionMultiConsumer() { +TEST(NVFuserTest, FusionReductionMultiConsumer_CUDA) { Fusion fusion; FusionGuard fg(&fusion); TensorView* tv0 = makeDummyTensor(2); @@ -4385,7 +4882,7 @@ void 
testGPU_FusionReductionMultiConsumer() { tv1->getThisComputeAtAxis() == 2 && tv1->getRelativeComputeAtAxis() == 2); } -void testGPU_FusionComputeAtExprOrder() { +TEST(NVFuserTest, FusionComputeAtExprOrder_CUDA) { { for (int i = 0; i < 2; ++i) { Fusion fusion; @@ -4455,7 +4952,7 @@ void testGPU_FusionComputeAtExprOrder() { } } -void testGPU_FusionZeroDimComputeAt() { +TEST(NVFuserTest, FusionZeroDimComputeAt_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -4482,7 +4979,7 @@ void testGPU_FusionZeroDimComputeAt() { aten_output.sub(outputs[0]).abs().max()); } -void testGPU_FusionZeroDimBroadcast() { +TEST(NVFuserTest, FusionZeroDimBroadcast_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -4518,7 +5015,7 @@ void testGPU_FusionZeroDimBroadcast() { aten_output.sub(output).abs().max()); } -void testGPU_FusionZeroDimReduction() { +TEST(NVFuserTest, FusionZeroDimReduction_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -4555,7 +5052,7 @@ void testGPU_FusionZeroDimReduction() { aten_output.sub(output).abs().max()); } -void testGPU_FusionBCastAfterReduce() { +TEST(NVFuserTest, FusionBCastAfterReduce_CUDA) { Fusion fusion; FusionGuard fg(&fusion); const int tidx = 128; @@ -4606,7 +5103,7 @@ void testGPU_FusionBCastAfterReduce() { TORCH_CHECK(t5.allclose(outputs[0], 1e-5, 1e-5)); } -void testGPU_FusionReductionScheduler() { +TEST(NVFuserTest, FusionReductionScheduler_CUDA) { constexpr int bid_x = 80; constexpr int tid_x = 4096; constexpr int red_dim = 1; @@ -4624,29 +5121,27 @@ void testGPU_FusionReductionScheduler() { const auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::rand({bid_x, tid_x}, options); + at::Tensor input = at::randn({bid_x, tid_x}, options); // Apply reduction heuristic - const at::ArrayRef inputs({input}); - - TORCH_CHECK( - cuda::scheduleReduction(&fusion, inputs, tv1), - "Reduction schedule was not generated!"); + auto reduction_params = cuda::getReductionHeuristics(&fusion, {input}, tv1); + TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); + cuda::scheduleReduction(&fusion, reduction_params.value(), tv1, {}); cuda::FusionExecutor fe; fe.compileFusion(&fusion); // no broadcasting needed, omitting the last optional argument; - auto outputs = fe.runFusion({input}); + auto outputs = fe.runFusion({input}, reduction_params.value().lparams); auto aten_output = input.sum({red_dim}); TORCH_CHECK( - aten_output.allclose(outputs[0]), + aten_output.allclose(outputs[0], 1e-04, 1e-04), "Error of: ", aten_output.sub(outputs[0]).abs().max()); } // Simple reduction parallelized on a symbolic size. 
-void testGPU_FusionSymbolicReduction() { +TEST(NVFuserTest, FusionSymbolicReduction_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -4685,9 +5180,9 @@ void testGPU_FusionSymbolicReduction() { // How many threads to use for the block reduction int runtime_threadIdx_dim = 128; - torch::jit::fuser::cuda::FusionExecutor executor; - executor.compileFusion(&fusion); - auto outputs = executor.runFusion( + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + auto outputs = fe.runFusion( {input}, torch::jit::fuser::cuda::LaunchParams( -1, -1, -1, runtime_threadIdx_dim, -1, -1)); @@ -4696,7 +5191,7 @@ void testGPU_FusionSymbolicReduction() { TORCH_CHECK(aten_output.allclose(outputs[0])); } -void testGPU_FusionReductionSchedulerMultiDimNonFastest() { +TEST(NVFuserTest, FusionReductionSchedulerMultiDimNonFastest_CUDA) { const std::vector red_dims = {0, 2}; // Copy is because CodeGen requires int and Pytorch requires int64_t // for a vector of reduction dimensions @@ -4716,29 +5211,27 @@ void testGPU_FusionReductionSchedulerMultiDimNonFastest() { const auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::rand(tensor_dims_in, options); + at::Tensor input = at::randn(tensor_dims_in, options); at::Tensor cg_output = at::empty(tensor_dims_out, options); // Apply reduction heuristic - const at::ArrayRef inputs({input}); - - TORCH_CHECK( - cuda::scheduleReduction(&fusion, inputs, tv1), - "Reduction schedule was not generated!"); + auto reduction_params = cuda::getReductionHeuristics(&fusion, {input}, tv1); + TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); + cuda::scheduleReduction(&fusion, reduction_params.value(), tv1, {}); torch::jit::fuser::cuda::FusionExecutor fe; fe.compileFusion(&fusion); - auto outputs = fe.runFusion({input}); + auto outputs = fe.runFusion({input}, reduction_params.value().lparams); auto aten_output = input.sum(red_dims64); TORCH_CHECK( - aten_output.allclose(outputs[0]), + aten_output.allclose(outputs[0], 1e-04, 1e-04), "Error of: ", aten_output.sub(outputs[0]).abs().max()); } -void testGPU_FusionReductionSchedulerMultiDimFastest() { +TEST(NVFuserTest, FusionReductionSchedulerMultiDimFastest_CUDA) { const std::vector red_dims = {1, 3}; // Copy is because CodeGen requires int and Pytorch requires int64_t // for a vector of reduction dimensions @@ -4758,26 +5251,26 @@ void testGPU_FusionReductionSchedulerMultiDimFastest() { const auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input = at::rand(tensor_dims_in, options); + at::Tensor input = at::randn(tensor_dims_in, options); - TORCH_CHECK( - cuda::scheduleReduction(&fusion, {input}, tv1), - "Reduction schedule was not generated!"); + auto reduction_params = cuda::getReductionHeuristics(&fusion, {input}, tv1); + TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); + cuda::scheduleReduction(&fusion, reduction_params.value(), tv1, {}); torch::jit::fuser::cuda::FusionExecutor fe; fe.compileFusion(&fusion); - auto outputs = fe.runFusion({input}); + auto outputs = fe.runFusion({input}, reduction_params.value().lparams); auto aten_output = input.sum(red_dims64); TORCH_CHECK( - aten_output.allclose(outputs[0]), + aten_output.allclose(outputs[0], 1e-05, 1e-05), "Error of: ", aten_output.sub(outputs[0]).abs().max()); } -void testGPU_FusionReductionSchedulerDimShmoo() { - std::vector fp16_usage = {false}; +TEST(NVFuserTest, FusionReductionSchedulerDimShmoo_CUDA) { + std::vector fp16_usage = 
{true, false}; std::vector red_axis = {1, 0}; std::vector output_dims = {320, 640}; std::vector red_dims; @@ -4821,47 +5314,38 @@ void testGPU_FusionReductionSchedulerDimShmoo() { .dtype((fp16 ? at::kHalf : at::kFloat)) .device(at::kCUDA, 0); at::Tensor input = - (axis ? at::rand({odim, rdim}, options) - : at::rand({rdim, odim}, options)); - - const at::ArrayRef inputs({input}); + (axis ? at::randn({odim, rdim}, options) + : at::randn({rdim, odim}, options)); - c10::optional rparams = - cuda::scheduleReduction(&fusion, inputs, tv1); - TORCH_CHECK(rparams != c10::nullopt, "Reduction is not found!"); + std::vector outputs_of_red; if (fp16) { - if (axis == 0) { - int tidx = rparams.value().lparams.bdimx(); - tv1_cast->split(-1, tidx); - tv1_cast->axis(-1)->parallelize(ParallelType::TIDx); - tv1_cast->axis(-2)->parallelize(ParallelType::BIDx); - } else { - if (rparams.value().mul_reds_per_blk) { - int tidy = rparams.value().lparams.bdimy(); - tv1_cast->split(0, tidy); - tv1_cast->axis(-1)->parallelize(ParallelType::TIDy); - } - tv1_cast->axis(0)->parallelize(ParallelType::BIDx); - } + outputs_of_red.push_back(tv1_cast); } + auto reduction_params = + cuda::getReductionHeuristics(&fusion, {input}, tv1); + TORCH_CHECK(reduction_params.has_value(), "Reduction is not found!"); + cuda::scheduleReduction( + &fusion, reduction_params.value(), tv1, outputs_of_red); + torch::jit::fuser::cuda::FusionExecutor fe; fe.compileFusion(&fusion); - auto cg_output = fe.runFusion({input}); + auto outputs = + fe.runFusion({input}, reduction_params.value().lparams); auto aten_output = input.sum({axis}); TORCH_CHECK( - aten_output.allclose(cg_output[0]), + aten_output.allclose(outputs[0], 1e-03, 1e-03), "Error of: ", - aten_output.sub(cg_output[0]).abs().max()); + aten_output.sub(outputs[0]).abs().max()); } } } } } -void testGPU_FusionCacheBefore() { +TEST(NVFuserTest, FusionCacheBefore_CUDA) { // TVM Cache Write Fusion fusion; FusionGuard fg(&fusion); @@ -4902,7 +5386,7 @@ void testGPU_FusionCacheBefore() { aten_output.sub(outputs[0]).abs().sum()); } -void testGPU_FusionCacheAfter() { +TEST(NVFuserTest, FusionCacheAfter_CUDA) { // TVM Cache Read Fusion fusion; FusionGuard fg(&fusion); @@ -4943,7 +5427,7 @@ void testGPU_FusionCacheAfter() { aten_output.sub(outputs[0]).abs().sum()); } -void testGPU_FusionCacheIndirect() { +TEST(NVFuserTest, FusionCacheIndirect_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -4992,7 +5476,7 @@ void testGPU_FusionCacheIndirect() { aten_output.sub(outputs[0]).abs().sum()); } -void testGPU_FusionCacheBcast() { +TEST(NVFuserTest, FusionCacheBcast_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -5051,7 +5535,7 @@ void testGPU_FusionCacheBcast() { aten_output.sub(outputs[0]).abs().max()); } -void testGPU_FusionCacheComplex() { +TEST(NVFuserTest, FusionCacheComplex_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -5106,7 +5590,7 @@ void testGPU_FusionCacheComplex() { aten_output.sub(outputs[0]).abs().sum()); } -void testGPU_FusionCacheMultiConsumer() { +TEST(NVFuserTest, FusionCacheMultiConsumer_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -5151,7 +5635,7 @@ void testGPU_FusionCacheMultiConsumer() { aten_output.sub(outputs[1]).abs().sum()); } -void testGPU_FusionSmem() { +TEST(NVFuserTest, FusionSmem_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -5203,16 +5687,269 @@ void testGPU_FusionSmem() { aten_output.allclose(outputs[0], 1e-5, 1e-5), "Error of: ", aten_output.sub(outputs[0]).abs().max()); + TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs.size() == 0); +} + 
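+// Reduces a 3D tensor over its middle dimension while staging the input +// through a shared-memory cache; the checks at the end expect exactly one +// WAR-hazard sync in the generated kernel.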
+TEST(NVFuserTest, FusionSmemReduce_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + // Algorithm + TensorView* tv0 = makeDummyTensor(3); // M, K, N + TensorView* tv1 = sum(tv0, {1}); // M, R, N + fusion.addInput(tv0); + fusion.addOutput(tv1); + + TensorView* tv2 = tv0->cache_after(); + tv2->setMemoryType(MemoryType::Shared); + + // Schedule + constexpr int BSX = 32; + tv1->split(2, BSX); + tv1->split(1, 128); + tv1->split(0, BSX); + // M/BSX, BSX, K/BSX, BSX, N/BSX, BSX + tv1->reorder({{0, 0}, {1, 2}, {2, 4}, {3, 5}, {4, 1}, {5, 3}}); + TensorView* tv3 = tv1->rFactor({-2}); + + tv0->computeAt(tv1, -2); + tv0->computeAt(tv3, -2); + + // Thread and Block binding + tv1->axis(0)->parallelize(ParallelType::BIDx); + tv1->axis(1)->parallelize(ParallelType::BIDy); + tv1->axis(-1)->parallelize(ParallelType::TIDx); + // Manual Binding + tv2->axis(-1)->parallelize(ParallelType::TIDx); + tv3->axis(-1)->parallelize(ParallelType::TIDx); + + constexpr int M = 154, K = 45, N = 1524; + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor t0 = at::randn({M, K, N}, options); + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + auto outputs = fe.runFusion({t0}); + + at::Tensor aten_output = sum(t0, {1}); + TORCH_CHECK( + aten_output.allclose(outputs[0], 1e-5, 1e-5), + "Error of: ", + aten_output.sub(outputs[0]).abs().max()); + TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs.size() == 1); + TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs.count(24) == 1); +} + +TEST(NVFuserTest, FusionSmemBlockGemm_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + // Algorithm + TensorView* tv0 = makeDummyTensor(2); // (M, K) + TensorView* tv1 = makeDummyTensor(2); // (K, N) + TensorView* tv2 = broadcast(tv0, {false, false, true}); // (M, K, B) + TensorView* tv3 = broadcast(tv1, {true, false, false}); // (B, K, N) + TensorView* tv4 = mul(tv2, tv3); // M, K, N + TensorView* tv5 = sum(tv4, {1}); // M, R, N + fusion.addInput(tv0); + fusion.addInput(tv1); + fusion.addOutput(tv5); + + // Schedule + constexpr int BSX = 16; + tv5->split(2, BSX); + tv5->split(1, BSX); + tv5->split(0, BSX); + // M/BSX, BSX, K/BSX, BSX, N/BSX, BSX + tv5->reorder({{0, 0}, {1, 3}, {2, 2}, {3, 5}, {4, 1}, {5, 4}}); + // M/BSX, N/BSX, K/BSX, MSX, NSX, KSX + TensorView* tv6 = tv5->rFactor({-1}); + + tv2->setMemoryType(MemoryType::Shared); + tv3->setMemoryType(MemoryType::Shared); + tv4->setMemoryType(MemoryType::Shared); + tv6->setMemoryType(MemoryType::Shared); + + tv0->computeAt(tv5, 3); + tv1->computeAt(tv5, 3); + + // Thread and Block binding + tv5->axis(0)->parallelize(ParallelType::BIDx); + tv5->axis(1)->parallelize(ParallelType::BIDy); + tv5->axis(-2)->parallelize(ParallelType::TIDy); + tv5->axis(-1)->parallelize(ParallelType::TIDx); + // Manual Binding + tv2->axis(-1)->parallelize(ParallelType::TIDx); + tv3->axis(-1)->parallelize(ParallelType::TIDx); + tv4->axis(-1)->parallelize(ParallelType::TIDx); + tv6->axis(-3)->parallelize(ParallelType::TIDy); + tv6->axis(-2)->parallelize(ParallelType::TIDx); + + constexpr int M = 154, K = 45, N = 1524; + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor t0 = at::randn({M, K}, options); + at::Tensor t1 = at::randn({K, N}, options); + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + auto outputs = fe.runFusion({t0, t1}); + + at::Tensor aten_output = matmul(t0, t1); + TORCH_CHECK( + aten_output.allclose(outputs[0], 1e-5, 1e-5), + "Error of: ", + 
aten_output.sub(outputs[0]).abs().max()); + TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs.size() == 0); } -void testGPU_FusionSmemReduce() { +TEST(NVFuserTest, FusionSmemBlockGemmCache_CUDA) { Fusion fusion; FusionGuard fg(&fusion); // Algorithm + TensorView* tv0 = makeDummyTensor(2); // (M, K) + TensorView* tv1 = makeDummyTensor(2); // (K, N) + TensorView* tv2 = broadcast(tv0, {false, false, true}); // (M, K, B) + TensorView* tv3 = broadcast(tv1, {true, false, false}); // (B, K, N) + TensorView* tv4 = mul(tv2, tv3); // M, K, N + TensorView* tv5 = sum(tv4, {1}); // M, R, N + fusion.addInput(tv0); + fusion.addInput(tv1); + fusion.addOutput(tv5); + + // Schedule + // Remove reduction axis from tv5 + // tv6 = (M, R, N) + // tv5 = (M, N) + TensorView* tv6 = tv5->cache_before(); + + constexpr int BSX = 16; + tv5->split(1, BSX); + tv5->split(0, BSX); + // M/BSX, BSX, N/BSX, BSX + tv5->reorder({{0, 0}, {1, 2}, {2, 1}, {3, 3}}); + // tv5 = M/BSX, N/BSX, MSX, NSX + + tv6->computeAt(tv5, 2); + tv6->computeAt(tv5, 2); + + tv6->split(-1, BSX); + // M/BSX, BSX, K/BSX, BSX, N/BSX, BSX + tv6->reorder({{0, 0}, {1, 1}, {2, 3}, {3, 4}, {4, 2}, {5, 5}}); + // M/BSX, N/BSX, K/BSX, MSX, NSX, KSX + TensorView* tv7 = tv6->rFactor({-1}); + // tv7 = M/BSX, N/BSX, K/BSXrf, MSX, NSX, KSXr + // tv6 = M/BSX, N/BSX, K/BSXr, MSX, NSX + + tv0->computeAt(tv6, 3); + tv1->computeAt(tv6, 3); + + tv0->computeAt(tv7, 3); + tv1->computeAt(tv7, 3); + + tv2->setMemoryType(MemoryType::Shared); + tv3->setMemoryType(MemoryType::Shared); + tv4->setMemoryType(MemoryType::Shared); + tv6->setMemoryType(MemoryType::Shared); + tv7->setMemoryType(MemoryType::Shared); + // Memory Type + + // Thread and Block binding + tv5->axis(0)->parallelize(ParallelType::BIDx); + tv5->axis(1)->parallelize(ParallelType::BIDy); + tv5->axis(-2)->parallelize(ParallelType::TIDy); + tv5->axis(-1)->parallelize(ParallelType::TIDx); + // Manual Binding + tv2->axis(-1)->parallelize(ParallelType::TIDx); + tv3->axis(-1)->parallelize(ParallelType::TIDx); + tv4->axis(-1)->parallelize(ParallelType::TIDx); + + tv7->axis(-3)->parallelize(ParallelType::TIDy); + tv7->axis(-2)->parallelize(ParallelType::TIDx); + + tv6->axis(-2)->parallelize(ParallelType::TIDy); + tv6->axis(-1)->parallelize(ParallelType::TIDx); + + constexpr int M = 154, K = 45, N = 1524; + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor t0 = at::randn({M, K}, options); + at::Tensor t1 = at::randn({K, N}, options); + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + auto outputs = fe.runFusion({t0, t1}); + + at::Tensor aten_output = matmul(t0, t1); + TORCH_CHECK( + aten_output.allclose(outputs[0], 1e-5, 1e-5), + "Error of: ", + aten_output.sub(outputs[0]).abs().max()); + TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs.size() == 0); +} + +TEST(NVFuserTest, FusionSmemDynamicReductionSymbolic_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + // Set up your input tensor views + TensorView* tv0 = makeDummyTensor(2); + TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Float(0), tv0); + fusion.addInput(tv0); + fusion.addOutput(tv1); + // tv1[I0, R1] = tv0[I0, I1] + + // Interface should just be a direct split with a Parallel type. We can + // include the parallelize call if we do this. 
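+ // Split the reduction axis by the runtime value of blockDim.x so the inner + // portion of the reduction can be bound to TIDx below.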
+ tv1->split(1, NamedScalar::getParallelDim(ParallelType::TIDx)); + // tv1[I0, R1o, R1i{BIDx}] = tv0[I0, I1] + + TensorView* tv2 = tv1->rFactor({2}); + tv2->setMemoryType(MemoryType::Shared); + // tv2[I0, R1oo, Ir1i{BIDx}] = tv0[I0, I1] + // tv1[I0, R1i{BIDx}] = tv2[I0, R1oo, Ir1i{BIDx}] + + tv0->computeAt(tv1, 1); + + tv2->axis(-1)->parallelize(ParallelType::TIDx); + tv1->axis(0)->parallelize(ParallelType::BIDx); + + constexpr int numel_x = 65000, numel_y = 1024; + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor input = at::rand({numel_x, numel_y}, options); + + // How many threads to use for the block reduction + constexpr int runtime_threadIdx_dim = 128; + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + auto outputs = fe.runFusion( + {input}, + torch::jit::fuser::cuda::LaunchParams( + -1, -1, -1, runtime_threadIdx_dim, -1, -1)); + + auto aten_output = input.sum({1}); + TORCH_CHECK( + aten_output.allclose(outputs[0], 1e-5, 1e-5), + "Error of: ", + aten_output.sub(outputs[0]).abs().max()); + TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs.size() == 0); +} + +TEST(NVFuserTest, FusionSmemDynamicReductionSymbolicArg_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + // Algorithm + Int* sym_bsx = new Int(); TensorView* tv0 = makeDummyTensor(3); // M, K, N + fusion.addInput(tv0); + fusion.addInput(sym_bsx); + TensorView* tv1 = sum(tv0, {1}); // M, R, N - fusion.addInput(tv0); fusion.addOutput(tv1); TensorView* tv2 = tv0->cache_after(); @@ -5221,7 +5958,7 @@ void testGPU_FusionSmemReduce() { // Schedule constexpr int BSX = 32; tv1->split(2, BSX); - tv1->split(1, 128); + tv1->split(1, sym_bsx); tv1->split(0, BSX); // M/BSX, BSX, K/BSX, BSX, N/BSX, BSX tv1->reorder({{0, 0}, {1, 2}, {2, 4}, {3, 5}, {4, 1}, {5, 3}}); @@ -5243,63 +5980,64 @@ void testGPU_FusionSmemReduce() { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({M, K, N}, options); + // How many threads to use for the block reduction + constexpr int runtime_threadIdx_dim = 128; + torch::jit::fuser::cuda::FusionExecutor fe; fe.compileFusion(&fusion); - auto outputs = fe.runFusion({t0}); + auto outputs = fe.runFusion( + {t0, runtime_threadIdx_dim}, + torch::jit::fuser::cuda::LaunchParams( + -1, -1, -1, runtime_threadIdx_dim, -1, -1)); at::Tensor aten_output = sum(t0, {1}); TORCH_CHECK( aten_output.allclose(outputs[0], 1e-5, 1e-5), "Error of: ", aten_output.sub(outputs[0]).abs().max()); + TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs.size() == 1); + TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs.count(24) == 1); } -void testGPU_FusionSmemBlockGemm() { +TEST(NVFuserTest, FusionSmemDynamicPwiseMulSymbolicArgWAR_CUDA) { Fusion fusion; FusionGuard fg(&fusion); - // Algorithm + Int* sym_bsx = new Int(); TensorView* tv0 = makeDummyTensor(2); // (M, K) TensorView* tv1 = makeDummyTensor(2); // (K, N) TensorView* tv2 = broadcast(tv0, {false, false, true}); // (M, K, B) TensorView* tv3 = broadcast(tv1, {true, false, false}); // (B, K, N) TensorView* tv4 = mul(tv2, tv3); // M, K, N - TensorView* tv5 = sum(tv4, {1}); // M, R, N fusion.addInput(tv0); fusion.addInput(tv1); - fusion.addOutput(tv5); - - // Schedule - constexpr int BSX = 16; - tv5->split(2, BSX); - tv5->split(1, BSX); - tv5->split(0, BSX); - // M/BSX, BSX, K/BSX, BSX, N/BSX, BSX - tv5->reorder({{0, 0}, {1, 3}, {2, 2}, {3, 5}, {4, 1}, {5, 4}}); - // M/BSX, N/BSX, K/BSX, MSX, NSX, KSX - TensorView* tv6 = tv5->rFactor({-1}); + fusion.addInput(sym_bsx); 
+ fusion.addOutput(tv4); + // Algorithm tv2->setMemoryType(MemoryType::Shared); tv3->setMemoryType(MemoryType::Shared); - tv4->setMemoryType(MemoryType::Shared); - tv6->setMemoryType(MemoryType::Shared); - tv0->computeAt(tv5, 3); - tv1->computeAt(tv5, 3); + constexpr int BSX = 32; + tv4->split(2, BSX); + tv4->split(1, sym_bsx); + tv4->split(0, BSX); + // M/BSX, BSX, K/BSX, BSX, N/BSX, BSX + tv4->reorder({{0, 0}, {1, 3}, {2, 1}, {3, 4}, {4, 2}, {5, 5}}); + // M/BSX, K/BSX, N/BSX, MSX, KSX, NSX - // Thread and Block binding - tv5->axis(0)->parallelize(ParallelType::BIDx); - tv5->axis(1)->parallelize(ParallelType::BIDy); - tv5->axis(-2)->parallelize(ParallelType::TIDy); - tv5->axis(-1)->parallelize(ParallelType::TIDx); + tv0->computeAt(tv4, 3); + tv1->computeAt(tv4, 3); + // Schedule + + tv4->axis(0)->parallelize(ParallelType::BIDx); + tv4->axis(2)->parallelize(ParallelType::BIDy); // Manual Binding - tv2->axis(-1)->parallelize(ParallelType::TIDx); + tv2->axis(-2)->parallelize(ParallelType::TIDx); tv3->axis(-1)->parallelize(ParallelType::TIDx); - tv4->axis(-1)->parallelize(ParallelType::TIDx); - tv6->axis(-3)->parallelize(ParallelType::TIDy); - tv6->axis(-2)->parallelize(ParallelType::TIDx); + // Thread and Block binding - constexpr int M = 154, K = 45, N = 1524; + constexpr int M = 128, K = 457, N = 1024; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({M, K}, options); @@ -5307,103 +6045,234 @@ void testGPU_FusionSmemBlockGemm() { torch::jit::fuser::cuda::FusionExecutor fe; fe.compileFusion(&fusion); - auto outputs = fe.runFusion({t0, t1}); + auto outputs = fe.runFusion( + {t0, t1, BSX}, + torch::jit::fuser::cuda::LaunchParams(-1, -1, -1, BSX, -1, -1)); - at::Tensor aten_output = matmul(t0, t1); + at::Tensor aten_output = mul(t0.unsqueeze(2), t1.unsqueeze(0)); TORCH_CHECK( aten_output.allclose(outputs[0], 1e-5, 1e-5), "Error of: ", aten_output.sub(outputs[0]).abs().max()); + TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs.size() == 1); + TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs.count(22) == 1); } -void testGPU_FusionSmemBlockGemmCache() { -#if 0 +TEST(NVFuserTest, FusionSmemDynamicTiledGemm_CUDA) { Fusion fusion; FusionGuard fg(&fusion); - // Algorithm - TensorView* tv0 = makeDummyTensor(2); // (M, K) - TensorView* tv1 = makeDummyTensor(2); // (K, N) - TensorView* tv2 = broadcast(tv0, {false, false, true}); // (M, K, B) - TensorView* tv3 = broadcast(tv1, {true, false, false}); // (B, K, N) - TensorView* tv4 = mul(tv2, tv3); // M, K, N - TensorView* tv5 = sum(tv4, {1}); // M, R, N + // Symbolic integers we will use for runtime tiling + Int* symbolic_m_tile_dim = new Int(); // bound to threadIdx.z + Int* symbolic_split_k_tile_dim = new Int(); // bound to blockIdx.x + Int* symbolic_block_k_tile_dim = new Int(); // bound to threadIdx.x + // Compile-time integer for tiling + int n_smem_tile = 8; // bound to threadIdx.y + + // Symbolic 2D tensors TV0[M, K], TV1[K, N] + TensorView* tv0 = makeDummyTensor(2); + TensorView* tv1 = makeDummyTensor(2); + + // Broadcast tv0 to [M, K, *] + TensorView* tv2 = broadcast(tv0, {false, false, true}); + // Broadcast tv1 to [*, K, N] + TensorView* tv3 = broadcast(tv1, {true, false, false}); + + // Pointwise multiplication resulting in tv3[M, K, N] + TensorView* tv4 = mul(tv2, tv3); + + // Turn the K-dimension of tv4 into a reduction dimension + TensorView* tv5 = sum(tv4, {1}); + + // Register inputs and outputs fusion.addInput(tv0); fusion.addInput(tv1); fusion.addOutput(tv5); - // Schedule - // 
Remove reduction axis from tv5 - // tv6 = (M, R, N) - // tv5 = (M, N) - TensorView* tv6 = tv5->cache_before(); + // Register runtime tile dims as inputs + fusion.addInput(symbolic_m_tile_dim); + fusion.addInput(symbolic_split_k_tile_dim); + fusion.addInput(symbolic_block_k_tile_dim); - constexpr int BSX = 16; - tv5->split(1, BSX); - tv5->split(0, BSX); - // M/BSX, BSX, N/BSX, BSX - tv5->reorder({{0, 0}, {1, 2}, {2, 1}, {3, 3}}); - // tv5 = M/BSX, N/BSX, MSX, NSX + // Make a 3D tile, mix of symbolic and constant, do in reverse order because + // dims are inserted + tv5->split(2, n_smem_tile); + tv5->split(1, symbolic_block_k_tile_dim); + tv5->split(1, symbolic_split_k_tile_dim); + tv5->split(0, symbolic_m_tile_dim); - tv6->computeAt(tv5, 2); + // Reorder so all outer tiles are in the leftmost 3 positions + tv5->reorder({{1, 5}, {5, 1}}); + + // Factor out the outer reduction IterDomain, then run the inter-cta + // reduction, and intra-cta reduction + auto tv6 = tv5->rFactor({2}); + + // Scope computations tv6->computeAt(tv5, 2); - tv6->split(-1, BSX); - // M/BSX, BSX, K/BSX, BSX, N/BSX, BSX - tv6->reorder({{0, 0}, {1, 1}, {2, 3}, {3, 4}, {4, 2}, {5, 5}}); - // M/BSX, N/BSX, K/BSX, MSX, NSX, KSX - TensorView* tv7 = tv6->rFactor({-1}); - // tv7 = M/BSX, N/BSX, K/BSXrf, MSX, NSX, KSXr - // tv6 = M/BSX, N/BSX, K/BSXr, MSX, NSX + // RFactor moves reduction axes around, reorder to match ordering of tv5 + tv6->reorder({ + {2, -2}, + {3, -1}, + {4, 2}, + {5, 3}, + {6, 4}, + }); + // Setup compute at schedule tv0->computeAt(tv6, 3); tv1->computeAt(tv6, 3); + tv4->computeAt(tv6, -1); + // + // T2[Mo, bNo, Koo, Koi, Kii, Mi, bNi] CA(4, 3) + // T3[bMo, No, Koo, Koi, Kii, bMi, Ni] CA(4, 3) + // T4[ Mo, No, Koo, Koi, Kii, Mi, Ni] + // T6[ Mo, No, rKoo, Koi, Kii, Mi, Ni] + // T5[ Mo, No, rKoi, rKii, Mi, Ni] - tv0->computeAt(tv7, 3); - tv1->computeAt(tv7, 3); - + // Cache smem tiles tv2->setMemoryType(MemoryType::Shared); tv3->setMemoryType(MemoryType::Shared); - tv4->setMemoryType(MemoryType::Shared); - tv6->setMemoryType(MemoryType::Shared); - tv7->setMemoryType(MemoryType::Shared); - // Memory Type + tv4->setMemoryType(MemoryType::Local); + tv6->setMemoryType(MemoryType::Local); - // Thread and Block binding - tv5->axis(0)->parallelize(ParallelType::BIDx); + tv5->axis(0)->parallelize(ParallelType::BIDz); tv5->axis(1)->parallelize(ParallelType::BIDy); - tv5->axis(-2)->parallelize(ParallelType::TIDy); - tv5->axis(-1)->parallelize(ParallelType::TIDx); - // Manual Binding - tv2->axis(-1)->parallelize(ParallelType::TIDx); - tv3->axis(-1)->parallelize(ParallelType::TIDx); - tv4->axis(-1)->parallelize(ParallelType::TIDx); - tv7->axis(-3)->parallelize(ParallelType::TIDy); - tv7->axis(-2)->parallelize(ParallelType::TIDx); + std::vector tv_list = {tv2, tv3, tv4, tv5, tv6}; + for (auto tv : tv_list) { + tv->axis(-2)->parallelize(ParallelType::TIDz); + tv->axis(-1)->parallelize(ParallelType::TIDy); + } + tv2->axis(3)->parallelize(ParallelType::TIDx); + tv3->axis(3)->parallelize(ParallelType::TIDx); + tv4->axis(3)->parallelize(ParallelType::TIDx); + tv6->axis(3)->parallelize(ParallelType::TIDx); + tv5->axis(2)->parallelize(ParallelType::TIDx); - tv6->axis(-2)->parallelize(ParallelType::TIDy); - tv6->axis(-1)->parallelize(ParallelType::TIDx); + tv2->axis(4)->parallelize(ParallelType::BIDx); + tv3->axis(4)->parallelize(ParallelType::BIDx); + tv4->axis(4)->parallelize(ParallelType::BIDx); + tv6->axis(4)->parallelize(ParallelType::BIDx); + tv5->axis(3)->parallelize(ParallelType::BIDx); - constexpr int M = 154, K = 45, N 
= 1524; + constexpr int M = 31, K = 65, N = 33; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({M, K}, options); - at::Tensor t1 = at::randn({K, N}, options); + at::Tensor A = at::randn({M, K}, options); + at::Tensor B = at::randn({K, N}, options); torch::jit::fuser::cuda::FusionExecutor fe; + // Generate CUDA and compile with nvRTC fe.compileFusion(&fusion); - auto outputs = fe.runFusion({t0, t1}); - at::Tensor aten_output = matmul(t0, t1); + // Runtime tiling + int m_tile = 4; // bound to threadIdx.z + int split_k = 7; // bound to blockIdx.x + int intra_cta = 8; // bound to threadIdx.x + + auto fuser_outputs = fe.runFusion({A, B, m_tile, split_k, intra_cta}); + auto C_fuser = fuser_outputs[0]; + + at::Tensor aten_C = mul(A.unsqueeze(2), B.unsqueeze(0)).sum(1); + TORCH_CHECK( + aten_C.allclose(C_fuser, 1e-5, 1e-5), + "Error of: ", + aten_C.sub(C_fuser).abs().max()); + TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs.size() == 1); + TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs.count(41) == 1); +} + +TEST(NVFuserTest, FusionGlobalIntermediate_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + // Set up your input tensor views + TensorView* tv0 = makeDummyTensor(2); + TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Float(0), tv0); + fusion.addInput(tv0); + fusion.addOutput(tv1); + // tv1[I0, R1] = tv0[I0, I1] + + // Interface should just be a direct split with a Parallel type. We can + // include the parallelize call if we do this. + tv1->split(1, NamedScalar::getParallelDim(ParallelType::TIDx)); + // tv1[I0, R1o, R1i{BIDx}] = tv0[I0, I1] + + TensorView* tv2 = tv1->rFactor({2}); + tv2->setMemoryType(MemoryType::Global); + // tv2[I0, R1oo, Ir1i{BIDx}] = tv0[I0, I1] + // tv1[I0, R1i{BIDx}] = tv2[I0, R1oo, Ir1i{BIDx}] + + tv0->computeAt(tv1, 1); + + tv2->axis(-1)->parallelize(ParallelType::TIDx); + tv1->axis(0)->parallelize(ParallelType::BIDx); + + constexpr int numel_x = 65000, numel_y = 1024; + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor input = at::rand({numel_x, numel_y}, options); + + // How many threads to use for the block reduction + constexpr int runtime_threadIdx_dim = 128; + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + auto outputs = fe.runFusion( + {input}, + torch::jit::fuser::cuda::LaunchParams( + -1, -1, -1, runtime_threadIdx_dim, -1, -1)); + + auto aten_output = input.sum({1}); TORCH_CHECK( aten_output.allclose(outputs[0], 1e-5, 1e-5), "Error of: ", aten_output.sub(outputs[0]).abs().max()); -#endif } -void testGPU_FusionConstCheck() { +TEST(NVFuserTest, FusionGlobalIntermediateDefaultSchedule_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + TensorView* tv0 = makeDummyTensor(2); + TensorView* tv1 = makeDummyTensor(2); + TensorView* tv2 = makeDummyTensor(2); + TensorView* tv3 = makeDummyTensor(2); + TensorView* tv4 = sub(tv2, tv3); + TensorView* tv5 = add(tv1, tv4); + TensorView* tv6 = sub(tv5, tv0); + fusion.addInput(tv0); + fusion.addInput(tv1); + fusion.addInput(tv2); + fusion.addInput(tv3); + fusion.addOutput(tv6); + // t6 = ((t1 + (t2 - t3)) - t0) + + tv4->setMemoryType(MemoryType::Global); + tv5->setMemoryType(MemoryType::Global); + tv6->setMemoryType(MemoryType::Global); + + constexpr int M = 32, N = 810; + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor in0 = at::rand({M, N}, options); + at::Tensor in1 = at::rand({M, N}, options); + at::Tensor in2 = at::rand({M, N}, 
options); + at::Tensor in3 = at::rand({M, N}, options); + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + auto outputs = fe.runFusion({in0, in1, in2, in3}); + + at::Tensor aten_output = (in1 + (in2 - in3)) - in0; + TORCH_CHECK( + aten_output.allclose(outputs[0], 1e-5, 1e-5), + "Error of: ", + aten_output.sub(outputs[0]).abs().sum()); +} + +TEST(NVFuserTest, FusionConstCheck_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -5420,7 +6289,7 @@ void testGPU_FusionConstCheck() { TORCH_CHECK(one_x4->isConstScalar()); } -void testGPU_FusionUnrollWithAlloc() { +TEST(NVFuserTest, FusionUnrollWithAlloc_CUDA) { const std::vector tensor_dims_in = {128, 128}; Fusion fusion; FusionGuard fg(&fusion); @@ -5468,7 +6337,7 @@ void testGPU_FusionUnrollWithAlloc() { } // Test isZeroInt -void testGPU_FusionIsZeroInt() { +TEST(NVFuserTest, FusionIsZeroInt_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -5481,7 +6350,7 @@ void testGPU_FusionIsZeroInt() { } // Test isOneInt -void testGPU_FusionIsOneInt() { +TEST(NVFuserTest, FusionIsOneInt_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -5496,7 +6365,7 @@ void testGPU_FusionIsOneInt() { // This is to verify no cycle of computeAt is created. A more complex // variation of this pattern appears in one of the Python tests // (test_random_topo). -void testGPU_FusionComputeAtNonterminatingOutput() { +TEST(NVFuserTest, FusionComputeAtNonterminatingOutput_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -5560,7 +6429,7 @@ void testGPU_FusionComputeAtNonterminatingOutput() { return; } -void testGPU_FusionTraversalOrder1() { +TEST(NVFuserTest, FusionTraversalOrder1_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -5608,7 +6477,7 @@ void testGPU_FusionTraversalOrder1() { t4.sub(cg_output_tv4).abs().max()); } -void testGPU_FusionTraversalOrder2() { +TEST(NVFuserTest, FusionTraversalOrder2_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -5661,7 +6530,7 @@ void testGPU_FusionTraversalOrder2() { t5.sub(cg_output_tv5).abs().max()); } -void testGPU_FusionTraversalOrder3() { +TEST(NVFuserTest, FusionTraversalOrder3_CUDA) { for (int i = 0; i < 2; ++i) { Fusion fusion; FusionGuard fg(&fusion); @@ -5729,7 +6598,7 @@ void testGPU_FusionTraversalOrder3() { } } -void testGPU_FusionTraversalOrder4() { +TEST(NVFuserTest, FusionTraversalOrder4_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -5793,7 +6662,7 @@ void testGPU_FusionTraversalOrder4() { t7.sub(cg_output_tv7).abs().max()); } -void testGPU_FusionTraversalOrder5() { +TEST(NVFuserTest, FusionTraversalOrder5_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -5843,7 +6712,7 @@ void testGPU_FusionTraversalOrder5() { t5.sub(cg_output_tv5).abs().max()); } -void testGPU_FusionTraversalOrder6() { +TEST(NVFuserTest, FusionTraversalOrder6_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -5885,7 +6754,7 @@ void testGPU_FusionTraversalOrder6() { t4.sub(cg_output_tv4).abs().max()); } -void testGPU_FusionTraversalOrder7() { +TEST(NVFuserTest, FusionTraversalOrder7_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -5934,7 +6803,7 @@ void testGPU_FusionTraversalOrder7() { } // Test predication of grid reduction -void testGPU_FusionThreadPredicate() { +TEST(NVFuserTest, FusionThreadPredicate_CUDA) { const int gdimx = 4; const int bdimx = 128; @@ -5990,6 +6859,195 @@ void testGPU_FusionThreadPredicate() { TORCH_CHECK(aten_output_tv3.allclose(cg_output_tv3)); } +TEST(NVFuserTest, FusionLSTMCell_CUDA) { + const int hidden_features = 512; + const int batch_size = 64; + + Fusion fusion; + FusionGuard 
fg(&fusion); + + TensorView* tvs[16]; + for (size_t i = 0; i < 16; i++) { + tvs[i] = makeDummyTensor(2); + fusion.addInput(tvs[i]); + } + + auto ingate = unaryOp( + UnaryOpType::Sigmoid, add(add(add(tvs[0], tvs[1]), tvs[2]), tvs[3])); + + auto forgetgate = unaryOp( + UnaryOpType::Sigmoid, add(add(add(tvs[4], tvs[5]), tvs[6]), tvs[7])); + + auto cellgate = unaryOp( + UnaryOpType::Tanh, add(add(add(tvs[8], tvs[9]), tvs[10]), tvs[11])); + + auto outgate = unaryOp( + UnaryOpType::Sigmoid, add(add(add(tvs[12], tvs[13]), tvs[14]), tvs[15])); + + auto cx = makeContigTensor(2); + fusion.addInput(cx); + + auto cy = add(mul(forgetgate, cx), mul(ingate, cellgate)); + + auto hy = mul(outgate, unaryOp(UnaryOpType::Tanh, cy)); + + fusion.addOutput(cy); + fusion.addOutput(hy); + + std::vector inputs; + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor large_tensor0 = + at::randn({batch_size, hidden_features * 4}, options); + at::Tensor large_tensor1 = + at::randn({batch_size, hidden_features * 4}, options); + at::Tensor large_tensor2 = + at::randn({batch_size, hidden_features * 4}, options); + at::Tensor large_tensor3 = + at::randn({batch_size, hidden_features * 4}, options); + + auto chunked0 = large_tensor0.chunk(4, 1); + auto chunked1 = large_tensor1.chunk(4, 1); + auto chunked2 = large_tensor2.chunk(4, 1); + auto chunked3 = large_tensor3.chunk(4, 1); + + inputs.insert(inputs.end(), chunked0.begin(), chunked0.end()); + inputs.insert(inputs.end(), chunked1.begin(), chunked1.end()); + inputs.insert(inputs.end(), chunked2.begin(), chunked2.end()); + inputs.insert(inputs.end(), chunked3.begin(), chunked3.end()); + + auto at_ingate = + chunked0[0].add(chunked0[1]).add(chunked0[2]).add(chunked0[3]).sigmoid(); + auto at_forgetgate = + chunked1[0].add(chunked1[1]).add(chunked1[2]).add(chunked1[3]).sigmoid(); + auto at_cellgate = + chunked2[0].add(chunked2[1]).add(chunked2[2]).add(chunked2[3]).tanh(); + auto at_outgate = + chunked3[0].add(chunked3[1]).add(chunked3[2]).add(chunked3[3]).sigmoid(); + + auto at_cx = at::randn({batch_size, hidden_features}, options); + inputs.push_back(at_cx); + auto at_cy = at_forgetgate.mul(at_cx).add(at_ingate.mul(at_cellgate)); + auto at_hy = at_outgate.mul(at_cy.tanh()); + + fuser::cuda::scheduleFusion(&fusion, c10::ArrayRef(inputs)); + + torch::jit::fuser::cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + auto outputs = fe.runFusion(c10::ArrayRef(inputs)); + + TORCH_CHECK(at_cy.allclose(outputs[0], 1e-4, 1e-7)); + TORCH_CHECK(at_hy.allclose(outputs[1], 1e-4, 1e-7)); +} + +TEST(NVFuserTest, FusionComputeAtMultiBCast_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + // Set up your input tensor views + TensorView* tv0 = makeDummyTensor(1); + fusion.addInput(tv0); + + TensorView* tv1 = mul(tv0, new Float(0.5)); + TensorView* tv2 = broadcast(tv1, {true, false}); + TensorView* tv3 = broadcast(tv1, {false, true}); + TensorView* tv4 = add(tv2, tv3); + fusion.addOutput(tv4); + + // This is not supported and should throw an exception. 
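+  // tv1 is consumed by two broadcasts with different broadcast positions
+  // (tv2 = {true, false}, tv3 = {false, true}); asking for tv1 to be computed
+  // at tv3's innermost position presumably cannot also satisfy tv2, which is
+  // the unsupported case the ASSERT_ANY_THROW below checks for.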
+ ASSERT_ANY_THROW(tv1->computeAt(tv3, -1)); +} + +TEST(NVFuserTest, FusionReductionHalf_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + // Set up your input tensor views + TensorView* tv0 = makeDummyTensor(3, DataType::Half); + fusion.addInput(tv0); + + auto tv1 = castOp(DataType::Float, tv0); + auto tv2 = add(tv1, new Float(1.0)); + auto tv3 = sum(tv2, {2}); + auto tv4 = castOp(DataType::Half, tv3); + + fusion.addOutput(tv4); + + const auto options = + at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0); + at::Tensor input = at::randn({8, 8, 16}, options); + + auto reduction_tv = tv3; + + auto outputsOfReduction = DependencyCheck::getAllOutputsOf({reduction_tv}); + + // Grab only tensor views, though there shouldn't be any other type + auto tv_entries = ir_utils::filterByType(outputsOfReduction); + + std::vector tvOutputsOfReduction( + tv_entries.begin(), tv_entries.end()); + + auto reduction_params = + cuda::getReductionHeuristics(&fusion, {input}, reduction_tv); + TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); + cuda::scheduleReduction( + &fusion, reduction_params.value(), reduction_tv, tvOutputsOfReduction); + + TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); + + cuda::FusionExecutor fe; + fe.compileFusion(&fusion); + // no broadcasting needed, omitting the last optional argument; + auto outputs = fe.runFusion({input}, reduction_params.value().lparams); + + auto aten_output = input.to(c10::ScalarType::Float) + .add(1.0) + .sum({2}) + .to(c10::ScalarType::Half); + + TORCH_CHECK( + aten_output.allclose(outputs[0], 1e-04, 1e-04), + "Error of: ", + aten_output.sub(outputs[0]).abs().max()); +} + +TEST(NVFuserTest, FusionInputsIdLookup_CUDA) { + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor t0 = at::randn({16, 8, 8}, options); + at::Tensor t1 = at::randn({8, 8}, options); + at::Tensor t2 = at::randn({6, 4}, options); + + // create a cache with max size 2; + auto inputs_id_lookup = torch::jit::fuser::cuda::InputsIdLookup(2); + + // testing basic function, same encoding for identical inputs + auto id_0 = inputs_id_lookup.lookupId({t0, t1, 5.0}); + auto id_0_lookup = inputs_id_lookup.lookupId({t0, t1, 2.5}); + TORCH_CHECK(id_0.id == id_0_lookup.id); + TORCH_CHECK(inputs_id_lookup.size() == 1); + TORCH_CHECK(id_0.eviction == false); + + // new input (even tho same shape, but we have different signature because of + // missing scalar input + auto id_1 = inputs_id_lookup.lookupId({t0, t1}); + auto id_1_lookup = inputs_id_lookup.lookupId({t0, t1}); + TORCH_CHECK(id_1.id == id_1_lookup.id); + TORCH_CHECK(inputs_id_lookup.size() == 2); + TORCH_CHECK(id_1.eviction == false); + + // eviction should happen at this point + auto id_2 = inputs_id_lookup.lookupId({t2, t1}); + TORCH_CHECK(id_2.id != id_0.id); + TORCH_CHECK(id_2.id != id_1.id); + TORCH_CHECK(inputs_id_lookup.size() == 2); + TORCH_CHECK(id_2.eviction == true); + TORCH_CHECK(id_2.evict_id == id_0.id); + + // look at input 1 again + auto id_1_relook = inputs_id_lookup.lookupId({t0, t1}); + TORCH_CHECK(id_1_relook.id == id_1.id); + TORCH_CHECK(id_1_relook.eviction == false); +} + } // namespace jit } // namespace torch diff --git a/test/cpp/jit/test_graph_executor.cpp b/test/cpp/jit/test_graph_executor.cpp index 992cde217a90..923e3421738b 100644 --- a/test/cpp/jit/test_graph_executor.cpp +++ b/test/cpp/jit/test_graph_executor.cpp @@ -1,11 +1,12 @@ -#include "test/cpp/jit/test_base.h" +#include + #include "test/cpp/jit/test_utils.h" #include 
"torch/csrc/jit/runtime/graph_executor.h" namespace torch { namespace jit { -void testGraphExecutor() { +TEST(GraphExecutorTest, Basic_CUDA) { constexpr int batch_size = 4; constexpr int input_size = 256; diff --git a/test/cpp/jit/test_inliner.cpp b/test/cpp/jit/test_inliner.cpp index 2153a0389319..702f5bd97573 100644 --- a/test/cpp/jit/test_inliner.cpp +++ b/test/cpp/jit/test_inliner.cpp @@ -1,4 +1,4 @@ -#include +#include #include #include @@ -36,18 +36,16 @@ struct InlinerGuard { bool oldState_; }; -void testInliner() { - { - // disable automatic inlining so we can test it manually - InlinerGuard guard(/*shouldInline=*/false); +TEST(InlinerTest, Basic) { + // disable automatic inlining so we can test it manually + InlinerGuard guard(/*shouldInline=*/false); - CompilationUnit cu(testSource); - auto& fn = cu.get_function("foo3"); + CompilationUnit cu(testSource); + auto& fn = cu.get_function("foo3"); - auto g = fn.graph(); - Inline(*g); - FileCheck().check_count("prim::Print", 3)->run(*g); - } + auto g = fn.graph(); + Inline(*g); + FileCheck().check_count("prim::Print", 3)->run(*g); } } // namespace jit } // namespace torch diff --git a/test/cpp/jit/test_interface.cpp b/test/cpp/jit/test_interface.cpp index b256e2328ceb..04a532459426 100644 --- a/test/cpp/jit/test_interface.cpp +++ b/test/cpp/jit/test_interface.cpp @@ -1,5 +1,5 @@ +#include -#include #include #include @@ -44,7 +44,7 @@ static void import_libs( si.loadType(QualifiedName(class_name)); } -void testModuleInterfaceSerialization() { +TEST(InterfaceTest, ModuleInterfaceSerialization) { auto cu = std::make_shared(); Module parentMod("parentMod", cu); Module subMod("subMod", cu); diff --git a/test/cpp/jit/test_interpreter.cpp b/test/cpp/jit/test_interpreter.cpp index 5977b0c0494a..da4607d7f047 100644 --- a/test/cpp/jit/test_interpreter.cpp +++ b/test/cpp/jit/test_interpreter.cpp @@ -1,12 +1,18 @@ -#include "test/cpp/jit/test_base.h" +#include + #include "test/cpp/jit/test_utils.h" -#include namespace torch { namespace jit { -void testTypeCheck() { - { +class TypeCheckTest : public ::testing::Test { + protected: + TypeCheckTest() : interp(makeInterp()) {} + + InterpreterState interp; + + private: + static InterpreterState makeInterp() { auto graph = std::make_shared(); std::unordered_map vmap; parseIR( @@ -20,88 +26,97 @@ graph(%a.1 : Tensor, vmap); Code function(graph, ""); - InterpreterState interp(function); - { - // TypeCheck yields to true! Shape, grad and device matches. 
- auto a = at::zeros({2, 2}, at::kFloat); - auto b = at::ones({3, 3}, at::kFloat); - a.set_requires_grad(true); - a = a.to(at::kCPU); - std::vector stack({a, b}); - interp.run(stack); - ASSERT_TRUE(exactlyEqual(stack[0].toTensor(), a)); - ASSERT_TRUE(exactlyEqual(stack[1].toTensor(), b)); - ASSERT_TRUE(stack[2].toBool()); - } - { - auto a = at::zeros({2, 2}, at::kFloat); - auto b = at::ones({2, 2}, at::kFloat); // Size mismatch - a.set_requires_grad(true); - a = a.to(at::kCPU); - std::vector stack({a, b}); - interp.run(stack); - ASSERT_FALSE(stack[2].toBool()); - } - { - auto a = at::zeros({2, 2}, at::kFloat); - auto b = at::ones({3, 3}, at::kFloat); - a = a.to(at::kCPU); - a.set_requires_grad(false); // Gradient mismatch - std::vector stack({a, b}); - interp.run(stack); - ASSERT_FALSE(stack[2].toBool()); - } - { - auto a = at::zeros({2, 2}, at::kFloat); - auto b = at::ones({3, 3}, at::kFloat); - a = a.to(at::kCPU); - a.set_requires_grad(true); - a = a.to(at::kInt); // Scalar type mismatch - std::vector stack({a, b}); - interp.run(stack); - ASSERT_FALSE(stack[2].toBool()); - } - { - auto a = at::zeros({2, 2}, at::kFloat); - auto b = at::ones({3, 3}, at::kFloat); - a.set_requires_grad(true); - a = a.to(at::kCUDA); // Device mismatch - std::vector stack({a, b}); - interp.run(stack); - ASSERT_FALSE(stack[2].toBool()); - } + return InterpreterState(function); } +}; - try { // Test empty Typecheck raises an internal assertion - auto graph = std::make_shared(); - std::unordered_map vmap; - parseIR( - R"IR( -graph(%a.1 : Tensor, - %b.1 : Tensor): - %type_matched : bool = prim::TypeCheck() - return (%type_matched) - )IR", - &*graph, - vmap); - } catch (const std::exception& e) { - } - try { // Test for assertion if num_inputs + 1 != num_outputs - auto graph = std::make_shared(); - std::unordered_map vmap; - parseIR( - R"IR( -graph(%a.1 : Tensor, - %b.1 : Tensor): - %type_matched : bool = prim::TypeCheck(%a.1) - return (%type_matched) - )IR", - &*graph, - vmap); - } catch (const std::exception& e) { - } +TEST_F(TypeCheckTest, MatchingType) { + // TypeCheck yields to true! Shape, grad and device matches. 
+ auto a = at::zeros({2, 2}, at::kFloat); + auto b = at::ones({3, 3}, at::kFloat); + a.set_requires_grad(true); + a = a.to(at::kCPU); + std::vector stack({a, b}); + interp.run(stack); + ASSERT_TRUE(exactlyEqual(stack[0].toTensor(), a)); + ASSERT_TRUE(exactlyEqual(stack[1].toTensor(), b)); + ASSERT_TRUE(stack[2].toBool()); +} + +TEST_F(TypeCheckTest, SizeMismatch) { + auto a = at::zeros({2, 2}, at::kFloat); + auto b = at::ones({2, 2}, at::kFloat); // Size mismatch + a.set_requires_grad(true); + a = a.to(at::kCPU); + std::vector stack({a, b}); + interp.run(stack); + ASSERT_FALSE(stack[2].toBool()); } -void testInterp() { + +TEST_F(TypeCheckTest, GradientMismatch) { + auto a = at::zeros({2, 2}, at::kFloat); + auto b = at::ones({3, 3}, at::kFloat); + a = a.to(at::kCPU); + a.set_requires_grad(false); // Gradient mismatch + std::vector stack({a, b}); + interp.run(stack); + ASSERT_FALSE(stack[2].toBool()); +} + +TEST_F(TypeCheckTest, ScalarTypeMismatch) { + auto a = at::zeros({2, 2}, at::kFloat); + auto b = at::ones({3, 3}, at::kFloat); + a = a.to(at::kCPU); + a.set_requires_grad(true); + a = a.to(at::kInt); // Scalar type mismatch + std::vector stack({a, b}); + interp.run(stack); + ASSERT_FALSE(stack[2].toBool()); +} + +TEST_F(TypeCheckTest, DeviceMismatch_CUDA) { + auto a = at::zeros({2, 2}, at::kFloat); + auto b = at::ones({3, 3}, at::kFloat); + a.set_requires_grad(true); + a = a.to(at::kCUDA); // Device mismatch + std::vector stack({a, b}); + interp.run(stack); + ASSERT_FALSE(stack[2].toBool()); +} + +// TODO: These tests weren't doing anything. +// TEST(TypeCheckErrorTest, EmptyCheckRaises) { +// // Test empty Typecheck raises an internal assertion +// auto graph = std::make_shared(); +// std::unordered_map vmap; +// EXPECT_ANY_THROW(parseIR( +// R"IR( +// graph(%a.1 : Tensor, +// %b.1 : Tensor): +// %type_matched : bool = prim::TypeCheck() +// return (%type_matched) +// )IR", +// &*graph, +// vmap)); +// } + +// TODO: These tests weren't doing anything. 
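+// (In their original form both cases wrapped parseIR in a try/catch with an
+// empty handler and no assertion, so they passed whether or not anything
+// threw; the commented-out versions here sketch the EXPECT_ANY_THROW form
+// instead.)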
+// TEST(TypeCheckErrorTest, WrongInputOutputCountRaises) { +// // Test for assertion if num_inputs + 1 != num_outputs +// auto graph = std::make_shared(); +// std::unordered_map vmap; +// EXPECT_ANY_THROW(parseIR( +// R"IR( +// graph(%a.1 : Tensor, +// %b.1 : Tensor): +// %type_matched : bool = prim::TypeCheck(%a.1) +// return (%type_matched) +// )IR", +// &*graph, +// vmap)); +// } + +TEST(InterpreterTest, Basic_CUDA) { constexpr int batch_size = 4; constexpr int input_size = 256; constexpr int seq_len = 32; diff --git a/test/cpp/jit/test_ir.cpp b/test/cpp/jit/test_ir.cpp index a05ff70061bf..2423bbf0c773 100644 --- a/test/cpp/jit/test_ir.cpp +++ b/test/cpp/jit/test_ir.cpp @@ -1,11 +1,12 @@ -#include "test/cpp/jit/test_base.h" +#include + #include "test/cpp/jit/test_utils.h" #include "torch/csrc/jit/ir/irparser.h" namespace torch { namespace jit { -void testAttributes() { +TEST(IRTest, Attributes) { Graph g; auto one = attr::alpha; auto two = attr::device; @@ -33,7 +34,7 @@ void testAttributes() { ASSERT_EQ(attr2.f(one), 5); } -void testBlocks() { +TEST(IRTest, Blocks) { auto g = std::make_shared(); const auto graph_string = R"IR( graph(%a : Tensor, @@ -92,7 +93,7 @@ void testBlocks() { ->run(*g2); } -void testCommonAncestor() { +TEST(IRTest, CommonAncestor) { std::string input_str = R"( graph(%x : Tensor, %a.1 : bool, diff --git a/test/cpp/jit/test_irparser.cpp b/test/cpp/jit/test_irparser.cpp index a71b64a7b85b..57f21f5bf5f9 100644 --- a/test/cpp/jit/test_irparser.cpp +++ b/test/cpp/jit/test_irparser.cpp @@ -1,7 +1,8 @@ +#include + #include #include #include -#include "test/cpp/jit/test_base.h" #include #include @@ -38,52 +39,52 @@ static void checkRoundtrip(const std::string& s) { AT_ASSERT(original == parsed); } -void testIRParser() { - { - auto graph = std::make_shared(); - std::unordered_map vmap; - parseIR( - R"IR( +TEST(IRParserTest, Basic) { + auto graph = std::make_shared(); + std::unordered_map vmap; + parseIR( + R"IR( graph(%0 : Tensor, %1 : Tensor): %2 : Tensor = foo::add(%0, %1) %res, %3 = foo::mul(%0, %2) %x, %y = foo::combine(%res, %2, %3) return (%x, %y, %res))IR", - &*graph, - vmap); + &*graph, + vmap); - AT_ASSERT(graph->inputs().size() == 2); - AT_ASSERT(graph->outputs().size() == 3); - Value* x = graph->outputs()[0]; - Value* y = graph->outputs()[1]; - Value* res = graph->outputs()[2]; - Value* t0 = graph->inputs()[0]; - Value* t1 = graph->inputs()[1]; - AT_ASSERT(vmap["x"] == x); - AT_ASSERT(vmap["y"] == y); - AT_ASSERT(vmap["res"] == res); - AT_ASSERT(vmap["0"] == t0); - AT_ASSERT(vmap["1"] == t1); - AT_ASSERT(x->node() == y->node()); - Node* comb = x->node(); - Value* t2 = comb->inputs()[1]; - Value* t3 = comb->inputs()[2]; - AT_ASSERT(vmap["2"] == t2); - AT_ASSERT(vmap["3"] == t3); - AT_ASSERT(comb->kind().toQualString() == std::string("foo::combine")); - AT_ASSERT(comb->outputs() == std::vector({x, y})); - AT_ASSERT(comb->inputs() == std::vector({res, t2, t3})); - Node* mul = res->node(); - AT_ASSERT(mul->kind().toQualString() == std::string("foo::mul")); - AT_ASSERT(mul->inputs() == std::vector({t0, t2})); - AT_ASSERT(mul->outputs() == std::vector({res, t3})); - Node* add = t2->node(); - AT_ASSERT(add->kind().toQualString() == std::string("foo::add")); - AT_ASSERT(add->inputs() == std::vector({t0, t1})); - AT_ASSERT(add->outputs() == std::vector({t2})); - } - { - checkRoundtrip(R"IR( + AT_ASSERT(graph->inputs().size() == 2); + AT_ASSERT(graph->outputs().size() == 3); + Value* x = graph->outputs()[0]; + Value* y = graph->outputs()[1]; + Value* res = 
graph->outputs()[2]; + Value* t0 = graph->inputs()[0]; + Value* t1 = graph->inputs()[1]; + AT_ASSERT(vmap["x"] == x); + AT_ASSERT(vmap["y"] == y); + AT_ASSERT(vmap["res"] == res); + AT_ASSERT(vmap["0"] == t0); + AT_ASSERT(vmap["1"] == t1); + AT_ASSERT(x->node() == y->node()); + Node* comb = x->node(); + Value* t2 = comb->inputs()[1]; + Value* t3 = comb->inputs()[2]; + AT_ASSERT(vmap["2"] == t2); + AT_ASSERT(vmap["3"] == t3); + AT_ASSERT(comb->kind().toQualString() == std::string("foo::combine")); + AT_ASSERT(comb->outputs() == std::vector({x, y})); + AT_ASSERT(comb->inputs() == std::vector({res, t2, t3})); + Node* mul = res->node(); + AT_ASSERT(mul->kind().toQualString() == std::string("foo::mul")); + AT_ASSERT(mul->inputs() == std::vector({t0, t2})); + AT_ASSERT(mul->outputs() == std::vector({res, t3})); + Node* add = t2->node(); + AT_ASSERT(add->kind().toQualString() == std::string("foo::add")); + AT_ASSERT(add->inputs() == std::vector({t0, t1})); + AT_ASSERT(add->outputs() == std::vector({t2})); +} + +TEST(IRParserTest, NestedBlock) { + checkRoundtrip(R"IR( graph(): %0 : Tensor = a::a() block0(): @@ -95,9 +96,10 @@ graph(): %3 : Tensor = d::d() return (%3) )IR"); - } - { - checkRoundtrip(R"IR( +} + +TEST(IRParserTest, If) { + checkRoundtrip(R"IR( graph(%0 : Tensor, %1 : Tensor, %2 : Tensor): @@ -114,9 +116,10 @@ graph(%0 : Tensor, %11 : Tensor = aten::add(%5, %3, %10) return (%11) )IR"); - } - { - checkRoundtrip(R"IR( +} + +TEST(IRParserTest, If2) { + checkRoundtrip(R"IR( graph(%0 : Tensor, %1 : Tensor, %2 : Tensor): @@ -133,40 +136,43 @@ graph(%0 : Tensor, %11 : Tensor = aten::add(%5, %3, %10) return (%11) )IR"); - } - { - auto graph = std::make_shared(); - parseIR( - R"IR( +} + +TEST(IRParserTest, InferredTypeIsTensor) { + auto graph = std::make_shared(); + parseIR( + R"IR( graph(%a): return (%a))IR", - &*graph); - AT_ASSERT(graph->inputs()[0]->type()->isSubtypeOf(TensorType::get())); - } - { - // Check that parser correctly handles values reusing the same name. - auto graph = std::make_shared(); - parseIR( - R"IR( + &*graph); + AT_ASSERT(graph->inputs()[0]->type()->isSubtypeOf(TensorType::get())); +} + +TEST(IRParserTest, ValueReuse) { + // Check that parser correctly handles values reusing the same name. + auto graph = std::make_shared(); + parseIR( + R"IR( graph(%x): %x = a::a(%x) %x = b::b(%x) return (%x))IR", - &*graph); - Value* x0 = graph->inputs()[0]; - Value* x2 = graph->outputs()[0]; - Node* b = x2->node(); - Value* x1 = b->inputs()[0]; - Node* a = x1->node(); - AT_ASSERT(a->inputs() == std::vector({x0})); - AT_ASSERT(a->outputs() == std::vector({x1})); - AT_ASSERT(b->inputs() == std::vector({x1})); - AT_ASSERT(b->outputs() == std::vector({x2})); - } - { - // Check that parser handles attributes and types. - checkRoundtrip( - R"IR( + &*graph); + Value* x0 = graph->inputs()[0]; + Value* x2 = graph->outputs()[0]; + Node* b = x2->node(); + Value* x1 = b->inputs()[0]; + Node* a = x1->node(); + AT_ASSERT(a->inputs() == std::vector({x0})); + AT_ASSERT(a->outputs() == std::vector({x1})); + AT_ASSERT(b->inputs() == std::vector({x1})); + AT_ASSERT(b->outputs() == std::vector({x2})); +} + +TEST(IRParserTest, Attributes) { + // Check that parser handles attributes and types. + checkRoundtrip( + R"IR( graph(%0 : Tensor, %1 : Tensor, %2 : Tensor): @@ -176,155 +182,147 @@ graph(%0 : Tensor, %8 : string = z::z() return (%7) )IR"); - } +} - { - checkRoundtrip( - R"IR( +TEST(IRParserTest, OptionalTypes) { + checkRoundtrip( + R"IR( graph(%0 : Tensor, %1 : Tensor, %2 : Tensor): %3 : int? 
= prim::Constant() return (%3) )IR"); - } +} - { - checkRoundtrip( - R"IR( +TEST(IRParserTest, StarTensor) { + checkRoundtrip( + R"IR( graph(%0 : Tensor, %1 : Tensor, %2 : Tensor): %3 : Float(*, *, *) = prim::Constant() return (%3) )IR"); - } +} - { - checkRoundtrip( - R"IR( +TEST(IRParserTest, UnshapedTensor) { + checkRoundtrip( + R"IR( graph(%0 : Tensor, %1 : Tensor, %2 : Tensor): %3 : Long() = prim::Constant() return (%3) )IR"); - } +} - { - checkRoundtrip( - R"IR( +TEST(IRParserTest, ShapedTensor) { + checkRoundtrip( + R"IR( graph(%0 : Tensor, %1 : Tensor, %2 : Tensor): %3 : Double(4, 4, 5) = prim::Constant() return (%3) )IR"); - } +} - { - checkRoundtrip( - R"IR( +TEST(IRParserTest, NestedContrainer) { + checkRoundtrip( + R"IR( graph(): %0 : float[] = prim::Constant[value=[1., 2., 3.]]() %1 : str[] = prim::Constant[value=["ab", "cd", "ef"]]() %2 : (float[], str[]) = prim::TupleConstruct(%0, %1) return (%2) )IR"); - } +} - { - bool error_thrown = false; - try { - checkRoundtrip( - R"IR( +TEST(IRParserTest, MalformedShapeAnnotation) { + EXPECT_ANY_THROW(checkRoundtrip( + R"IR( graph(%0 : Tensor, %1 : Tensor, %2 : Tensor): %3 : Double(4!, 4, 5) = prim::Constant() return (%3) -)IR"); - } catch (const std::exception& error) { - error_thrown = true; - } - AT_ASSERT(error_thrown); - } +)IR")); +} - { - auto graph = std::make_shared(); - const std::string& text = - R"IR( +TEST(IRParserTest, FileCheck) { + auto graph = std::make_shared(); + const std::string& text = + R"IR( graph(%a): # CHECK: return return (%a))IR"; - parseIR(text, &*graph); - AT_ASSERT(graph->inputs()[0]->type()->isSubtypeOf(TensorType::get())); - torch::jit::testing::FileCheck().run(text, *graph); - } + parseIR(text, &*graph); + AT_ASSERT(graph->inputs()[0]->type()->isSubtypeOf(TensorType::get())); + torch::jit::testing::FileCheck().run(text, *graph); +} - { - auto graph = std::make_shared(); - std::unordered_map vmap; - parseIR( - R"IR( +TEST(IRParserTest, Strides) { + auto graph = std::make_shared(); + std::unordered_map vmap; + parseIR( + R"IR( graph(%a : Float(4, 5), %b : Float(4:5, 5:1), %c : Double(*, *)): return (%a) )IR", - &*graph, - vmap); - Value* a = graph->inputs()[0]; - Value* b = graph->inputs()[1]; - Value* c = graph->inputs()[2]; + &*graph, + vmap); + Value* a = graph->inputs()[0]; + Value* b = graph->inputs()[1]; + Value* c = graph->inputs()[2]; - auto a_type = a->type()->cast(); - auto a_sizes = *a_type->sizes().concrete_sizes(); - auto a_strides = a_type->strides().concrete_sizes(); - AT_ASSERT(a_sizes[0] == 4 && a_sizes[1] == 5); - AT_ASSERT(a_strides == c10::nullopt); + auto a_type = a->type()->cast(); + auto a_sizes = *a_type->sizes().concrete_sizes(); + auto a_strides = a_type->strides().concrete_sizes(); + AT_ASSERT(a_sizes[0] == 4 && a_sizes[1] == 5); + AT_ASSERT(a_strides == c10::nullopt); - auto b_type = b->type()->cast(); - auto b_sizes = *b_type->sizes().concrete_sizes(); - auto b_strides = *(b_type->strides().sizes()); - AT_ASSERT(b_sizes[0] == 4 && b_sizes[1] == 5); - AT_ASSERT(*b_strides[0] == 5 && *b_strides[1] == 1); + auto b_type = b->type()->cast(); + auto b_sizes = *b_type->sizes().concrete_sizes(); + auto b_strides = *(b_type->strides().sizes()); + AT_ASSERT(b_sizes[0] == 4 && b_sizes[1] == 5); + AT_ASSERT(*b_strides[0] == 5 && *b_strides[1] == 1); - auto c_type = c->type()->cast(); - AT_ASSERT(*c_type->sizes().size() == 2); - AT_ASSERT(c_type->sizes().concrete_sizes() == c10::nullopt); - AT_ASSERT(c_type->strides().concrete_sizes() == c10::nullopt); - } - { - auto graph = 
std::make_shared(); - std::unordered_map vmap; - bool error_thrown = false; - try { - parseIR( - R"IR( + auto c_type = c->type()->cast(); + AT_ASSERT(*c_type->sizes().size() == 2); + AT_ASSERT(c_type->sizes().concrete_sizes() == c10::nullopt); + AT_ASSERT(c_type->strides().concrete_sizes() == c10::nullopt); +} + +TEST(IRParserTest, MalformedStrides) { + auto graph = std::make_shared(); + std::unordered_map vmap; + bool error_thrown = false; + EXPECT_ANY_THROW(parseIR( + R"IR( graph(%a : Float(4:5, 5)): return (%a) )IR", - &*graph, - vmap); - } catch (const std::exception& error) { - error_thrown = true; - } - AT_ASSERT(error_thrown); - } - { - checkRoundtrip( - R"IR( + &*graph, + vmap)); +} + +TEST(IRParserTest, TensorShapes) { + checkRoundtrip( + R"IR( graph(%a : Float(4, 5), %b : Float(4:5, 5:1), %c : Double(*, *)): return (%a) )IR"); - } - { - checkRoundtrip( - R"IR( +} + +TEST(IRParserTest, DeviceAndRequiresGradTensors) { + checkRoundtrip( + R"IR( graph(%a : Float(*, *, device=cpu), %b : Float(*, *, requires_grad=1), %c : Long(5, 10, requires_grad=1, device=cpu), @@ -337,41 +335,45 @@ graph(%a : Float(*, *, device=cpu), %j : Double(*, *, requires_grad=0)): return (%a) )IR"); - } - { - auto graph = std::make_shared(); - parseIR( - R"IR( +} + +TEST(IRParserTest, ListConstant) { + auto graph = std::make_shared(); + parseIR( + R"IR( graph(): %d : int[] = prim::Constant[value=[1,2,3]]() return (%d) )IR", - &*graph); - Node* n = graph->outputs()[0]->node(); - AT_ASSERT(n->kind() == prim::Constant); - AT_ASSERT(n->kindOf(attr::value) == AttributeKind::ival); - const auto& genericList = n->ival(attr::value).toList(); - std::vector int_vals; - for (const IValue& ival : genericList) { - int_vals.push_back(ival.toInt()); - } - AT_ASSERT(int_vals.size() == 3); - AT_ASSERT(int_vals[0] == 1 && int_vals[1] == 2 && int_vals[2] == 3); + &*graph); + Node* n = graph->outputs()[0]->node(); + AT_ASSERT(n->kind() == prim::Constant); + AT_ASSERT(n->kindOf(attr::value) == AttributeKind::ival); + const auto& genericList = n->ival(attr::value).toList(); + std::vector int_vals; + for (const IValue& ival : genericList) { + int_vals.push_back(ival.toInt()); } - { - checkRoundtrip( - R"IR( + AT_ASSERT(int_vals.size() == 3); + AT_ASSERT(int_vals[0] == 1 && int_vals[1] == 2 && int_vals[2] == 3); +} + +TEST(IRParserTest, PartialStarTensor) { + checkRoundtrip( + R"IR( graph(%x : Float(10, *, 10)): return (%x) )IR"); - checkRoundtrip( - R"IR( +} + +TEST(IRParserTest, ComplexTensorAttributes) { + checkRoundtrip( + R"IR( graph(%x : Double(*, 200, *, requires_grad=1, device=cuda:1), %b : Float(5, *, requires_grad=1), %c : Long(*, 10, device=cpu)): return (%x) )IR"); - } } } // namespace jit } // namespace torch diff --git a/test/cpp/jit/test_jit_type.cpp b/test/cpp/jit/test_jit_type.cpp index 16c69ccd05fd..9462a572ea65 100644 --- a/test/cpp/jit/test_jit_type.cpp +++ b/test/cpp/jit/test_jit_type.cpp @@ -1,4 +1,5 @@ -#include +#include + #include #include #include "torch/csrc/jit/ir/ir.h" @@ -7,7 +8,7 @@ namespace torch { namespace jit { -void testUnifyTypes() { +TEST(JitTypeTest, UnifyTypes) { auto bool_tensor = TensorType::get()->withScalarType(at::kBool); auto opt_bool_tensor = OptionalType::create(bool_tensor); auto unified_opt_bool = unifyTypes(bool_tensor, opt_bool_tensor); diff --git a/test/cpp/jit/test_lite_interpreter.cpp b/test/cpp/jit/test_lite_interpreter.cpp index d09048413aec..b262075a42aa 100644 --- a/test/cpp/jit/test_lite_interpreter.cpp +++ b/test/cpp/jit/test_lite_interpreter.cpp @@ -1,5 +1,6 @@ +#include 
+ #include -#include #include #include #include @@ -10,11 +11,19 @@ #include +#define ASSERT_THROWS_WITH(statement, substring) \ + try { \ + (void)statement; \ + ASSERT_TRUE(false); \ + } catch (const std::exception& e) { \ + ASSERT_NE(std::string(e.what()).find(substring), std::string::npos); \ + } + // Tests go in torch::jit namespace torch { namespace jit { -void testLiteInterpreterUpsampleNearest2d() { +TEST(LiteInterpreterTest, UpsampleNearest2d) { Module m("m"); m.define(R"( def forward(self, input: Tensor, scale:float): @@ -37,7 +46,7 @@ void testLiteInterpreterUpsampleNearest2d() { ASSERT_TRUE(resd.equal(refd)); } -void testLiteInterpreterAdd() { +TEST(LiteInterpreterTest, Add) { Module m("m"); m.register_parameter("foo", torch::ones({}), false); // TODO: support default param val, which was pushed in @@ -71,7 +80,7 @@ void testLiteInterpreterAdd() { AT_ASSERT(resd == refd); } -void testLiteInterpreterConv() { +TEST(LiteInterpreterTest, Conv) { auto s = std::getenv("PYTORCH_TEST_WITH_TSAN"); if (s && strcmp(s, "1") == 0) return; @@ -103,7 +112,7 @@ void testLiteInterpreterConv() { outputref[0][0][0][0].item() == output[0][0][0][0].item()); } -void testLiteInterpreterInline() { +TEST(LiteInterpreterTest, Inline) { Module m("m"); m.define(R"JIT( def foo1(self, x): @@ -123,7 +132,7 @@ void testLiteInterpreterInline() { AT_ASSERT(output.toTensor().item() == 7.0); } -void testLiteInterpreterTuple() { +TEST(LiteInterpreterTest, Tuple) { Module m("m"); m.define(R"JIT( def foo(self, x): @@ -141,7 +150,7 @@ void testLiteInterpreterTuple() { AT_ASSERT(output.toTuple()->elements()[1].toInt() == 2); } -void testLiteInterpreterDict() { +TEST(LiteInterpreterTest, Dict) { Module m("m"); m.define(R"JIT( def foo(self, x): @@ -159,7 +168,7 @@ void testLiteInterpreterDict() { AT_ASSERT(output.toGenericDict().at("result").toTensor().item().toInt() == 2); } -void testLiteInterpreterPrimOverload() { +TEST(LiteInterpreterTest, PrimOverload) { /* // temporarily disabled script::Module m("m"); @@ -178,7 +187,7 @@ void testLiteInterpreterPrimOverload() { */ } -void testLiteInterpreterPrim() { +TEST(LiteInterpreterTest, Prim) { Module m("m"); m.define(R"JIT( def forward(self, x): @@ -204,7 +213,33 @@ void testLiteInterpreterPrim() { AT_ASSERT(resi == refi); } -void testLiteInterpreterLoadOrigJit() { +TEST(LiteInterpreterTest, PrimScalar) { + Module m("m"); + m.define(R"JIT( + def forward(self, x): + return int(x.item()) + )JIT"); + + std::vector inputs; + auto minput = 3.5 * torch::ones({}); + inputs.emplace_back(minput); + auto ref = m.run_method("forward", minput); + + std::stringstream ss; + m._save_for_mobile(ss); + mobile::Module bc = _load_for_mobile(ss); + IValue res; + for (int i = 0; i < 3; ++i) { + auto bcinputs = inputs; + res = bc.get_method("forward")(bcinputs); + } + + auto resi = res.toInt(); + auto refi = ref.toInt(); + AT_ASSERT(resi == refi); +} + +TEST(LiteInterpreterTest, LoadOrigJit) { Module m("m"); m.register_parameter("foo", torch::ones({}), false); m.define(R"( @@ -217,7 +252,7 @@ void testLiteInterpreterLoadOrigJit() { ASSERT_THROWS_WITH(_load_for_mobile(ss), "file not found"); } -void testLiteInterpreterWrongMethodName() { +TEST(LiteInterpreterTest, WrongMethodName) { Module m("m"); m.register_parameter("foo", torch::ones({}), false); m.define(R"( @@ -234,7 +269,7 @@ void testLiteInterpreterWrongMethodName() { ASSERT_THROWS_WITH(bc.get_method("forward")(inputs), "is not defined"); } -void testLiteInterpreterSetState() { +TEST(LiteInterpreterTest, SetState) { Module m("m"); 
m.register_parameter("foo", torch::ones({}), false); m.define(R"( @@ -282,7 +317,7 @@ class TorchBindLiteInterpreterTestStruct } }; -void testLiteInterpreterBuiltinFunction() { +TEST(LiteInterpreterTest, BuiltinFunction) { script::Module m("m"); auto custom_class_obj = make_custom_class(); @@ -302,7 +337,7 @@ void testLiteInterpreterBuiltinFunction() { AT_ASSERT(str == expected); } -void testLiteInterpreterModuleInfoBasic() { +TEST(LiteInterpreterTest, ModuleInfoBasic) { Module m("M"); m.define(R"JIT( def forward(self, x): @@ -331,7 +366,7 @@ void testLiteInterpreterModuleInfoBasic() { AT_ASSERT(module_debug_info_set == expected_result); } -void testLiteInterpreterNotSavingModuleInfo() { +TEST(LiteInterpreterTest, NotSaveModuleInfo) { Module m("M"); m.define(R"JIT( def forward(self, x): @@ -354,7 +389,7 @@ void testLiteInterpreterNotSavingModuleInfo() { } } -void testLiteInterpreterOneSubmoduleModuleInfo() { +TEST(LiteInterpreterTest, OneSubmoduleModuleInfo) { Module a("A"); a.define(R"JIT( def forward(self, x): @@ -390,7 +425,7 @@ void testLiteInterpreterOneSubmoduleModuleInfo() { AT_ASSERT(module_debug_info_set == expected_result); } -void testLiteInterpreterTwoSubmodulesModuleInfo() { +TEST(LiteInterpreterTest, TwoSubmodulesModuleInfo) { Module a("A"); a.define(R"JIT( def forward(self, x): @@ -432,7 +467,7 @@ void testLiteInterpreterTwoSubmodulesModuleInfo() { AT_ASSERT(module_debug_info_set == expected_result); } -void testLiteInterpreterSequentialModuleInfo() { +TEST(LiteInterpreterTest, SequentialModuleInfo) { Module a("A"); a.define(R"JIT( def forward(self, x): @@ -474,7 +509,7 @@ void testLiteInterpreterSequentialModuleInfo() { AT_ASSERT(module_debug_info_set == expected_result); } -void testLiteInterpreterHierarchyModuleInfo() { +TEST(LiteInterpreterTest, HierarchyModuleInfo) { Module a("A"); a.define(R"JIT( def forward(self, x): @@ -520,7 +555,7 @@ void testLiteInterpreterHierarchyModuleInfo() { AT_ASSERT(module_debug_info_set == expected_result); } -void testLiteInterpreterDuplicatedClassTypeModuleInfo() { +TEST(LiteInterpreterTest, DuplicatedClassTypeModuleInfo) { Module a("A"); a.define(R"JIT( def forward(self, x): @@ -560,7 +595,7 @@ void testLiteInterpreterDuplicatedClassTypeModuleInfo() { AT_ASSERT(module_debug_info_set == expected_result); } -void testLiteInterpreterEval() { +TEST(LiteInterpreterTest, Eval) { std::vector inputs; Module m("m"); @@ -593,7 +628,7 @@ void testLiteInterpreterEval() { outputref[0][0][0][0].item() == output[0][0][0][0].item()); } -void testLiteInterpreterFindWrongMethodName() { +TEST(LiteInterpreterTest, FindWrongMethodName) { Module m("m"); m.register_parameter("foo", torch::ones({}), false); m.define(R"( @@ -607,7 +642,7 @@ void testLiteInterpreterFindWrongMethodName() { ASSERT_TRUE(bc.find_method("forward") == c10::nullopt); } -void testLiteInterpreterFindAndRunMethod() { +TEST(LiteInterpreterTest, FindAndRunMethod) { Module m("m"); m.register_parameter("foo", torch::ones({}), false); m.define(R"( @@ -637,7 +672,7 @@ void testLiteInterpreterFindAndRunMethod() { AT_ASSERT(resd == refd); } -void testLiteInterpreterRunMethodVariadic() { +TEST(LiteInterpreterTest, RunMethodVariadic) { Module m("m"); m.register_parameter("foo", torch::ones({}), false); m.define(R"( diff --git a/test/cpp/jit/test_lite_trainer.cpp b/test/cpp/jit/test_lite_trainer.cpp index b70c4db62c70..9a988ecb2db1 100644 --- a/test/cpp/jit/test_lite_trainer.cpp +++ b/test/cpp/jit/test_lite_trainer.cpp @@ -1,5 +1,6 @@ +#include + #include -#include #include #include #include @@ 
-16,7 +17,7 @@ namespace torch { namespace jit { -void testLiteInterpreterParams() { +TEST(LiteTrainerTest, Params) { Module m("m"); m.register_parameter("foo", torch::ones({1}, at::requires_grad()), false); m.define(R"( @@ -74,7 +75,7 @@ void testLiteInterpreterParams() { AT_ASSERT(parameters[0].item() == bc_parameters[0].item()); } -void testMobileNamedParameters() { +TEST(MobileTest, NamedParameters) { Module m("m"); m.register_parameter("foo", torch::ones({}), false); m.define(R"( @@ -99,7 +100,7 @@ void testMobileNamedParameters() { } } -void testMobileSaveLoadData() { +TEST(MobileTest, SaveLoadData) { Module m("m"); m.register_parameter("foo", torch::ones({}), false); m.define(R"( @@ -127,7 +128,7 @@ void testMobileSaveLoadData() { } } -void testMobileSaveLoadParameters() { +TEST(MobileTest, SaveLoadParameters) { Module m("m"); m.register_parameter("foo", torch::ones({}), false); m.define(R"( @@ -157,7 +158,7 @@ void testMobileSaveLoadParameters() { } } -void testMobileSaveLoadParametersEmpty() { +TEST(MobileTest, SaveLoadParametersEmpty) { Module m("m"); m.define(R"( def add_it(self, x): @@ -180,7 +181,7 @@ void testMobileSaveLoadParametersEmpty() { AT_ASSERT(mobile_params.size() == 0); } -void testLiteSGD() { +TEST(LiteTrainerTest, SGD) { Module m("m"); m.register_parameter("foo", torch::ones({1}, at::requires_grad()), false); m.define(R"( @@ -253,7 +254,7 @@ struct DummyDataset : torch::data::datasets::Dataset { }; } // namespace -void testLiteSequentialSampler() { +TEST(LiteTrainerTest, SequentialSampler) { // test that sampler can be used with dataloader const int kBatchSize = 10; auto data_loader = diff --git a/test/cpp/jit/test_misc.cpp b/test/cpp/jit/test_misc.cpp index 4941c11d6cae..d205ae3d58db 100644 --- a/test/cpp/jit/test_misc.cpp +++ b/test/cpp/jit/test_misc.cpp @@ -1,9 +1,10 @@ +#include + #include #include #include #include -#include "test/cpp/jit/test_base.h" #include "test/cpp/jit/test_utils.h" #include @@ -92,7 +93,7 @@ std::ostream& operator<<(std::ostream& out, const std::vector& list) { return out; } -void testInternedStrings() { +TEST(InternedStringsTest, Basic) { ASSERT_EQ(prim::Param, Symbol::prim("Param")); ASSERT_EQ(prim::Return, Symbol::prim("Return")); ASSERT_EQ(prim::Return.toUnqualString(), std::string("Return")); @@ -108,7 +109,7 @@ void testInternedStrings() { ASSERT_EQ(Symbol(symstart + 2).toUnqualString(), std::string("What2")); } -void testFromQualString() { +TEST(FromQualStringTest, Basic) { ASSERT_EQ(Symbol::fromQualString("prim::Param"), Symbol::prim("Param")); ASSERT_EQ(Symbol::fromQualString("aten::mm"), Symbol::aten("mm")); ASSERT_EQ(Symbol::fromQualString("onnx::LSTM"), Symbol::onnx("LSTM")); @@ -138,7 +139,7 @@ void testFromQualString() { } } -void testTHNNConv() { +TEST(THNNConvTest, Basic) { std::vector input_size = {4, 3, 15, 17}; // B x C x H x W std::vector kernel_size = {3, 5}; std::vector stride = {1, 2}; @@ -233,7 +234,7 @@ void testTHNNConv() { assertAllClose(tensor_grads_out, expected_tensor_grads_out); } -void testATenNativeBatchNorm() { +TEST(ATenNativeBatchNormTest, Basic) { // aten::native_batch_norm(Tensor input, Tensor weight, Tensor bias, Tensor // running_mean, Tensor running_var, bool training, float momentum, float eps) // -> (Tensor, Tensor, Tensor) @@ -365,7 +366,7 @@ void testATenNativeBatchNorm() { assertAllClose(tensor_grads_out, expected_tensor_grads_out); } -void testCustomFusion() { +TEST(CustomFusionTest, Basic) { auto graph_string = R"IR( graph(%0 : Float(2, 3, 4), %1 : Float(2, 3, 4)): @@ -399,7 +400,7 @@ void 
testCustomFusion() { AT_ASSERT(hits == 2); } -void testCustomFusionNestedBlocks() { +TEST(CustomFusionTest, NestedBlocks) { auto graph_string = R"IR( graph(%0 : Float(2, 3, 4), %1 : Float(2, 3, 4), @@ -461,7 +462,8 @@ static const auto cf_examples = R"JIT( i += 1 return a )JIT"; -void testControlFlow() { + +TEST(ControlFlowTest, Basic) { auto cu = compile(cf_examples); auto run = [&](const std::string& name, std::vector stack) { @@ -484,13 +486,13 @@ void testControlFlow() { ASSERT_EQ(256, run_binary("while_test", 2, 0)); } -void testProto() { +TEST(ProtoTest, Basic) { ::ONNX_NAMESPACE::ModelProto proto; proto.set_producer_name("foo"); } // test a few features that are not directly used in schemas yet -void testSchemaParser() { +TEST(SchemaParserTest, NestedArrays) { // nested arrays auto s = parseSchema("at::what(int[][4] foo) -> ()"); ASSERT_TRUE(s.arguments().at(0).N() == 4); @@ -509,145 +511,151 @@ void testSchemaParser() { ->getElementType() ->expect() ->getElementType())); +} +TEST(SchemaParserTest, NamedReturns) { // named returns parseSchema("at::what(Tensor! i_will_be_written_to) -> ()"); auto s3 = parseSchema("at::what() -> (Tensor the_return, Tensor the_return2)"); ASSERT_TRUE(s3.returns().at(0).name() == "the_return"); ASSERT_TRUE(s3.returns().at(1).name() == "the_return2"); +} +TEST(SchemaParserTest, Futures) { // futures auto s4 = parseSchema("at::what(Future(int) foo) -> ()"); ASSERT_TRUE(IntType::get()->isSubtypeOf( s4.arguments().at(0).type()->expect()->getElementType())); +} +TEST(SchemaParserTest, AnnotatedAliasSets) { // test tensor with annotated alias sets parseSchema("at::what(Tensor(a) foo) -> (Tensor(a))"); +} - { - const auto s = parseSchema( - "at::what(Tensor(b|c)[](a!) list, Tensor(c) element)" - " -> (Tensor(b|c)[](a!))"); - - // The list itself is annotated with `a` - const auto& aliasInfo = *s.arguments().at(0).alias_info(); - ASSERT_TRUE( - aliasInfo.beforeSets() == - std::unordered_set{Symbol::fromQualString("alias::a")}); - ASSERT_TRUE(aliasInfo.isWrite()); - - // Check the contained types - ASSERT_TRUE(!aliasInfo.containedTypes().empty()); - const auto& containedAliasInfo = aliasInfo.containedTypes()[0]; - const auto expected = std::unordered_set{ - Symbol::fromQualString("alias::b"), - Symbol::fromQualString("alias::c"), - }; - ASSERT_TRUE(containedAliasInfo.beforeSets() == expected); - ASSERT_TRUE(containedAliasInfo.afterSets() == expected); - ASSERT_FALSE(containedAliasInfo.isWrite()); - } - { - const auto s = parseSchema( - "at::what(Tensor(b -> b|c)[](a!) 
list, Tensor(c) element)" - " -> (Tensor(b|c)[](a!))"); - - // The list itself is annotated with `a` - const auto& aliasInfo = *s.arguments().at(0).alias_info(); - ASSERT_EQ( - aliasInfo.beforeSets(), - std::unordered_set{Symbol::fromQualString("alias::a")}); - ASSERT_EQ( - aliasInfo.afterSets(), - std::unordered_set{Symbol::fromQualString("alias::a")}); - ASSERT_TRUE(aliasInfo.isWrite()); - ASSERT_EQ(aliasInfo.containedTypes().size(), 1); - - // Check the contained types - ASSERT_TRUE(!aliasInfo.containedTypes().empty()); - const auto& containedAliasInfo = aliasInfo.containedTypes()[0]; - const auto expectedBefore = std::unordered_set{ - Symbol::fromQualString("alias::b"), - }; - const auto expectedAfter = std::unordered_set{ - Symbol::fromQualString("alias::b"), Symbol::fromQualString("alias::c")}; - ASSERT_TRUE(containedAliasInfo.beforeSets() == expectedBefore); - ASSERT_TRUE(containedAliasInfo.afterSets() == expectedAfter); - ASSERT_FALSE(containedAliasInfo.isWrite()); - } +TEST(SchemaParserTest, BeforeAfterSets) { + const auto s = parseSchema( + "at::what(Tensor(b|c)[](a!) list, Tensor(c) element)" + " -> (Tensor(b|c)[](a!))"); + + // The list itself is annotated with `a` + const auto& aliasInfo = *s.arguments().at(0).alias_info(); + ASSERT_TRUE( + aliasInfo.beforeSets() == + std::unordered_set{Symbol::fromQualString("alias::a")}); + ASSERT_TRUE(aliasInfo.isWrite()); + + // Check the contained types + ASSERT_TRUE(!aliasInfo.containedTypes().empty()); + const auto& containedAliasInfo = aliasInfo.containedTypes()[0]; + const auto expected = std::unordered_set{ + Symbol::fromQualString("alias::b"), + Symbol::fromQualString("alias::c"), + }; + ASSERT_TRUE(containedAliasInfo.beforeSets() == expected); + ASSERT_TRUE(containedAliasInfo.afterSets() == expected); + ASSERT_FALSE(containedAliasInfo.isWrite()); } -void testTopologicalIndex() { - { - Graph graph; - auto node1 = graph.create(prim::AutogradZero); - auto node2 = graph.create(prim::AutogradZero); - auto node3 = graph.create(prim::AutogradZero); - auto node4 = graph.create(prim::AutogradZero); - - graph.appendNode(node4); - graph.prependNode(node1); - node2->insertAfter(node1); - node3->insertBefore(node4); - - // nodes should be in numerical order - ASSERT_TRUE(node1->isBefore(node2)); - ASSERT_TRUE(node1->isBefore(node3)); - ASSERT_TRUE(node1->isBefore(node4)); - ASSERT_TRUE(node2->isAfter(node1)); - ASSERT_TRUE(node2->isBefore(node3)); - ASSERT_TRUE(node2->isBefore(node4)); - ASSERT_FALSE(node3->isBefore(node1)); - ASSERT_FALSE(node3->isBefore(node2)); - ASSERT_FALSE(node3->isAfter(node4)); - - // Built up a block structure - // node3 - // /\ ... - // A B block1 - // \ ... - // C block2 - auto block1 = node3->addBlock(); - auto A = graph.create(prim::AutogradZero); - block1->appendNode(A); - auto B = graph.create(prim::AutogradZero); - block1->appendNode(B); - auto block2 = B->addBlock(); - auto C = graph.create(prim::AutogradZero); - block2->appendNode(C); - - // Check isAfter on different block levels - ASSERT_TRUE(node1->isBefore(A)); - ASSERT_TRUE(A->isBefore(B)); - ASSERT_TRUE(A->isBefore(C)); - - // make sure things don't blow up on deletions - node2->destroy(); - auto node2p = graph.create(prim::AutogradZero); - node2p->insertAfter(node1); - ASSERT_TRUE(node1->isBefore(node2p)); - ASSERT_TRUE(node2p->isBefore(node3)); +TEST(SchemaParserTest, BeforeAfterSets2) { + const auto s = parseSchema( + "at::what(Tensor(b -> b|c)[](a!) 
list, Tensor(c) element)" + " -> (Tensor(b|c)[](a!))"); + + // The list itself is annotated with `a` + const auto& aliasInfo = *s.arguments().at(0).alias_info(); + ASSERT_EQ( + aliasInfo.beforeSets(), + std::unordered_set{Symbol::fromQualString("alias::a")}); + ASSERT_EQ( + aliasInfo.afterSets(), + std::unordered_set{Symbol::fromQualString("alias::a")}); + ASSERT_TRUE(aliasInfo.isWrite()); + ASSERT_EQ(aliasInfo.containedTypes().size(), 1); + + // Check the contained types + ASSERT_TRUE(!aliasInfo.containedTypes().empty()); + const auto& containedAliasInfo = aliasInfo.containedTypes()[0]; + const auto expectedBefore = std::unordered_set{ + Symbol::fromQualString("alias::b"), + }; + const auto expectedAfter = std::unordered_set{ + Symbol::fromQualString("alias::b"), Symbol::fromQualString("alias::c")}; + ASSERT_TRUE(containedAliasInfo.beforeSets() == expectedBefore); + ASSERT_TRUE(containedAliasInfo.afterSets() == expectedAfter); + ASSERT_FALSE(containedAliasInfo.isWrite()); +} + +TEST(TopologicalIndexTest, Basic) { + Graph graph; + auto node1 = graph.create(prim::AutogradZero); + auto node2 = graph.create(prim::AutogradZero); + auto node3 = graph.create(prim::AutogradZero); + auto node4 = graph.create(prim::AutogradZero); + + graph.appendNode(node4); + graph.prependNode(node1); + node2->insertAfter(node1); + node3->insertBefore(node4); + + // nodes should be in numerical order + ASSERT_TRUE(node1->isBefore(node2)); + ASSERT_TRUE(node1->isBefore(node3)); + ASSERT_TRUE(node1->isBefore(node4)); + ASSERT_TRUE(node2->isAfter(node1)); + ASSERT_TRUE(node2->isBefore(node3)); + ASSERT_TRUE(node2->isBefore(node4)); + ASSERT_FALSE(node3->isBefore(node1)); + ASSERT_FALSE(node3->isBefore(node2)); + ASSERT_FALSE(node3->isAfter(node4)); + + // Built up a block structure + // node3 + // /\ ... + // A B block1 + // \ ... 
+ // C block2 + auto block1 = node3->addBlock(); + auto A = graph.create(prim::AutogradZero); + block1->appendNode(A); + auto B = graph.create(prim::AutogradZero); + block1->appendNode(B); + auto block2 = B->addBlock(); + auto C = graph.create(prim::AutogradZero); + block2->appendNode(C); + + // Check isAfter on different block levels + ASSERT_TRUE(node1->isBefore(A)); + ASSERT_TRUE(A->isBefore(B)); + ASSERT_TRUE(A->isBefore(C)); + + // make sure things don't blow up on deletions + node2->destroy(); + auto node2p = graph.create(prim::AutogradZero); + node2p->insertAfter(node1); + ASSERT_TRUE(node1->isBefore(node2p)); + ASSERT_TRUE(node2p->isBefore(node3)); +} + +TEST(TopologicalIndexTest, Reindex) { + // Induce reindexing to test that path + Graph graph; + std::map nodes; + + auto anchor = graph.create(prim::AutogradZero); + graph.appendNode(anchor); + // Inserting to the same place a lot will trigger reindexing + for (auto i = 0; i < 100; ++i) { + auto n = graph.create(prim::AutogradZero); + n->insertAfter(anchor); + nodes[i] = n; } - { - // Induce reindexing to test that path - Graph graph; - std::map nodes; - - auto anchor = graph.create(prim::AutogradZero); - graph.appendNode(anchor); - // Inserting to the same place a lot will trigger reindexing - for (auto i = 0; i < 100; ++i) { - auto n = graph.create(prim::AutogradZero); - n->insertAfter(anchor); - nodes[i] = n; - } - // Nodes should be in reverse order - for (auto i = 0; i < 100; ++i) { - for (auto j = i + 1; j < 100; ++j) { - ASSERT_TRUE(nodes[i]->isAfter(nodes[j])); - } + // Nodes should be in reverse order + for (auto i = 0; i < 100; ++i) { + for (auto j = i + 1; j < 100; ++j) { + ASSERT_TRUE(nodes[i]->isAfter(nodes[j])); } } } @@ -770,7 +778,7 @@ void checkScopeCallbacks() { TORCH_CHECK(found_user_scope); } -void testRecordFunction() { +TEST(RecordFunctionTest, Basic) { // disabling the inlining of method calls GraphOptimizerEnabledGuard opt_guard(false); @@ -817,7 +825,6 @@ void testRecordFunction() { traced_inputs.clear(); } - TORCH_CHECK(ts_names.size() == 2); TORCH_CHECK(ts_names.find("forward") != ts_names.end()); TORCH_CHECK(ts_names.find("foo") != ts_names.end()); @@ -1136,7 +1143,7 @@ void checkDebugInfo(c10::DebugInfoKind kind, int model_id) { TORCH_CHECK(test_debug_info->getModelId() == model_id); } -void testThreadLocalDebugInfo() { +TEST(ThreadLocalDebugInfoTest, Basic) { TORCH_CHECK( c10::ThreadLocalDebugInfo::get(c10::DebugInfoKind::TEST_INFO) == nullptr); auto debug_info = std::make_shared(); @@ -1209,7 +1216,7 @@ void testThreadLocalDebugInfo() { } } -void testFallbackGraphs() { +TEST(FallbackGraphsTest, Basic) { static const auto nestGraphIntoFallbackGraph = [](const std::shared_ptr& graph) { ProfilingRecord::removeProfileCounter(graph->block()); @@ -1285,35 +1292,36 @@ void testFallbackGraphs() { } } -void testAutogradProfiler() { - constexpr int batch_size = 4; - constexpr int input_size = 256; - constexpr int seq_len = 32; - - int hidden_size = 2 * input_size; - auto input = torch::randn({seq_len, batch_size, input_size}, at::kCPU); - auto hx = torch::randn({batch_size, hidden_size}, at::kCPU); - auto cx = torch::randn({batch_size, hidden_size}, at::kCPU); - auto w_ih = t_def(torch::randn({4 * hidden_size, input_size}, at::kCPU)); - auto w_hh = t_def(torch::randn({4 * hidden_size, hidden_size}, at::kCPU)); - - std::stringstream ss; - { - RecordProfile guard(ss); - for (size_t i = 0; i < 100; ++i) { - std::tie(hx, cx) = lstm(input[0], hx, cx, w_ih, w_hh); - } - } - - std::string result = ss.str(); - size_t 
count = 0; - for (size_t pos = 0; (pos = result.find("tanh", pos)) != std::string::npos; - count++, pos++) { - } - TORCH_CHECK(count == 200); -} - -void testNoneSchemaMatch() { +// TODO this test wasn't running and is broken. +// TEST(AutogradProfilerTest, Basic) { +// constexpr int batch_size = 4; +// constexpr int input_size = 256; +// constexpr int seq_len = 32; + +// int hidden_size = 2 * input_size; +// auto input = torch::randn({seq_len, batch_size, input_size}, at::kCPU); +// auto hx = torch::randn({batch_size, hidden_size}, at::kCPU); +// auto cx = torch::randn({batch_size, hidden_size}, at::kCPU); +// auto w_ih = t_def(torch::randn({4 * hidden_size, input_size}, at::kCPU)); +// auto w_hh = t_def(torch::randn({4 * hidden_size, hidden_size}, at::kCPU)); + +// std::stringstream ss; +// { +// RecordProfile guard(ss); +// for (size_t i = 0; i < 100; ++i) { +// std::tie(hx, cx) = lstm(input[0], hx, cx, w_ih, w_hh); +// } +// } + +// std::string result = ss.str(); +// size_t count = 0; +// for (size_t pos = 0; (pos = result.find("tanh", pos)) != std::string::npos; +// count++, pos++) { +// } +// ASSERT_EQ((count, 200); +// } + +TEST(NoneSchemaMatchTest, Basic) { RegisterOperators reg({ Operator( "prim::test_none() -> int?", @@ -1348,40 +1356,6 @@ void testNoneSchemaMatch() { AT_ASSERT(std::distance(nodes.begin(), nodes.end()) == 1); } -void testModuleDefine() { - Module m("m"); - m.register_parameter("foo", torch::ones({}), false); - m.define(R"( - def add_it(self, x, b : int = 4): - return self.foo + x + b - )"); - auto result = m.run_method("add_it", torch::ones({})); - AT_ASSERT(result.toTensor().item() == 6); -} - -void testModuleConversion() { - Module m("test"); - { - // test cuda to cpu for params and buffers - m.register_parameter("foo", torch::ones({}, at::kCUDA), false); - m.register_buffer("bar", torch::ones({}, at::kCUDA)); - - m.to(at::kCUDA); - m.to(at::kCPU); - AT_ASSERT(m.attr("foo").toTensor().device().is_cpu()); - AT_ASSERT(m.attr("bar").toTensor().device().is_cpu()); - } - { - // test cpu to cuda for params and buffers - m.register_parameter("foo", torch::ones({}), false); - m.register_buffer("bar", torch::ones({})); - - m.to(at::kCUDA); - AT_ASSERT(m.attr("foo").toTensor().device().is_cuda()); - AT_ASSERT(m.attr("bar").toTensor().device().is_cuda()); - } -} - static int testPassValue = 0; void fakePass(std::shared_ptr& g) { testPassValue++; @@ -1390,7 +1364,7 @@ void fakePass(std::shared_ptr& g) { RegisterPass p(fakePass); -void testPassManagement() { +TEST(PassManagementTest, Basic) { std::shared_ptr graph = std::make_shared(); parseIR( R"IR( @@ -1447,14 +1421,17 @@ size_t countNodes( return count; } -void testLoopPeeler() { - // peel all loops - auto true_pred = [](Node* n) { return true; }; - auto is_loop = [](Node* n) { return n->kind() == prim::Loop; }; +bool true_pred(Node* n) { + return true; +}; + +bool is_loop(Node* n) { + return n->kind() == prim::Loop; +}; +TEST(LoopPeelerTest, NoInductionVariableUse) { // do not use an induction variable explicitly - { - static const auto str_func_def = R"JIT( + static const auto str_func_def = R"JIT( def test_peel_n_times(): sum = 0 for i in range(10): @@ -1462,41 +1439,41 @@ void testLoopPeeler() { return sum )JIT"; - auto cu = compile(str_func_def); - auto& f = cu->get_function("test_peel_n_times"); - auto stack = createStack({}); - // peeling loop once - { - LoopsPeeler peeler(true_pred, 1); - auto copy = f.graph()->copy(); - peeler.run(copy); - int num_loops = - std::count_if(copy->nodes().begin(), 
copy->nodes().end(), is_loop); - ASSERT_EQ(num_loops, 2); - Code code(copy, ""); - InterpreterState interpreter{code}; - interpreter.run(stack); - ASSERT_EQ(stack.back().toInt(), 20); - } + auto cu = compile(str_func_def); + auto& f = cu->get_function("test_peel_n_times"); + auto stack = createStack({}); + // peeling loop once + { + LoopsPeeler peeler(true_pred, 1); + auto copy = f.graph()->copy(); + peeler.run(copy); + int num_loops = + std::count_if(copy->nodes().begin(), copy->nodes().end(), is_loop); + ASSERT_EQ(num_loops, 2); + Code code(copy, ""); + InterpreterState interpreter{code}; + interpreter.run(stack); + ASSERT_EQ(stack.back().toInt(), 20); + } - // test peeling more than one iteration - { - LoopsPeeler peeler(true_pred, 3); - auto copy = f.graph()->copy(); - peeler.run(copy); - int num_loops = - std::count_if(copy->nodes().begin(), copy->nodes().end(), is_loop); - ASSERT_EQ(num_loops, 2); - Code code(copy, ""); - InterpreterState interpreter{code}; - interpreter.run(stack); - ASSERT_EQ(stack.back().toInt(), 20); - } + // test peeling more than one iteration + { + LoopsPeeler peeler(true_pred, 3); + auto copy = f.graph()->copy(); + peeler.run(copy); + int num_loops = + std::count_if(copy->nodes().begin(), copy->nodes().end(), is_loop); + ASSERT_EQ(num_loops, 2); + Code code(copy, ""); + InterpreterState interpreter{code}; + interpreter.run(stack); + ASSERT_EQ(stack.back().toInt(), 20); } +} +TEST(LoopPeelerTest, YesInductionVariableUse) { // uses the induction variable - { - static const auto str_func_def = R"JIT( + static const auto str_func_def = R"JIT( def test_peel_n_times(): sum = 0 for i in range(10): @@ -1504,41 +1481,41 @@ void testLoopPeeler() { return sum )JIT"; - auto cu = compile(str_func_def); - auto& f = cu->get_function("test_peel_n_times"); - auto stack = createStack({}); - // peeling loop once - { - LoopsPeeler peeler(true_pred, 1); - auto copy = f.graph()->copy(); - peeler.run(copy); - int num_loops = - std::count_if(copy->nodes().begin(), copy->nodes().end(), is_loop); - ASSERT_EQ(num_loops, 2); - Code code(copy, ""); - InterpreterState interpreter{code}; - interpreter.run(stack); - ASSERT_EQ(stack.back().toInt(), 45); - } + auto cu = compile(str_func_def); + auto& f = cu->get_function("test_peel_n_times"); + auto stack = createStack({}); + // peeling loop once + { + LoopsPeeler peeler(true_pred, 1); + auto copy = f.graph()->copy(); + peeler.run(copy); + int num_loops = + std::count_if(copy->nodes().begin(), copy->nodes().end(), is_loop); + ASSERT_EQ(num_loops, 2); + Code code(copy, ""); + InterpreterState interpreter{code}; + interpreter.run(stack); + ASSERT_EQ(stack.back().toInt(), 45); + } - // test peeling more than one iteration - { - LoopsPeeler peeler(true_pred, 3); - auto copy = f.graph()->copy(); - peeler.run(copy); - int num_loops = - std::count_if(copy->nodes().begin(), copy->nodes().end(), is_loop); - ASSERT_EQ(num_loops, 2); - Code code(copy, ""); - InterpreterState interpreter{code}; - interpreter.run(stack); - ASSERT_EQ(stack.back().toInt(), 45); - } + // test peeling more than one iteration + { + LoopsPeeler peeler(true_pred, 3); + auto copy = f.graph()->copy(); + peeler.run(copy); + int num_loops = + std::count_if(copy->nodes().begin(), copy->nodes().end(), is_loop); + ASSERT_EQ(num_loops, 2); + Code code(copy, ""); + InterpreterState interpreter{code}; + interpreter.run(stack); + ASSERT_EQ(stack.back().toInt(), 45); } +} +TEST(LoopPeelerTest, LoopWithTerminationCondition) { // tests with explicit termination conditions - { - static const 
auto str_func_def = R"JIT( + static const auto str_func_def = R"JIT( def test_with_cond_times(): sum = 0 i = 0 @@ -1548,44 +1525,44 @@ void testLoopPeeler() { return sum )JIT"; - // the peel changes the termination condition to false - // so the original loop doesn't run - auto cu = compile(str_func_def); - auto& f = cu->get_function("test_with_cond_times"); - auto stack = createStack({}); - // peeling 5 iterations should update the termination - // condition to false - { - LoopsPeeler peeler(true_pred, 5); - auto copy = f.graph()->copy(); - peeler.run(copy); - int num_loops = - std::count_if(copy->nodes().begin(), copy->nodes().end(), is_loop); - ASSERT_EQ(num_loops, 2); - Code code(copy, ""); - InterpreterState interpreter{code}; - interpreter.run(stack); - ASSERT_EQ(stack.back().toInt(), 3); - } - - // the termination condition remains true - { - LoopsPeeler peeler(true_pred, 1); - auto copy = f.graph()->copy(); - peeler.run(copy); - int num_loops = - std::count_if(copy->nodes().begin(), copy->nodes().end(), is_loop); - ASSERT_EQ(num_loops, 2); - Code code(copy, ""); - InterpreterState interpreter{code}; - interpreter.run(stack); - ASSERT_EQ(stack.back().toInt(), 3); - } + // the peel changes the termination condition to false + // so the original loop doesn't run + auto cu = compile(str_func_def); + auto& f = cu->get_function("test_with_cond_times"); + auto stack = createStack({}); + // peeling 5 iterations should update the termination + // condition to false + { + LoopsPeeler peeler(true_pred, 5); + auto copy = f.graph()->copy(); + peeler.run(copy); + int num_loops = + std::count_if(copy->nodes().begin(), copy->nodes().end(), is_loop); + ASSERT_EQ(num_loops, 2); + Code code(copy, ""); + InterpreterState interpreter{code}; + interpreter.run(stack); + ASSERT_EQ(stack.back().toInt(), 3); } - // tests simple nested loops + // the termination condition remains true { - static const auto str_func_def = R"JIT( + LoopsPeeler peeler(true_pred, 1); + auto copy = f.graph()->copy(); + peeler.run(copy); + int num_loops = + std::count_if(copy->nodes().begin(), copy->nodes().end(), is_loop); + ASSERT_EQ(num_loops, 2); + Code code(copy, ""); + InterpreterState interpreter{code}; + interpreter.run(stack); + ASSERT_EQ(stack.back().toInt(), 3); + } +} + +// tests simple nested loops +TEST(LoopPeelerTest, SimpleNestedLoops) { + static const auto str_func_def = R"JIT( def test_nested_loops(): sum = 0 i = 0 @@ -1595,35 +1572,35 @@ void testLoopPeeler() { return sum )JIT"; - auto cu = compile(str_func_def); - auto& f = cu->get_function("test_nested_loops"); - auto stack = createStack({}); + auto cu = compile(str_func_def); + auto& f = cu->get_function("test_nested_loops"); + auto stack = createStack({}); - { - LoopsPeeler peeler(true_pred, 1); - auto copy = f.graph()->copy(); - peeler.run(copy); - ASSERT_EQ(countNodes(copy, is_loop), 5); - Code code(copy, ""); - InterpreterState interpreter{code}; - interpreter.run(stack); - ASSERT_EQ(stack.back().toInt(), 900); - } - - { - LoopsPeeler peeler(true_pred, 5); - auto copy = f.graph()->copy(); - peeler.run(copy); - ASSERT_EQ(countNodes(copy, is_loop), 5); - Code code(copy, ""); - InterpreterState interpreter{code}; - interpreter.run(stack); - ASSERT_EQ(stack.back().toInt(), 900); - } + { + LoopsPeeler peeler(true_pred, 1); + auto copy = f.graph()->copy(); + peeler.run(copy); + ASSERT_EQ(countNodes(copy, is_loop), 5); + Code code(copy, ""); + InterpreterState interpreter{code}; + interpreter.run(stack); + ASSERT_EQ(stack.back().toInt(), 900); } { - static const 
auto str_func_def = R"JIT( + LoopsPeeler peeler(true_pred, 5); + auto copy = f.graph()->copy(); + peeler.run(copy); + ASSERT_EQ(countNodes(copy, is_loop), 5); + Code code(copy, ""); + InterpreterState interpreter{code}; + interpreter.run(stack); + ASSERT_EQ(stack.back().toInt(), 900); + } +} + +TEST(LoopPeelerTest, SimpleNestedLoops2) { + static const auto str_func_def = R"JIT( def test_nested_loops(): sum = 0 i = 0 @@ -1635,34 +1612,33 @@ void testLoopPeeler() { return sum )JIT"; - auto cu = compile(str_func_def); - auto& f = cu->get_function("test_nested_loops"); - auto stack = createStack({}); - { - LoopsPeeler peeler(true_pred, 1); - auto copy = f.graph()->copy(); - peeler.run(copy); - ASSERT_EQ(countNodes(copy, is_loop), 5); - Code code(copy, ""); - InterpreterState interpreter{code}; - interpreter.run(stack); - ASSERT_EQ(stack.back().toInt(), 3); - } + auto cu = compile(str_func_def); + auto& f = cu->get_function("test_nested_loops"); + auto stack = createStack({}); + { + LoopsPeeler peeler(true_pred, 1); + auto copy = f.graph()->copy(); + peeler.run(copy); + ASSERT_EQ(countNodes(copy, is_loop), 5); + Code code(copy, ""); + InterpreterState interpreter{code}; + interpreter.run(stack); + ASSERT_EQ(stack.back().toInt(), 3); + } - { - LoopsPeeler peeler(true_pred, 5); - auto copy = f.graph()->copy(); - peeler.run(copy); - ASSERT_EQ(countNodes(copy, is_loop), 5); - Code code(copy, ""); - InterpreterState interpreter{code}; - interpreter.run(stack); - ASSERT_EQ(stack.back().toInt(), 3); - } + { + LoopsPeeler peeler(true_pred, 5); + auto copy = f.graph()->copy(); + peeler.run(copy); + ASSERT_EQ(countNodes(copy, is_loop), 5); + Code code(copy, ""); + InterpreterState interpreter{code}; + interpreter.run(stack); + ASSERT_EQ(stack.back().toInt(), 3); } } -void testInsertAndEliminateRedundantGuards() { +TEST(InsertAndEliminateRedundantGuardsTest, Basic) { static const auto basic_example = R"JIT( def basic(x, y): a = x + y @@ -1705,7 +1681,7 @@ void testInsertAndEliminateRedundantGuards() { ASSERT_EQ(num_guards, 2); } -void testInsertBailOuts() { +TEST(InsertBailOutsTest, Basic) { static const auto basic_example = R"JIT( def basic_loop(x, y): @@ -1754,7 +1730,7 @@ void testInsertBailOuts() { } } -void testProfiler() { +TEST(ProfilerTest, Basic) { constexpr int batch_size = 4; constexpr int input_size = 256; @@ -1804,7 +1780,7 @@ void testProfiler() { checkShape(tanh_n->inputs().at(0)->node()->ty(attr::profiled_type), eltwise); } -void testCallStack() { +TEST(CallStackTest, Basic) { const auto text = R"( def ham(x): return x/7 @@ -1880,7 +1856,7 @@ def foo(x): } } -void testCallStackCaching() { +TEST(CallStackTest, Caching) { const auto text = R"( def a(x): @@ -1923,7 +1899,7 @@ def c(x): ASSERT_TRUE(callstack_objects.at("a1") == callstack_objects.at("a2")); } -void testAutogradSymbols() { +TEST(AutogradSymbolsTest, Basic) { Symbol sym = Symbol::fromQualString("aten::test_symbol"); Graph graph; auto node = graph.create(sym); @@ -1942,7 +1918,7 @@ void testAutogradSymbols() { TORCH_CHECK(!canRunWithAutograd(node)); } -void testDefaultArgTypeHinting() { +TEST(DefaultArgTypeHintingTest, Basic) { const auto text_non_hinted = R"( def a(x, y=1): @@ -1968,184 +1944,182 @@ def a(x, y:int=1): auto cu = compile(text_hinted); } -void testFutures() { - // Basic set case. 
-  {
-    auto f1 = c10::make_intrusive<ivalue::Future>(IntType::get());
-    ASSERT_FALSE(f1->completed());
-    ASSERT_FALSE(f1->hasValue());
-    int32_t sat1 = 0;
-    int32_t sat2 = 0;
-    f1->addCallback([&]() { ++sat1; });
-    f1->markCompleted(43);
-    ASSERT_TRUE(f1->completed());
-    ASSERT_TRUE(f1->hasValue());
-    ASSERT_FALSE(f1->hasError());
-    ASSERT_EQ(sat1, 1);
-    ASSERT_EQ(f1->constValue().toInt(), 43);
-    ASSERT_EQ(f1->value().toInt(), 43);
-    f1->addCallback([&]() { ++sat2; });
-    ASSERT_EQ(sat1, 1);
-    ASSERT_EQ(sat2, 1);
-  }
+// Basic set case.
+TEST(FuturesTest, Basic) {
+  auto f1 = c10::make_intrusive<ivalue::Future>(IntType::get());
+  ASSERT_FALSE(f1->completed());
+  ASSERT_FALSE(f1->hasValue());
+  int32_t sat1 = 0;
+  int32_t sat2 = 0;
+  f1->addCallback([&]() { ++sat1; });
+  f1->markCompleted(43);
+  ASSERT_TRUE(f1->completed());
+  ASSERT_TRUE(f1->hasValue());
+  ASSERT_FALSE(f1->hasError());
+  ASSERT_EQ(sat1, 1);
+  ASSERT_EQ(f1->constValue().toInt(), 43);
+  ASSERT_EQ(f1->value().toInt(), 43);
+  f1->addCallback([&]() { ++sat2; });
+  ASSERT_EQ(sat1, 1);
+  ASSERT_EQ(sat2, 1);
+}

-  // Basic error cases.
-  {
-    auto f1 = c10::make_intrusive<ivalue::Future>(IntType::get());
-    int sat1 = 0;
-    int sat2 = 0;
-    f1->addCallback([&]() { ++sat1; });
-    f1->setError(
-        std::make_exception_ptr(c10::ivalue::Future::FutureError("Failed")));
-    ASSERT_EQ(sat1, 1);
-    ASSERT_TRUE(f1->completed());
-    ASSERT_TRUE(f1->hasError());
-    ASSERT_FALSE(f1->hasValue());
-    try {
-      (void)f1->value();
-      ASSERT_TRUE(false); // Supposed to throw.
-    } catch (const std::exception& e) {
-      ASSERT_TRUE(strcmp(e.what(), "Failed") == 0);
-    }
-    f1->addCallback([&]() { ++sat2; });
-    ASSERT_EQ(sat1, 1);
-    ASSERT_EQ(sat2, 1);
-    f1->setErrorIfNeeded(
-        std::make_exception_ptr(c10::ivalue::Future::FutureError("Dup")));
-    ASSERT_TRUE(strcmp(f1->tryRetrieveErrorMessage().c_str(), "Failed") == 0);
-    ASSERT_EQ(sat1, 1);
-    ASSERT_EQ(sat2, 1);
+// Basic error cases.
+TEST(FuturesTest, Error) {
+  auto f1 = c10::make_intrusive<ivalue::Future>(IntType::get());
+  int sat1 = 0;
+  int sat2 = 0;
+  f1->addCallback([&]() { ++sat1; });
+  f1->setError(
+      std::make_exception_ptr(c10::ivalue::Future::FutureError("Failed")));
+  ASSERT_EQ(sat1, 1);
+  ASSERT_TRUE(f1->completed());
+  ASSERT_TRUE(f1->hasError());
+  ASSERT_FALSE(f1->hasValue());
+  try {
+    (void)f1->value();
+    ASSERT_TRUE(false); // Supposed to throw.
+  } catch (const std::exception& e) {
+    ASSERT_TRUE(strcmp(e.what(), "Failed") == 0);
  }
+  f1->addCallback([&]() { ++sat2; });
+  ASSERT_EQ(sat1, 1);
+  ASSERT_EQ(sat2, 1);
+  f1->setErrorIfNeeded(
+      std::make_exception_ptr(c10::ivalue::Future::FutureError("Dup")));
+  ASSERT_TRUE(strcmp(f1->tryRetrieveErrorMessage().c_str(), "Failed") == 0);
+  ASSERT_EQ(sat1, 1);
+  ASSERT_EQ(sat2, 1);
+}

-  // then
-  {
-    auto f1 = c10::make_intrusive<ivalue::Future>(IntType::get());
-    auto f2 = f1->then(
-        [f1]() -> IValue { return f1->constValue().toInt() + 1; },
-        IntType::get());
-    auto f3 = f2->then(
-        [f2]() -> IValue { return f2->constValue().toInt() * 3; },
-        IntType::get());
-    bool done = false;
-    f3->addCallback([f3, &done]() {
-      ASSERT_EQ(f3->constValue().toInt(), (42 + 1) * 3);
-      done = true;
-    });
-    ASSERT_FALSE(done);
-    f1->markCompleted(42);
-    ASSERT_TRUE(done);
-  }
+// then
+TEST(FuturesTest, Then) {
+  auto f1 = c10::make_intrusive<ivalue::Future>(IntType::get());
+  auto f2 = f1->then(
+      [f1]() -> IValue { return f1->constValue().toInt() + 1; },
+      IntType::get());
+  auto f3 = f2->then(
+      [f2]() -> IValue { return f2->constValue().toInt() * 3; },
+      IntType::get());
+  bool done = false;
+  f3->addCallback([f3, &done]() {
+    ASSERT_EQ(f3->constValue().toInt(), (42 + 1) * 3);
+    done = true;
+  });
+  ASSERT_FALSE(done);
+  f1->markCompleted(42);
+  ASSERT_TRUE(done);
+}

-  // collectAll()
-  {
-    auto s1 = c10::make_intrusive<ivalue::Future>(IntType::get());
-    auto s2 = c10::make_intrusive<ivalue::Future>(IntType::get());
-    auto s3 = c10::make_intrusive<ivalue::Future>(IntType::get());
-
-    // Empty case
-    c10::List<intrusive_ptr<ivalue::Future>> futures(
-        FutureType::create(IntType::get()));
-    auto c1 = collectAll(futures);
-    ASSERT_TRUE(c1->completed());
-    ASSERT_EQ(c1->value().toList().size(), 0);
-    ASSERT_TRUE(
-        *(c1->value().toList().elementType()) ==
-        *FutureType::create(IntType::get()));
-
-    // 1-element, initially not completed.
-    futures.push_back(s1);
-    auto c2 = collectAll(futures);
-    ASSERT_FALSE(c2->completed());
-    s1->markCompleted(5);
-    ASSERT_TRUE(c2->completed());
-    ASSERT_EQ(c2->value().toList().size(), 1);
-    ASSERT_TRUE(
-        *(c2->value().toList().elementType()) ==
-        *FutureType::create(IntType::get()));
-    ASSERT_EQ(c2->value().toList().get(0).toFuture()->value().toInt(), 5);
-
-    // 1-element, already completed
-    auto c3 = collectAll(futures);
-    ASSERT_TRUE(c3->completed());
-    ASSERT_EQ(c3->value().toList().size(), 1);
-    ASSERT_EQ(c3->value().toList().get(0).toFuture()->value().toInt(), 5);
-
-    // 3 elements.
-    futures.push_back(s2);
-    futures.push_back(s3);
-    auto c4 = collectAll(futures);
-    ASSERT_FALSE(c4->completed());
-    s3->markCompleted(7);
-    ASSERT_FALSE(c4->completed());
-    s2->markCompleted(6);
-    ASSERT_TRUE(c4->completed());
-    ASSERT_EQ(c4->value().toList().size(), 3);
-    ASSERT_EQ(c4->value().toList().get(0).toFuture()->value().toInt(), 5);
-    ASSERT_EQ(c4->value().toList().get(1).toFuture()->value().toInt(), 6);
-    ASSERT_EQ(c4->value().toList().get(2).toFuture()->value().toInt(), 7);
-    ASSERT_TRUE(
-        *(c4->value().toList().elementType()) ==
-        *FutureType::create(IntType::get()));
-
-    // Handle exception in the list.
- auto s4 = c10::make_intrusive(IntType::get()); - futures.push_back(s4); - auto c5 = collectAll(futures); - ASSERT_FALSE(c5->completed()); - s4->setError( - std::make_exception_ptr(c10::ivalue::Future::FutureError("Failed"))); - ASSERT_TRUE(c5->completed()); - ASSERT_EQ(c5->value().toList().size(), 4); - try { - (void)c5->value().toList().get(3).toFuture()->value(); - ASSERT_TRUE(false); // supposed to throw - } catch (const std::exception& e) { - ASSERT_EQ(std::string(e.what()), "Failed"); - } +// collectAll() +TEST(FuturesTest, CollectAll) { + auto s1 = c10::make_intrusive(IntType::get()); + auto s2 = c10::make_intrusive(IntType::get()); + auto s3 = c10::make_intrusive(IntType::get()); + + // Empty case + c10::List> futures( + FutureType::create(IntType::get())); + auto c1 = collectAll(futures); + ASSERT_TRUE(c1->completed()); + ASSERT_EQ(c1->value().toList().size(), 0); + ASSERT_TRUE( + *(c1->value().toList().elementType()) == + *FutureType::create(IntType::get())); + + // 1-element, initially not completed. + futures.push_back(s1); + auto c2 = collectAll(futures); + ASSERT_FALSE(c2->completed()); + s1->markCompleted(5); + ASSERT_TRUE(c2->completed()); + ASSERT_EQ(c2->value().toList().size(), 1); + ASSERT_TRUE( + *(c2->value().toList().elementType()) == + *FutureType::create(IntType::get())); + ASSERT_EQ(c2->value().toList().get(0).toFuture()->value().toInt(), 5); + + // 1-element, already completed + auto c3 = collectAll(futures); + ASSERT_TRUE(c3->completed()); + ASSERT_EQ(c3->value().toList().size(), 1); + ASSERT_EQ(c3->value().toList().get(0).toFuture()->value().toInt(), 5); + + // 3 elements. + futures.push_back(s2); + futures.push_back(s3); + auto c4 = collectAll(futures); + ASSERT_FALSE(c4->completed()); + s3->markCompleted(7); + ASSERT_FALSE(c4->completed()); + s2->markCompleted(6); + ASSERT_TRUE(c4->completed()); + ASSERT_EQ(c4->value().toList().size(), 3); + ASSERT_EQ(c4->value().toList().get(0).toFuture()->value().toInt(), 5); + ASSERT_EQ(c4->value().toList().get(1).toFuture()->value().toInt(), 6); + ASSERT_EQ(c4->value().toList().get(2).toFuture()->value().toInt(), 7); + ASSERT_TRUE( + *(c4->value().toList().elementType()) == + *FutureType::create(IntType::get())); + + // Handle exception in the list. + auto s4 = c10::make_intrusive(IntType::get()); + futures.push_back(s4); + auto c5 = collectAll(futures); + ASSERT_FALSE(c5->completed()); + s4->setError( + std::make_exception_ptr(c10::ivalue::Future::FutureError("Failed"))); + ASSERT_TRUE(c5->completed()); + ASSERT_EQ(c5->value().toList().size(), 4); + try { + (void)c5->value().toList().get(3).toFuture()->value(); + ASSERT_TRUE(false); // supposed to throw + } catch (const std::exception& e) { + ASSERT_EQ(std::string(e.what()), "Failed"); } +} - // collectAny() - { - auto s1 = c10::make_intrusive(IntType::get()); - - // Empty case - c10::List> futures( - FutureType::create(IntType::get())); - auto c1 = collectAny(futures); - ASSERT_TRUE(c1->completed()); - - // 1 element, not yet satisfied - futures.push_back(s1); - auto c2 = collectAny(futures); - ASSERT_FALSE(c2->completed()); - s1->markCompleted(5); - ASSERT_TRUE(c2->completed()); - ASSERT_TRUE(c2->value().isInt()); - ASSERT_EQ(c2->value().toInt(), 5); - - // 1 element already satisfied. 
- auto c3 = collectAny(futures); - ASSERT_TRUE(c3->completed()); - ASSERT_TRUE(c3->value().isInt()); - ASSERT_EQ(c3->value().toInt(), 5); - - // 2 elements - futures.clear(); - auto s2 = c10::make_intrusive(IntType::get()); - auto s3 = c10::make_intrusive(IntType::get()); - futures.push_back(s2); - futures.push_back(s3); - auto c4 = collectAny(futures); - ASSERT_FALSE(c4->completed()); - s3->markCompleted(7); - ASSERT_TRUE(c4->completed()); - ASSERT_EQ(c4->value().toInt(), 7); - s2->markCompleted(1); - ASSERT_EQ(c4->value().toInt(), 7); - } +// collectAny() +TEST(FuturesTest, CollectAny) { + auto s1 = c10::make_intrusive(IntType::get()); + + // Empty case + c10::List> futures( + FutureType::create(IntType::get())); + auto c1 = collectAny(futures); + ASSERT_TRUE(c1->completed()); + + // 1 element, not yet satisfied + futures.push_back(s1); + auto c2 = collectAny(futures); + ASSERT_FALSE(c2->completed()); + s1->markCompleted(5); + ASSERT_TRUE(c2->completed()); + ASSERT_TRUE(c2->value().isInt()); + ASSERT_EQ(c2->value().toInt(), 5); + + // 1 element already satisfied. + auto c3 = collectAny(futures); + ASSERT_TRUE(c3->completed()); + ASSERT_TRUE(c3->value().isInt()); + ASSERT_EQ(c3->value().toInt(), 5); + + // 2 elements + futures.clear(); + auto s2 = c10::make_intrusive(IntType::get()); + auto s3 = c10::make_intrusive(IntType::get()); + futures.push_back(s2); + futures.push_back(s3); + auto c4 = collectAny(futures); + ASSERT_FALSE(c4->completed()); + s3->markCompleted(7); + ASSERT_TRUE(c4->completed()); + ASSERT_EQ(c4->value().toInt(), 7); + s2->markCompleted(1); + ASSERT_EQ(c4->value().toInt(), 7); } -void testTLSFutureCallbacks() { +TEST(TLSFutureCallbacksTest, Basic) { // cb that verifies the profiler is enabled auto profilerEnabledCb = []() { ASSERT_TRUE(torch::autograd::profiler::profilerEnabled()); @@ -2184,5 +2158,75 @@ void testTLSFutureCallbacks() { } } +TEST(ProfilerDisableInCallbackTest, Basic) { + // cb that verifies the profiler is enabled + auto profilerEnabledCb = []() { + ASSERT_TRUE(torch::autograd::profiler::profilerEnabled()); + }; + torch::autograd::profiler::enableProfiler( + torch::autograd::profiler::ProfilerConfig( + torch::autograd::profiler::ProfilerState::CPU, false, false)); + auto s1 = c10::make_intrusive(IntType::get()); + auto verifyProfilerCb = wrapPropagateTLSState([&profilerEnabledCb] { + // Ensure the profiler is still enabled in this thread. + profilerEnabledCb(); + auto t1 = torch::ones({2, 2}); + auto t2 = torch::ones({2, 2}); + torch::add(t1, t2); + // Don't cleanup TLSState, and just consolidate. + auto opts = torch::autograd::profiler::ProfilerDisableOptions(false, true); + auto thread_event_lists = + torch::autograd::profiler::disableProfiler(std::move(opts)); + // Ensure that the events from this thread are still profiled and we obtain + // the expected in events in our consolidated list when calling + // disableProfiler(). + bool found_ones = false; + bool found_add = false; + for (const auto& li : thread_event_lists) { + for (const auto& evt : li) { + if (strcmp(evt.name(), "aten::add") == 0) { + found_add = true; + } else if (strcmp(evt.name(), "aten::ones") == 0) { + found_ones = true; + } + } + if (found_add && found_ones) { + break; + } + } + ASSERT_TRUE(found_ones); + ASSERT_TRUE(found_add); + }); + + s1->addCallback(verifyProfilerCb); + // Disable the profiler, but do not consolidate results in the main thread. 
+ auto opts = torch::autograd::profiler::ProfilerDisableOptions(true, false); + torch::autograd::profiler::disableProfiler(std::move(opts)); + std::thread t([s1 = std::move(s1)]() { s1->markCompleted(at::IValue(1)); }); + t.join(); + + // Similar to above test, but verifies correctness in the case where + // continuation runs on the main thread. + torch::autograd::profiler::enableProfiler( + torch::autograd::profiler::ProfilerConfig( + torch::autograd::profiler::ProfilerState::CPU, false, false)); + s1 = c10::make_intrusive(IntType::get()); + s1->addCallback(verifyProfilerCb); + // Runs callback inline + s1->markCompleted(at::IValue(1)); + opts = torch::autograd::profiler::ProfilerDisableOptions(true, false); + torch::autograd::profiler::disableProfiler(std::move(opts)); +} + +TEST(IValueKWargsTest, Basic) { + const auto text = R"( + def foo(a : int, b : int, c : int = 4): + return a + 2*b + 3*c + )"; + auto cu = compile(text); + auto result = cu->get_function("foo")({1}, {{"b", 3}}); + ASSERT_EQ(result.toInt(), 19); +} + } // namespace jit } // namespace torch diff --git a/test/cpp/jit/test_mobile_type_parser.cpp b/test/cpp/jit/test_mobile_type_parser.cpp index 989d16794bd2..7e24e5dc65bc 100644 --- a/test/cpp/jit/test_mobile_type_parser.cpp +++ b/test/cpp/jit/test_mobile_type_parser.cpp @@ -1,5 +1,6 @@ -#include "test/cpp/jit/test_base.h" -//#include +#include + +#include namespace c10 { // std::string serializeType(const Type &t); @@ -8,50 +9,74 @@ TypePtr parseType(const std::string& pythonStr); namespace torch { namespace jit { -void testMobileTypeParser() { +TEST(MobileTypeParserTest, Empty) { std::string empty_ps(""); ASSERT_ANY_THROW(c10::parseType(empty_ps)); +} +TEST(MobileTypeParserTest, RoundTripAnnotationStr) { std::string int_ps("int"); auto int_tp = c10::parseType(int_ps); std::string int_tps = int_tp->annotation_str(); ASSERT_EQ(int_ps, int_tps); +} +TEST(MobileTypeParserTest, NestedContainersAnnotationStr) { std::string tuple_ps( "Tuple[str, Optional[float], Dict[str, List[Tensor]], int]"); auto tuple_tp = c10::parseType(tuple_ps); std::string tuple_tps = tuple_tp->annotation_str(); ASSERT_EQ(tuple_ps, tuple_tps); +} +TEST(MobileTypeParserTest, NestedContainersAnnotationStrWithSpaces) { + std::string tuple_ps( + "Tuple[str, Optional[float], Dict[str, List[Tensor]], int]"); std::string tuple_space_ps( "Tuple[ str, Optional[float], Dict[str, List[Tensor ]] , int]"); auto tuple_space_tp = c10::parseType(tuple_space_ps); // tuple_space_tps should not have weird white spaces std::string tuple_space_tps = tuple_space_tp->annotation_str(); ASSERT_EQ(tuple_ps, tuple_space_tps); +} +TEST(MobileTypeParserTest, TypoRaises) { std::string typo_token("List[tensor]"); ASSERT_ANY_THROW(c10::parseType(typo_token)); +} +TEST(MobileTypeParserTest, MismatchBracketRaises) { std::string mismatch1("List[Tensor"); ASSERT_ANY_THROW(c10::parseType(mismatch1)); +} +TEST(MobileTypeParserTest, MismatchBracketRaises2) { std::string mismatch2("List[[Tensor]"); ASSERT_ANY_THROW(c10::parseType(mismatch2)); +} +TEST(MobileTypeParserTest, DictWithoutValueRaises) { std::string mismatch3("Dict[Tensor]"); ASSERT_ANY_THROW(c10::parseType(mismatch3)); +} +TEST(MobileTypeParserTest, ListArgCountMismatchRaises) { // arg count mismatch std::string mismatch4("List[int, str]"); ASSERT_ANY_THROW(c10::parseType(mismatch4)); +} +TEST(MobileTypeParserTest, DictArgCountMismatchRaises) { std::string trailing_commm("Dict[str,]"); ASSERT_ANY_THROW(c10::parseType(trailing_commm)); +} +TEST(MobileTypeParserTest, 
ValidTypeWithExtraStuffRaises) { std::string extra_stuff("int int"); ASSERT_ANY_THROW(c10::parseType(extra_stuff)); +} +TEST(MobileTypeParserTest, NonIdentifierRaises) { std::string non_id("(int)"); ASSERT_ANY_THROW(c10::parseType(non_id)); } diff --git a/test/cpp/jit/test_module_api.cpp b/test/cpp/jit/test_module_api.cpp index 386addd9fbec..910331166d51 100644 --- a/test/cpp/jit/test_module_api.cpp +++ b/test/cpp/jit/test_module_api.cpp @@ -1,4 +1,5 @@ -#include +#include + #include #include @@ -42,7 +43,7 @@ static void import_libs( si.loadType(QualifiedName(class_name)); } -void testModuleClone() { +TEST(ModuleAPITest, Clone) { auto cu = std::make_shared(); // creating child module auto child = ClassType::create("child", cu, true); @@ -71,7 +72,7 @@ void testModuleClone() { ASSERT_EQ(Module(p2.attr("c2").toObject()).attr(attr_name).toInt(), 3); } -void testModuleCloneWithModuleInterface() { +TEST(ModuleAPITest, CloneWithModuleInterface) { auto cu = std::make_shared(); // define a initial module with two submods share same interface @@ -115,7 +116,7 @@ void testModuleCloneWithModuleInterface() { ASSERT_NE(clonedMod.type(), parentMod.type()); } -void testModuleCopy() { +TEST(ModuleAPITest, Copy) { auto cu = std::make_shared(); auto cls = ClassType::create("foo.bar", cu, true); auto attr_name = "attr"; @@ -144,7 +145,7 @@ void testModuleCopy() { ASSERT_EQ(m3.attr(attr_name).toInt(), 3); } -void testModuleDeepcopy() { +TEST(ModuleAPITest, DeepCopy) { auto cu = std::make_shared(); auto cls = ClassType::create("foo.bar", cu, true); auto str_attr = "str_attr"; @@ -203,7 +204,7 @@ void testModuleDeepcopy() { ASSERT_TRUE(t1.equal(t3)); } -void testModuleDeepcopyString() { +TEST(ModuleAPITest, DeepCopyString) { auto cu = std::make_shared(); auto cls = ClassType::create("foo.bar", cu, true); auto attr1 = "attr1"; @@ -219,7 +220,7 @@ void testModuleDeepcopyString() { ASSERT_EQ(copied.attr(attr1).toString()->string(), original_str); } -void testModuleDeepcopyAliasing() { +TEST(ModuleAPITest, DeepCopyPreservesAliasing) { // check deepcopy preserves aliasing auto cu = std::make_shared(); auto cls = ClassType::create("foo.bar", cu, true); @@ -256,7 +257,7 @@ void testModuleDeepcopyAliasing() { ASSERT_TRUE(copied_attr3.isAliasOf(copied_attr4)); } -void testModuleConstant() { +TEST(ModuleAPITest, Constants) { auto cu = std::make_shared(); auto cls = ClassType::create("foo.bar", cu, true); auto attr_name = "attr"; @@ -272,7 +273,7 @@ void testModuleConstant() { ASSERT_EQ(m.attr(const_name).toInt(), 3); } -void testModuleParameter() { +TEST(ModuleAPITest, Parameters) { auto cu = std::make_shared(); auto cls = ClassType::create("foo.bar", cu, true); Module m(cu, cls); @@ -291,5 +292,39 @@ void testModuleParameter() { ASSERT_TRUE(m.hasattr("none_param2")); } +TEST(ModuleAPITest, Define) { + Module m("m"); + m.register_parameter("foo", torch::ones({}), false); + m.define(R"( + def add_it(self, x, b : int = 4): + return self.foo + x + b + )"); + auto result = m.run_method("add_it", torch::ones({})); + AT_ASSERT(result.toTensor().item() == 6); +} + +TEST(ModuleAPITest, To_CUDA) { + Module m("test"); + { + // test cuda to cpu for params and buffers + m.register_parameter("foo", torch::ones({}, at::kCUDA), false); + m.register_buffer("bar", torch::ones({}, at::kCUDA)); + + m.to(at::kCUDA); + m.to(at::kCPU); + AT_ASSERT(m.attr("foo").toTensor().device().is_cpu()); + AT_ASSERT(m.attr("bar").toTensor().device().is_cpu()); + } + { + // test cpu to cuda for params and buffers + m.register_parameter("foo", 
torch::ones({}), false); + m.register_buffer("bar", torch::ones({})); + + m.to(at::kCUDA); + AT_ASSERT(m.attr("foo").toTensor().device().is_cuda()); + AT_ASSERT(m.attr("bar").toTensor().device().is_cuda()); + } +} + } // namespace jit } // namespace torch diff --git a/test/cpp/jit/test_peephole_optimize.cpp b/test/cpp/jit/test_peephole_optimize.cpp index 5382d556613d..9985faa6e9bd 100644 --- a/test/cpp/jit/test_peephole_optimize.cpp +++ b/test/cpp/jit/test_peephole_optimize.cpp @@ -1,4 +1,5 @@ -#include +#include + #include #include @@ -8,47 +9,48 @@ namespace torch { namespace jit { -void testPeepholeOptimize() { - // test is / is not none optimization - { - auto graph = std::make_shared(); - parseIR( - R"IR( +TEST(PeepholeOptimizeTest, IsAndIsNot) +// test is / is not none optimization +{ + auto graph = std::make_shared(); + parseIR( + R"IR( graph(%0 : int): %1 : None = prim::Constant() %2 : bool = aten::__is__(%0, %1) %3 : bool = aten::__isnot__(%0, %1) return (%2, %3) )IR", - graph.get()); - PeepholeOptimize(graph); - testing::FileCheck() - .check_not("aten::__is__") - ->check_not("aten::__isnot__") - ->run(*graph); - } - { - auto graph = std::make_shared(); - parseIR( - R"IR( + graph.get()); + PeepholeOptimize(graph); + testing::FileCheck() + .check_not("aten::__is__") + ->check_not("aten::__isnot__") + ->run(*graph); +} + +TEST(PeepholeOptimizeTest, IsAndIsNot2) { + auto graph = std::make_shared(); + parseIR( + R"IR( graph(%0: int?): %1 : None = prim::Constant() %2 : bool = aten::__is__(%0, %1) %3 : bool = aten::__isnot__(%0, %1) return (%2, %3) )IR", - graph.get()); - PeepholeOptimize(graph); - testing::FileCheck() - .check("aten::__is__") - ->check("aten::__isnot__") - ->run(*graph); - } + graph.get()); + PeepholeOptimize(graph); + testing::FileCheck() + .check("aten::__is__") + ->check("aten::__isnot__") + ->run(*graph); +} - { - auto graph = std::make_shared(); - parseIR( - R"IR( +TEST(PeepholeOptimizeTest, IsAndIsNot3) { + auto graph = std::make_shared(); + parseIR( + R"IR( graph(%0: int?): %1 : Tensor = prim::AutogradZero() %2 : None = prim::Constant() @@ -56,48 +58,49 @@ graph(%0: int?): %5 : bool = aten::__isnot__(%1, %2) return (%4, %5) )IR", - graph.get()); - PeepholeOptimize(graph); - testing::FileCheck() - .check("aten::__is__") - ->check_not("aten::__isnot__") - ->run(*graph); - } + graph.get()); + PeepholeOptimize(graph); + testing::FileCheck() + .check("aten::__is__") + ->check_not("aten::__isnot__") + ->run(*graph); +} - // test unwrap optional - { - auto graph = std::make_shared(); - parseIR( - R"IR( +TEST(PeepholeOptimizeTest, UnwrapOptional) +// test unwrap optional +{ + auto graph = std::make_shared(); + parseIR( + R"IR( graph(): %1 : Float(*, *, *) = prim::Constant() %2 : bool = aten::_unwrap_optional(%1) %3 : bool = prim::unchecked_unwrap_optional(%1) return (%2, %3) )IR", - graph.get()); - PeepholeOptimize(graph); - testing::FileCheck().check_not("unwrap")->run(*graph); - } - { - auto graph = std::make_shared(); - parseIR( - R"IR( + graph.get()); + PeepholeOptimize(graph); + testing::FileCheck().check_not("unwrap")->run(*graph); +} + +TEST(PeepholeOptimizeTest, UnwrapOptional2) { + auto graph = std::make_shared(); + parseIR( + R"IR( graph(%1 : Float(*, *, *)?): %2 : bool = aten::_unwrap_optional(%1) %3 : bool = prim::unchecked_unwrap_optional(%1) return (%2, %3) )IR", - graph.get()); - PeepholeOptimize(graph); - testing::FileCheck().check_count("unwrap", 2)->run(*graph); - } + graph.get()); + PeepholeOptimize(graph); + testing::FileCheck().check_count("unwrap", 
2)->run(*graph); +} - // tests addmm fusion - { - auto graph = std::make_shared(); - parseIR( - R"IR( +TEST(PeepholeOptimizeTest, AddMMFusion) { + auto graph = std::make_shared(); + parseIR( + R"IR( graph( %0 : Float(2, 3, 4), %1 : Float(2, 3, 4), @@ -108,10 +111,9 @@ graph(%1 : Float(*, *, *)?): %6 : Tensor = aten::add(%5, %2, %3) return (%6) )IR", - graph.get()); - FuseAddMM(graph); - testing::FileCheck().check("addmm")->run(*graph); - } + graph.get()); + FuseAddMM(graph); + testing::FileCheck().check("addmm")->run(*graph); } } // namespace jit } // namespace torch diff --git a/test/cpp/jit/test_qualified_name.cpp b/test/cpp/jit/test_qualified_name.cpp index 0f387bb542ed..80028ada8565 100644 --- a/test/cpp/jit/test_qualified_name.cpp +++ b/test/cpp/jit/test_qualified_name.cpp @@ -1,68 +1,70 @@ +#include #include #include -#include "test/cpp/jit/test_base.h" using c10::QualifiedName; namespace torch { namespace jit { -void testQualifiedName() { - { - // Test prefix construction - auto foo = QualifiedName("foo"); - auto bar = QualifiedName(foo, "bar"); - auto baz = QualifiedName(bar, "baz"); - ASSERT_EQ(baz.qualifiedName(), "foo.bar.baz"); - ASSERT_EQ(baz.prefix(), "foo.bar"); - ASSERT_EQ(baz.name(), "baz"); - auto nullstate = QualifiedName(); - ASSERT_EQ(nullstate.qualifiedName(), ""); - ASSERT_EQ(nullstate.prefix(), ""); - ASSERT_EQ(nullstate.name(), ""); - } - { - // Test dotted construction - auto foo = QualifiedName("foo.bar.baz"); - ASSERT_EQ(foo.qualifiedName(), "foo.bar.baz"); - ASSERT_EQ(foo.prefix(), "foo.bar"); - ASSERT_EQ(foo.name(), "baz"); +TEST(QualifiedNameTest, PrefixConstruction) { + // Test prefix construction + auto foo = QualifiedName("foo"); + auto bar = QualifiedName(foo, "bar"); + auto baz = QualifiedName(bar, "baz"); + ASSERT_EQ(baz.qualifiedName(), "foo.bar.baz"); + ASSERT_EQ(baz.prefix(), "foo.bar"); + ASSERT_EQ(baz.name(), "baz"); + auto nullstate = QualifiedName(); + ASSERT_EQ(nullstate.qualifiedName(), ""); + ASSERT_EQ(nullstate.prefix(), ""); + ASSERT_EQ(nullstate.name(), ""); +} + +TEST(QualifiedNameTest, DottedConstruction) { + // Test dotted construction + auto foo = QualifiedName("foo.bar.baz"); + ASSERT_EQ(foo.qualifiedName(), "foo.bar.baz"); + ASSERT_EQ(foo.prefix(), "foo.bar"); + ASSERT_EQ(foo.name(), "baz"); + + auto bar = QualifiedName("bar"); + ASSERT_EQ(bar.qualifiedName(), "bar"); + ASSERT_EQ(bar.prefix(), ""); + ASSERT_EQ(bar.name(), "bar"); +} + +TEST(QualifiedNameTest, BadInputRaises) { + // throw some bad inputs at it + ASSERT_ANY_THROW(QualifiedName("foo..bar")); + ASSERT_ANY_THROW(QualifiedName(".foo.bar")); + ASSERT_ANY_THROW(QualifiedName("foo.bar.")); + ASSERT_ANY_THROW(QualifiedName("")); +} + +TEST(QualifiedNameTest, Equality) { + // test equality api + auto foo1 = QualifiedName("foo.bar.baz"); + auto foo2 = QualifiedName("foo.bar.baz"); + auto foo3 = QualifiedName("bar.bar.baz"); + ASSERT_EQ(foo1, foo2); + ASSERT_NE(foo1, foo3); + auto bar1 = QualifiedName("sup"); + auto bar2 = QualifiedName("sup"); + ASSERT_EQ(foo1, foo2); +} - auto bar = QualifiedName("bar"); - ASSERT_EQ(bar.qualifiedName(), "bar"); - ASSERT_EQ(bar.prefix(), ""); - ASSERT_EQ(bar.name(), "bar"); - } - { - // throw some bad inputs at it - ASSERT_ANY_THROW(QualifiedName("foo..bar")); - ASSERT_ANY_THROW(QualifiedName(".foo.bar")); - ASSERT_ANY_THROW(QualifiedName("foo.bar.")); - ASSERT_ANY_THROW(QualifiedName("")); - } - { - // test equality api - auto foo1 = QualifiedName("foo.bar.baz"); - auto foo2 = QualifiedName("foo.bar.baz"); - auto foo3 = 
QualifiedName("bar.bar.baz"); - ASSERT_EQ(foo1, foo2); - ASSERT_NE(foo1, foo3); - auto bar1 = QualifiedName("sup"); - auto bar2 = QualifiedName("sup"); - ASSERT_EQ(foo1, foo2); - } - { - // test prefix api - auto foo1 = QualifiedName("foo.bar.baz"); - auto foo2 = QualifiedName("foo.bar"); - auto foo3 = QualifiedName("bar.bar.baz"); - auto foo4 = QualifiedName("foo.bar"); - ASSERT_TRUE(foo2.isPrefixOf(foo1)); - ASSERT_TRUE(foo2.isPrefixOf(foo4)); - ASSERT_TRUE(foo4.isPrefixOf(foo2)); - ASSERT_FALSE(foo1.isPrefixOf(foo2)); - ASSERT_FALSE(foo2.isPrefixOf(foo3)); - } +TEST(QualifiedNameTest, IsPrefixOf) { + // test prefix api + auto foo1 = QualifiedName("foo.bar.baz"); + auto foo2 = QualifiedName("foo.bar"); + auto foo3 = QualifiedName("bar.bar.baz"); + auto foo4 = QualifiedName("foo.bar"); + ASSERT_TRUE(foo2.isPrefixOf(foo1)); + ASSERT_TRUE(foo2.isPrefixOf(foo4)); + ASSERT_TRUE(foo4.isPrefixOf(foo2)); + ASSERT_FALSE(foo1.isPrefixOf(foo2)); + ASSERT_FALSE(foo2.isPrefixOf(foo3)); } } // namespace jit } // namespace torch diff --git a/test/cpp/jit/test_save_load.cpp b/test/cpp/jit/test_save_load.cpp index 05940845d172..2e59358b4e00 100644 --- a/test/cpp/jit/test_save_load.cpp +++ b/test/cpp/jit/test_save_load.cpp @@ -1,4 +1,5 @@ -#include +#include + #include #include @@ -12,10 +13,10 @@ namespace torch { namespace jit { -// Tests that an extra file written explicitly has precedence over -// extra files written by a hook -// TODO: test for the warning, too -void testExtraFilesHookPreference() { +TEST(SerializationTest, ExtraFilesHookPreference) { + // Tests that an extra file written explicitly has precedence over + // extra files written by a hook + // TODO: test for the warning, too const auto script = R"JIT( def forward(self): x = torch.rand(5, 5) @@ -43,52 +44,50 @@ void testExtraFilesHookPreference() { ASSERT_EQ(loaded_extra_files["metadata.json"], "abc"); } -void testSaveExtraFilesHook() { +TEST(SerializationTest, ExtraFileHooksNoSecret) { // no secrets + std::stringstream ss; + { + Module m("__torch__.m"); + ExtraFilesMap extra; + extra["metadata.json"] = "abc"; + m.save(ss, extra); + } + ss.seekg(0); + { + ExtraFilesMap extra; + extra["metadata.json"] = ""; + extra["secret.json"] = ""; + jit::load(ss, c10::nullopt, extra); + ASSERT_EQ(extra["metadata.json"], "abc"); + ASSERT_EQ(extra["secret.json"], ""); + } +} + +TEST(SerializationTest, ExtraFileHooksWithSecret) { + std::stringstream ss; { - std::stringstream ss; - { - Module m("__torch__.m"); - ExtraFilesMap extra; - extra["metadata.json"] = "abc"; - m.save(ss, extra); - } - ss.seekg(0); - { - ExtraFilesMap extra; - extra["metadata.json"] = ""; - extra["secret.json"] = ""; - jit::load(ss, c10::nullopt, extra); - ASSERT_EQ(extra["metadata.json"], "abc"); - ASSERT_EQ(extra["secret.json"], ""); - } + SetExportModuleExtraFilesHook([](const Module&) -> ExtraFilesMap { + return {{"secret.json", "topsecret"}}; + }); + Module m("__torch__.m"); + ExtraFilesMap extra; + extra["metadata.json"] = "abc"; + m.save(ss, extra); + SetExportModuleExtraFilesHook(nullptr); } - // some secret + ss.seekg(0); { - std::stringstream ss; - { - SetExportModuleExtraFilesHook([](const Module&) -> ExtraFilesMap { - return {{"secret.json", "topsecret"}}; - }); - Module m("__torch__.m"); - ExtraFilesMap extra; - extra["metadata.json"] = "abc"; - m.save(ss, extra); - SetExportModuleExtraFilesHook(nullptr); - } - ss.seekg(0); - { - ExtraFilesMap extra; - extra["metadata.json"] = ""; - extra["secret.json"] = ""; - jit::load(ss, c10::nullopt, extra); - 
ASSERT_EQ(extra["metadata.json"], "abc"); - ASSERT_EQ(extra["secret.json"], "topsecret"); - } + ExtraFilesMap extra; + extra["metadata.json"] = ""; + extra["secret.json"] = ""; + jit::load(ss, c10::nullopt, extra); + ASSERT_EQ(extra["metadata.json"], "abc"); + ASSERT_EQ(extra["secret.json"], "topsecret"); } } -void testTypeTags() { +TEST(SerializationTest, TypeTags) { auto list = c10::List>(); list.push_back(c10::List({1, 2, 3})); list.push_back(c10::List({4, 5, 6})); diff --git a/test/cpp/jit/test_schema_matching.cpp b/test/cpp/jit/test_schema_matching.cpp index bea7d14dcaf2..aeeb173b2678 100644 --- a/test/cpp/jit/test_schema_matching.cpp +++ b/test/cpp/jit/test_schema_matching.cpp @@ -1,8 +1,9 @@ +#include + #include +#include #include #include -#include "test/cpp/jit/test_base.h" -#include "torch/csrc/jit/runtime/custom_operator.h" #include #include @@ -10,80 +11,79 @@ namespace torch { namespace jit { -void testSchemaMatching() { - { - RegisterOperators reg({ - Operator( - "aten::test_vartype(t[] a, t b) -> (t)", - [](Stack* stack) { - c10::List list; - double a; - pop(stack, list, a); - push(stack, a); - }, - c10::AliasAnalysisKind::FROM_SCHEMA), - }); - Module m("m"); - m.define(R"( +TEST(SchemaMatchingTest, VarType) { + RegisterOperators reg({ + Operator( + "aten::test_vartype(t[] a, t b) -> (t)", + [](Stack* stack) { + c10::List list; + double a; + pop(stack, list, a); + push(stack, a); + }, + c10::AliasAnalysisKind::FROM_SCHEMA), + }); + Module m("m"); + m.define(R"( def test(self): a = (1.0, 2.0) return torch.test_vartype(a, 2.0) )"); - auto result = m.run_method("test"); - TORCH_INTERNAL_ASSERT(result.toDouble() == 2.0); + auto result = m.run_method("test"); + TORCH_INTERNAL_ASSERT(result.toDouble() == 2.0); - const std::string error_example = R"JIT( + const std::string error_example = R"JIT( def test_2(self): a = (1.0, 2.0) non_float = (1, 1) return torch.test_vartype(a, non_float) )JIT"; - std::string err = ""; - try { - m.define(error_example); - } catch (const std::exception& e) { - err = e.what(); - } - TORCH_INTERNAL_ASSERT( - err.find("previously matched to type") != std::string::npos); + std::string err = ""; + try { + m.define(error_example); + } catch (const std::exception& e) { + err = e.what(); } - { - RegisterOperators reg({ - Operator( - "aten::test_vartype2(t a, t[] b) -> (t[])", - [](Stack* stack) { - double a; - c10::List list; - pop(stack, a, list); - push(stack, a); - }, - AliasAnalysisKind::FROM_SCHEMA), - }); - Module m("m"); - m.define(R"JIT( + TORCH_INTERNAL_ASSERT( + err.find("previously matched to type") != std::string::npos); +} + +TEST(SchemaMatchingTest, VarType2) { + RegisterOperators reg({ + Operator( + "aten::test_vartype2(t a, t[] b) -> (t[])", + [](Stack* stack) { + double a; + c10::List list; + pop(stack, a, list); + push(stack, a); + }, + AliasAnalysisKind::FROM_SCHEMA), + }); + Module m("m"); + m.define(R"JIT( def test(self): a = (1.0, 2.0) return torch.test_vartype2(3.0, a) )JIT"); - auto result = m.run_method("test"); - TORCH_INTERNAL_ASSERT(result.toDouble() == 3.0); + auto result = m.run_method("test"); + TORCH_INTERNAL_ASSERT(result.toDouble() == 3.0); - static const auto error_exam2 = R"JIT( + static const auto error_exam2 = R"JIT( def test_2(self): a = (1, 2) return torch.test_vartype2(3.0, a) )JIT"; - std::string err = ""; - try { - m.define(error_exam2); - } catch (const std::exception& e) { - err = e.what(); - } - TORCH_INTERNAL_ASSERT( - err.find("previously matched to type") != std::string::npos); + std::string err = ""; + try { + 
m.define(error_exam2); + } catch (const std::exception& e) { + err = e.what(); } + TORCH_INTERNAL_ASSERT( + err.find("previously matched to type") != std::string::npos); } } // namespace jit } // namespace torch diff --git a/test/cpp/jit/test_subgraph_matcher.cpp b/test/cpp/jit/test_subgraph_matcher.cpp index 2e398db44e95..39078d345269 100644 --- a/test/cpp/jit/test_subgraph_matcher.cpp +++ b/test/cpp/jit/test_subgraph_matcher.cpp @@ -1,11 +1,12 @@ -#include "test/cpp/jit/test_base.h" +#include + #include "test/cpp/jit/test_utils.h" #include "torch/csrc/jit/ir/subgraph_matcher.h" namespace torch { namespace jit { -void testTrivial1() { +TEST(SubgraphMatcherTest, Trivial1) { Graph graph, pattern; parseIR( R"IR( @@ -22,7 +23,7 @@ graph(%0): AT_ASSERT(!findPatternMatches(pattern, graph).empty()); } -void testTrivial2() { +TEST(SubgraphMatcherTest, Trivial2) { Graph graph; auto* g_in = graph.addInput(); auto* g_tanh = graph.insertNode(graph.create(aten::tanh, /*num_outputs =*/1)); @@ -45,7 +46,7 @@ void testTrivial2() { } } -void testTrivial3() { +TEST(SubgraphMatcherTest, Trivial3) { Graph graph, pattern; parseIR( R"IR( @@ -64,7 +65,7 @@ graph(%a, %b): AT_ASSERT(!findPatternMatches(pattern, graph).empty()); } -void testTrivial4() { +TEST(SubgraphMatcherTest, Trivial4) { Graph graph; auto* g_in0 = graph.addInput(); auto* g_in1 = graph.addInput(); @@ -92,7 +93,7 @@ void testTrivial4() { } } -void testLinear1() { +TEST(SubgraphMatcherTest, Linear1) { Graph graph, pattern; parseIR( R"IR( @@ -114,7 +115,7 @@ graph(%0): AT_ASSERT(!findPatternMatches(pattern, graph).empty()); } -void testLinear2() { +TEST(SubgraphMatcherTest, Linear2) { Graph graph; auto* g_in = graph.addInput(); @@ -164,7 +165,7 @@ void testLinear2() { * | * eee */ -void testDiamond1() { +TEST(SubgraphMatcherTest, Diamond1) { Graph graph, pattern1, pattern2; parseIR( R"IR( @@ -215,7 +216,7 @@ graph(%0): * | * o1 */ -void testDiamond2() { +TEST(SubgraphMatcherTest, Diamond2) { Graph graph; auto* g_in = graph.addInput(); @@ -253,7 +254,7 @@ void testDiamond2() { } } -void testXPattern() { +TEST(SubgraphMatcherTest, XPattern) { Graph graph, pattern; parseIR( R"IR( @@ -280,7 +281,7 @@ graph(%0, %1): AT_ASSERT(!findPatternMatches(pattern, graph).empty()); } -void testMultipleMatches() { +TEST(SubgraphMatcherTest, MultipleMatches) { Graph graph, pattern; parseIR( R"IR( @@ -301,7 +302,7 @@ graph(%t0): AT_ASSERT(matches.size() == 4); } -void testOverlappingMatches() { +TEST(SubgraphMatcherTest, OverlappingMatches) { Graph graph, pattern; parseIR( R"IR( @@ -323,7 +324,7 @@ graph(%t0): AT_ASSERT(matches.size() == 3); } -void testMatchInBasicBlocks1() { +TEST(SubgraphMatcherTest, MatchInBasicBlocks1) { Graph graph; parseIR( R"IR( @@ -360,7 +361,7 @@ graph(%x, %y): AT_ASSERT(findPatternMatches(pattern1, graph).size() == 0); } -void testMatchInBasicBlocks2() { +TEST(SubgraphMatcherTest, MatchInBasicBlocks2) { Graph graph; parseIR( R"IR( @@ -395,7 +396,7 @@ graph(%x, %y): AT_ASSERT(findPatternMatches(pattern1, graph).size() == 0); } -void testMatchesAttributes() { +TEST(SubgraphMatcherTest, MatchesAttributes) { Graph graph; parseIR( R"IR( @@ -479,7 +480,7 @@ graph(%a, %b): } } -void testBadPattern() { +TEST(SubgraphMatcherTest, BadPattern) { Graph graph, pattern1, pattern2; parseIR( R"IR( @@ -509,23 +510,5 @@ graph(%x): ASSERT_ANY_THROW(findPatternMatches(pattern2, graph)); } -void testSubgraphMatching() { - testTrivial1(); - testTrivial2(); - testTrivial3(); - testTrivial4(); - testLinear1(); - testLinear2(); - testDiamond1(); - testDiamond2(); 
- testXPattern(); - testMultipleMatches(); - testOverlappingMatches(); - testMatchInBasicBlocks1(); - testMatchInBasicBlocks2(); - testMatchesAttributes(); - testBadPattern(); -} - } // namespace jit } // namespace torch diff --git a/test/cpp/jit/test_subgraph_rewriter.cpp b/test/cpp/jit/test_subgraph_rewriter.cpp index 9799dfdb97b2..f166962ebc5c 100644 --- a/test/cpp/jit/test_subgraph_rewriter.cpp +++ b/test/cpp/jit/test_subgraph_rewriter.cpp @@ -1,4 +1,5 @@ -#include +#include + #include #include #include @@ -8,7 +9,7 @@ namespace torch { namespace jit { using namespace testing; -void testFilterMatch() { +TEST(SubgraphRewriterTest, FilterMatch) { auto graph = std::make_shared(); parseIR( @@ -80,7 +81,7 @@ graph(%a, %b): } } -void testFilterNoMatch() { +TEST(SubgraphRewriterTest, FilterNoMatch) { auto graph = std::make_shared(); parseIR( R"IR( @@ -121,10 +122,5 @@ graph(%a, %b): FileCheck().check("c::ccc")->check_not("d::ddd")->run(*graph); } -void testSubgraphRewriter() { - testFilterMatch(); - testFilterNoMatch(); -} - } // namespace jit } // namespace torch diff --git a/test/cpp/jit/test_subgraph_utils.cpp b/test/cpp/jit/test_subgraph_utils.cpp index e1f86cc34979..09e01f8836da 100644 --- a/test/cpp/jit/test_subgraph_utils.cpp +++ b/test/cpp/jit/test_subgraph_utils.cpp @@ -1,4 +1,5 @@ -#include "test/cpp/jit/test_base.h" +#include + #include "test/cpp/jit/test_utils.h" #include "torch/csrc/jit/passes/common_subexpression_elimination.h" @@ -7,7 +8,7 @@ namespace torch { namespace jit { -void testSubgraphUtils() { +TEST(SubgraphUtilsTest, Basic) { auto graph = build_lstm(); EliminateCommonSubexpression(graph); @@ -37,7 +38,7 @@ void testSubgraphUtils() { ASSERT_EQ(originalNodes.size(), newNodes.size()); } -void testSubgraphUtilsVmap() { +TEST(SubgraphUtilsTest, Vmap) { auto graph = std::make_shared(); std::unordered_map parse_map; diff --git a/test/cpp/jit/test_utils.cpp b/test/cpp/jit/test_utils.cpp index d87e8201615d..6f626756db74 100644 --- a/test/cpp/jit/test_utils.cpp +++ b/test/cpp/jit/test_utils.cpp @@ -1,6 +1,9 @@ +#include + #include #include #include +#include namespace torch { namespace jit { @@ -137,5 +140,22 @@ std::pair lstm( return {hy, cy}; } +inline c10::AliasAnalysisKind aliasAnalysisFromSchema() { + return c10::AliasAnalysisKind::FROM_SCHEMA; +} + +namespace { +RegisterOperators reg({ + // This operator is intended to be used in JIT analysis and transformation + // pass unit tests in which Values with type Tensor are often required. It + // should not be used in situations in which the graph is actually executed + // because it always produces empty Tensors. + Operator( + "prim::MakeTestTensor() -> Tensor", + [](Stack* stack) { push(stack, at::Tensor()); }, + aliasAnalysisFromSchema()), +}); +} + } // namespace jit } // namespace torch diff --git a/test/cpp/jit/test_utils.h b/test/cpp/jit/test_utils.h index 6e6b82fff442..109f7253deea 100644 --- a/test/cpp/jit/test_utils.h +++ b/test/cpp/jit/test_utils.h @@ -1,7 +1,6 @@ #pragma once #include -#include "test/cpp/jit/test_base.h" #include "torch/csrc/jit/ir/irparser.h" #include "torch/csrc/jit/runtime/autodiff.h" #include "torch/csrc/jit/runtime/interpreter.h" diff --git a/test/cpp/jit/tests.h b/test/cpp/jit/tests.h deleted file mode 100644 index df45054edc43..000000000000 --- a/test/cpp/jit/tests.h +++ /dev/null @@ -1,246 +0,0 @@ -#pragma once - -/** - * See README.md for instructions on how to add a new test. 
- */ -#include -#include - -namespace torch { -namespace jit { -#define TH_FORALL_TESTS(_) \ - _(ADFormulas) \ - _(Attributes) \ - _(Blocks) \ - _(CallStack) \ - _(CallStackCaching) \ - _(CodeTemplate) \ - _(ControlFlow) \ - _(CreateAutodiffSubgraphs) \ - _(CustomOperators) \ - _(CustomOperatorAliasing) \ - _(TemplatedOperatorCreator) \ - _(IValueKWargs) \ - _(CustomFusion) \ - _(SchemaMatching) \ - _(Differentiate) \ - _(DifferentiateWithRequiresGrad) \ - _(FromQualString) \ - _(InternedStrings) \ - _(PassManagement) \ - _(Proto) \ - _(RegisterFusionCachesKernel) \ - _(SchemaParser) \ - _(TopologicalIndex) \ - _(SubgraphUtils) \ - _(SubgraphUtilsVmap) \ - _(IRParser) \ - _(ConstantPooling) \ - _(CleanUpPasses) \ - _(THNNConv) \ - _(ATenNativeBatchNorm) \ - _(NoneSchemaMatch) \ - _(ClassParser) \ - _(UnifyTypes) \ - _(Profiler) \ - _(FallbackGraphs) \ - _(InsertAndEliminateRedundantGuards) \ - _(LoopPeeler) \ - _(InsertBailOuts) \ - _(PeepholeOptimize) \ - _(RecordFunction) \ - _(ThreadLocalDebugInfo) \ - _(SubgraphMatching) \ - _(SubgraphRewriter) \ - _(ModuleClone) \ - _(ModuleConstant) \ - _(ModuleParameter) \ - _(ModuleCopy) \ - _(ModuleDeepcopy) \ - _(ModuleDeepcopyString) \ - _(ModuleDeepcopyAliasing) \ - _(ModuleDefine) \ - _(QualifiedName) \ - _(ClassImport) \ - _(ScriptObject) \ - _(ExtraFilesHookPreference) \ - _(SaveExtraFilesHook) \ - _(TypeTags) \ - _(DCE) \ - _(CustomFusionNestedBlocks) \ - _(ClassDerive) \ - _(SaveLoadTorchbind) \ - _(ModuleInterfaceSerialization) \ - _(ModuleCloneWithModuleInterface) \ - _(ClassTypeAddRemoveAttr) \ - _(Inliner) \ - _(LiteInterpreterAdd) \ - _(LiteInterpreterConv) \ - _(LiteInterpreterInline) \ - _(LiteInterpreterTuple) \ - _(LiteInterpreterUpsampleNearest2d) \ - _(CommonAncestor) \ - _(AutogradSymbols) \ - _(DefaultArgTypeHinting) \ - _(Futures) \ - _(TLSFutureCallbacks) \ - _(MobileTypeParser) \ - _(LiteInterpreterBuiltinFunction) \ - _(LiteInterpreterPrim) \ - _(LiteInterpreterLoadOrigJit) \ - _(LiteInterpreterWrongMethodName) \ - _(LiteInterpreterParams) \ - _(LiteInterpreterSetState) \ - _(LiteInterpreterModuleInfoBasic) \ - _(LiteInterpreterNotSavingModuleInfo) \ - _(LiteInterpreterOneSubmoduleModuleInfo) \ - _(LiteInterpreterTwoSubmodulesModuleInfo) \ - _(LiteInterpreterSequentialModuleInfo) \ - _(LiteInterpreterHierarchyModuleInfo) \ - _(LiteInterpreterDuplicatedClassTypeModuleInfo) \ - _(LiteInterpreterEval) \ - _(TorchbindIValueAPI) \ - _(LiteInterpreterDict) \ - _(LiteInterpreterFindAndRunMethod) \ - _(LiteInterpreterFindWrongMethodName) \ - _(MobileNamedParameters) \ - _(MobileSaveLoadData) \ - _(MobileSaveLoadParameters) \ - _(MobileSaveLoadParametersEmpty) \ - _(LiteSGD) \ - _(LiteSequentialSampler) \ - _(FusionAliasing) - -#if defined(USE_CUDA) -#define TH_FORALL_TESTS_CUDA(_) \ - _(ArgumentSpec) \ - _(CompleteArgumentSpec) \ - _(Fusion) \ - _(GraphExecutor) \ - _(ModuleConversion) \ - _(Interp) \ - _(TypeCheck) \ - _(GPU_IrGraphGenerator) \ - _(GPU_FusionDispatch) \ - _(GPU_FusionClear) \ - _(GPU_FusionCopy) \ - _(GPU_FusionMove) \ - _(GPU_FusionSimpleArith) \ - _(GPU_FusionExprEvalConstants) \ - _(GPU_FusionExprEvalBindings) \ - _(GPU_FusionExprEvalBasic) \ - _(GPU_FusionExprEvalComplex) \ - _(GPU_FusionExprEvalPostLower) \ - _(GPU_FusionSimpleTypePromote) \ - _(GPU_FusionMutator) \ - _(GPU_FusionRegister) \ - _(GPU_FusionTopoSort) \ - _(GPU_FusionTensor) \ - _(GPU_FusionFilterVals) \ - _(GPU_FusionTVSplit) \ - _(GPU_FusionTVMerge) \ - _(GPU_FusionTVReorder) \ - _(GPU_FusionEquality) \ - _(GPU_FusionParser) \ - 
_(GPU_FusionDependency) \ - _(GPU_FusionCodeGen) \ - _(GPU_FusionCodeGen2) \ - _(GPU_FusionSimplePWise) \ - _(GPU_FusionExecKernel) \ - _(GPU_FusionForLoop) \ - _(GPU_FusionLoopUnroll) \ - _(GPU_FusionUnaryOps) \ - _(GPU_FusionBinaryOps) \ - _(GPU_FusionTernaryOps) \ - _(GPU_FusionCompoundOps) \ - _(GPU_FusionCastOps) \ - _(GPU_FusionAdvancedComputeAt) \ - _(GPU_FusionScalarInputs) \ - _(GPU_FusionRFactorReplay) \ - _(GPU_FusionReduction) \ - _(GPU_FusionReduction2) \ - _(GPU_FusionReduction3) \ - _(GPU_FusionReduction4) \ - _(GPU_FusionReduction5) \ - _(GPU_FusionReductionTFT) \ - _(GPU_FusionSimpleBCast) \ - _(GPU_FusionComplexBCast) \ - _(GPU_FusionAdvancedIndexing) \ - _(GPU_FusionSimpleGemm) \ - _(GPU_FusionSoftmax1D) \ - _(GPU_FusionSoftmax1DNormalized) \ - _(GPU_FusionSoftmax3D) \ - _(GPU_FusionSoftmax3DNormalized) \ - _(GPU_FusionSoftmaxComputeAt) \ - _(GPU_FusionGridReduction1) \ - _(GPU_FusionGridReduction2) \ - _(GPU_FusionGridReduction3dim1) \ - _(GPU_FusionGridReduction3dim0) \ - _(GPU_FusionGridReduction4) \ - _(GPU_FusionGridReduction5) \ - _(GPU_FusionGridReduction6) \ - _(GPU_FusionNonRedAxisBind) \ - _(GPU_FusionBCastInnerDim) \ - _(GPU_FusionBCastReduce) \ - _(GPU_FusionSplitBCast) \ - _(GPU_FusionComputeAtExprOrder) \ - _(GPU_FusionZeroDimComputeAt) \ - _(GPU_FusionZeroDimBroadcast) \ - _(GPU_FusionZeroDimReduction) \ - _(GPU_FusionReductionMultiConsumer) \ - _(GPU_FusionBCastAfterReduce) \ - _(GPU_FusionReductionScheduler) \ - _(GPU_FusionReductionSchedulerMultiDimNonFastest) \ - _(GPU_FusionReductionSchedulerMultiDimFastest) \ - _(GPU_FusionReductionSchedulerDimShmoo) \ - _(GPU_FusionCacheBefore) \ - _(GPU_FusionCacheAfter) \ - _(GPU_FusionCacheIndirect) \ - _(GPU_FusionCacheBcast) \ - _(GPU_FusionCacheComplex) \ - _(GPU_FusionCacheMultiConsumer) \ - _(GPU_FusionSmem) \ - _(GPU_FusionSmemReduce) \ - _(GPU_FusionSmemBlockGemm) \ - _(GPU_FusionSmemBlockGemmCache) \ - _(GPU_FusionConstCheck) \ - _(GPU_FusionSymbolicReduction) \ - _(GPU_FusionUnrollWithAlloc) \ - _(GPU_FusionIsZeroInt) \ - _(GPU_FusionIsOneInt) \ - _(GPU_FusionComputeAtNonterminatingOutput) \ - _(GPU_FusionTraversalOrder1) \ - _(GPU_FusionTraversalOrder2) \ - _(GPU_FusionTraversalOrder3) \ - _(GPU_FusionTraversalOrder4) \ - _(GPU_FusionTraversalOrder5) \ - _(GPU_FusionTraversalOrder6) \ - _(GPU_FusionTraversalOrder7) \ - _(GPU_FusionBranches) \ - _(GPU_FusionThreadPredicate) -#else -#define TH_FORALL_TESTS_CUDA(_) \ - _(ArgumentSpec) \ - _(CompleteArgumentSpec) \ - _(Fusion) \ - _(GraphExecutor) \ - _(ModuleConversion) \ - _(Interp) \ - _(TypeCheck) -#endif - -#define DECLARE_JIT_TEST(name) void test##name(); -TH_FORALL_TESTS(DECLARE_JIT_TEST) -TH_FORALL_TESTS_CUDA(DECLARE_JIT_TEST) -#undef DECLARE_JIT_TEST - -// This test is special since it requires prior setup in python. 
-// So it is not part of the general test list (which is shared between the gtest -// and python test runners), but is instead invoked manually by the -// torch_python_test.cpp -void testEvalModeForLoadedModule(); -void testSerializationInterop(); -void testTorchSaveError(); - -} // namespace jit -} // namespace torch diff --git a/test/cpp/rpc/test_e2e_process_group.cpp b/test/cpp/rpc/test_e2e_process_group.cpp index d509a4606fa1..7c5af57d6a09 100644 --- a/test/cpp/rpc/test_e2e_process_group.cpp +++ b/test/cpp/rpc/test_e2e_process_group.cpp @@ -19,6 +19,7 @@ class TestE2EProcessGroup : public TestE2EBase { options.devices.push_back( ::c10d::ProcessGroupGloo::createDeviceForHostname(serverAddress)); std::chrono::milliseconds rpcTimeout(30000); + options.timeout = rpcTimeout; // Initialize server rpc agent. auto pg = diff --git a/test/cpp/tensorexpr/CMakeLists.txt b/test/cpp/tensorexpr/CMakeLists.txt index a2922045adff..af4299e395cd 100644 --- a/test/cpp/tensorexpr/CMakeLists.txt +++ b/test/cpp/tensorexpr/CMakeLists.txt @@ -12,29 +12,45 @@ add_executable(test_tensorexpr target_link_libraries(test_tensorexpr PRIVATE torch gtest) target_include_directories(test_tensorexpr PRIVATE ${ATen_CPU_INCLUDE}) +add_executable(tutorial_tensorexpr ${TENSOREXPR_TEST_ROOT}/tutorial.cpp) +target_link_libraries(tutorial_tensorexpr PRIVATE torch) +target_include_directories(tutorial_tensorexpr PRIVATE ${ATen_CPU_INCLUDE}) + + if(USE_CUDA) target_link_libraries(test_tensorexpr PRIVATE ${CUDA_LIBRARIES} ${CUDA_NVRTC_LIB} ${CUDA_CUDA_LIB} ${TORCH_CUDA_LIBRARIES}) - target_compile_definitions(test_tensorexpr PRIVATE USE_CUDA) + + target_link_libraries(tutorial_tensorexpr PRIVATE + ${CUDA_LIBRARIES} + ${CUDA_NVRTC_LIB} + ${CUDA_CUDA_LIB} + ${TORCH_CUDA_LIBRARIES}) + target_compile_definitions(tutorial_tensorexpr PRIVATE USE_CUDA) elseif(USE_ROCM) target_link_libraries(test_tensorexpr PRIVATE ${ROCM_HIPRTC_LIB} ${PYTORCH_HIP_HCC_LIBRARIES} ${TORCH_CUDA_LIBRARIES}) - - target_link_libraries(test_tensorexpr PRIVATE caffe2_gpu) - target_compile_definitions(test_tensorexpr PRIVATE USE_ROCM) + + target_link_libraries(tutorial_tensorexpr PRIVATE + ${ROCM_HIPRTC_LIB} + ${PYTORCH_HIP_HCC_LIBRARIES} + ${TORCH_CUDA_LIBRARIES}) + target_compile_definitions(tutorial_tensorexpr PRIVATE USE_ROCM) endif() if(INSTALL_TEST) install(TARGETS test_tensorexpr DESTINATION bin) + install(TARGETS tutorial_tensorexpr DESTINATION bin) # Install PDB files for MSVC builds if(MSVC AND BUILD_SHARED_LIBS) install(FILES $ DESTINATION bin OPTIONAL) + install(FILES $ DESTINATION bin OPTIONAL) endif() endif() diff --git a/test/cpp/tensorexpr/test_aten.cpp b/test/cpp/tensorexpr/test_aten.cpp index 3ccc484c8420..ca642d1db96e 100644 --- a/test/cpp/tensorexpr/test_aten.cpp +++ b/test/cpp/tensorexpr/test_aten.cpp @@ -15,13 +15,13 @@ using namespace torch::jit::tensorexpr; void testATen_cast_Float() { KernelScope kernel_scope; const int kTotalSize = 128; - Buffer a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kInt)); - Buffer b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kInt)); + Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); VarHandle index = VarHandle("index", kInt); - ExprHandle load_a = Load::make(a_buf, {index}, 1); + ExprHandle load_a = a_buf.load(index); ExprHandle to_float = Cast::make(kFloat, load_a); - Stmt* store_b = Store::make(b_buf, {index}, to_float, 1); + Stmt* store_b = b_buf.store({index}, to_float); Stmt* stmt = For::make(index, 0, kTotalSize, 
store_b); PaddedBuffer a_v(kTotalSize); @@ -43,13 +43,13 @@ void testATen_cast_Float() { void testATennegInt() { KernelScope kernel_scope; const int kTotalSize = 128; - Buffer a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kInt)); - Buffer b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kInt)); + Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kInt)); + Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kInt)); VarHandle index = VarHandle("index", kInt); - ExprHandle load_a = Load::make(a_buf, {index}, 1); + ExprHandle load_a = a_buf.load(index); ExprHandle to_float = Sub::make(0, load_a); - Stmt* store_b = Store::make(b_buf, {index}, to_float, 1); + Stmt* store_b = b_buf.store({index}, to_float); Stmt* stmt = For::make(index, 0, kTotalSize, store_b); PaddedBuffer a_v(kTotalSize); @@ -71,13 +71,13 @@ void testATennegInt() { void testATennegFloat() { KernelScope kernel_scope; const int kTotalSize = 128; - Buffer a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); - Buffer b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); VarHandle index = VarHandle("index", kInt); - ExprHandle load_a = Load::make(a_buf, {index}, 1); + ExprHandle load_a = a_buf.load(index); ExprHandle to_float = Sub::make(0, load_a); - Stmt* store_b = Store::make(b_buf, {index}, to_float, 1); + Stmt* store_b = b_buf.store({index}, to_float); Stmt* stmt = For::make(index, 0, kTotalSize, store_b); PaddedBuffer a_v(kTotalSize); @@ -99,16 +99,16 @@ void testATennegFloat() { void testATenaddInt() { KernelScope kernel_scope; const int kTotalSize = 128; - Buffer a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kInt)); - Buffer b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kInt)); - Buffer c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kInt)); - Buffer d_buf(BufHandle("D", {ExprHandle(kTotalSize)}, kInt)); + Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kInt)); + Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kInt)); + Placeholder c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kInt)); + Placeholder d_buf(BufHandle("D", {ExprHandle(kTotalSize)}, kInt)); VarHandle index = VarHandle("index", kInt); - ExprHandle load_a = Load::make(a_buf, {index}, 1); - ExprHandle load_b = Load::make(b_buf, {index}, 1); - ExprHandle load_c = Load::make(c_buf, {index}, 1); - Stmt* store_d = Store::make(d_buf, {index}, load_a + load_b * load_c, 1); + ExprHandle load_a = a_buf.load(index); + ExprHandle load_b = b_buf.load(index); + ExprHandle load_c = c_buf.load(index); + Stmt* store_d = d_buf.store({index}, load_a + load_b * load_c); Stmt* stmt = For::make(index, 0, kTotalSize, store_d); PaddedBuffer a_v(kTotalSize); @@ -136,16 +136,16 @@ void testATenaddInt() { void testATenaddFloat() { KernelScope kernel_scope; const int kTotalSize = 128; - Buffer a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); - Buffer b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); - Buffer c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kFloat)); - Buffer d_buf(BufHandle("D", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder d_buf(BufHandle("D", {ExprHandle(kTotalSize)}, kFloat)); VarHandle index = VarHandle("index", kInt); - ExprHandle load_a = Load::make(a_buf, 
{index}, 1); - ExprHandle load_b = Load::make(b_buf, {index}, 1); - ExprHandle load_c = Load::make(c_buf, {index}, 1); - Stmt* store_d = Store::make(d_buf, {index}, load_a + load_b * load_c, 1); + ExprHandle load_a = a_buf.load(index); + ExprHandle load_b = b_buf.load(index); + ExprHandle load_c = c_buf.load(index); + Stmt* store_d = d_buf.store({index}, load_a + load_b * load_c); Stmt* stmt = For::make(index, 0, kTotalSize, store_d); PaddedBuffer a_v(kTotalSize); @@ -173,16 +173,16 @@ void testATenaddFloat() { void testATensubInt() { KernelScope kernel_scope; const int kTotalSize = 128; - Buffer a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kInt)); - Buffer b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kInt)); - Buffer c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kInt)); - Buffer d_buf(BufHandle("D", {ExprHandle(kTotalSize)}, kInt)); + Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kInt)); + Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kInt)); + Placeholder c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kInt)); + Placeholder d_buf(BufHandle("D", {ExprHandle(kTotalSize)}, kInt)); VarHandle index = VarHandle("index", kInt); - ExprHandle load_a = Load::make(a_buf, {index}, 1); - ExprHandle load_b = Load::make(b_buf, {index}, 1); - ExprHandle load_c = Load::make(c_buf, {index}, 1); - Stmt* store_d = Store::make(d_buf, {index}, load_a - load_b * load_c, 1); + ExprHandle load_a = a_buf.load(index); + ExprHandle load_b = b_buf.load(index); + ExprHandle load_c = c_buf.load(index); + Stmt* store_d = d_buf.store({index}, load_a - load_b * load_c); Stmt* stmt = For::make(index, 0, kTotalSize, store_d); PaddedBuffer a_v(kTotalSize); @@ -210,16 +210,16 @@ void testATensubInt() { void testATensubFloat() { KernelScope kernel_scope; const int kTotalSize = 128; - Buffer a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); - Buffer b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); - Buffer c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kFloat)); - Buffer d_buf(BufHandle("D", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder d_buf(BufHandle("D", {ExprHandle(kTotalSize)}, kFloat)); VarHandle index = VarHandle("index", kInt); - ExprHandle load_a = Load::make(a_buf, {index}, 1); - ExprHandle load_b = Load::make(b_buf, {index}, 1); - ExprHandle load_c = Load::make(c_buf, {index}, 1); - Stmt* store_d = Store::make(d_buf, {index}, load_a - load_b * load_c, 1); + ExprHandle load_a = a_buf.load(index); + ExprHandle load_b = b_buf.load(index); + ExprHandle load_c = c_buf.load(index); + Stmt* store_d = d_buf.store({index}, load_a - load_b * load_c); Stmt* stmt = For::make(index, 0, kTotalSize, store_d); PaddedBuffer a_v(kTotalSize); @@ -247,17 +247,16 @@ void testATensubFloat() { void testATenlerp() { KernelScope kernel_scope; const int kTotalSize = 128; - Buffer a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); - Buffer b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); - Buffer c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kFloat)); - Buffer d_buf(BufHandle("D", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder d_buf(BufHandle("D", 
{ExprHandle(kTotalSize)}, kFloat)); VarHandle index = VarHandle("index", kInt); - ExprHandle load_a = Load::make(a_buf, {index}, 1); - ExprHandle load_b = Load::make(b_buf, {index}, 1); - ExprHandle load_c = Load::make(c_buf, {index}, 1); - Stmt* store_d = - Store::make(d_buf, {index}, load_a + load_c * (load_b - load_a), 1); + ExprHandle load_a = a_buf.load(index); + ExprHandle load_b = b_buf.load(index); + ExprHandle load_c = c_buf.load(index); + Stmt* store_d = d_buf.store({index}, load_a + load_c * (load_b - load_a)); Stmt* stmt = For::make(index, 0, kTotalSize, store_d); PaddedBuffer a_v(kTotalSize); @@ -285,19 +284,18 @@ void testATenlerp() { void testATenaddcmulInt() { KernelScope kernel_scope; const int kTotalSize = 128; - Buffer a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kInt)); - Buffer b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kInt)); - Buffer c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kInt)); - Buffer d_buf(BufHandle("D", {ExprHandle(kTotalSize)}, kInt)); - Buffer e_buf(BufHandle("E", {ExprHandle(kTotalSize)}, kInt)); + Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kInt)); + Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kInt)); + Placeholder c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kInt)); + Placeholder d_buf(BufHandle("D", {ExprHandle(kTotalSize)}, kInt)); + Placeholder e_buf(BufHandle("E", {ExprHandle(kTotalSize)}, kInt)); VarHandle index = VarHandle("index", kInt); - ExprHandle load_a = Load::make(a_buf, {index}, 1); - ExprHandle load_b = Load::make(b_buf, {index}, 1); - ExprHandle load_c = Load::make(c_buf, {index}, 1); - ExprHandle load_d = Load::make(d_buf, {index}, 1); - Stmt* store_e = - Store::make(e_buf, {index}, load_a + load_b * load_c * load_d, 1); + ExprHandle load_a = a_buf.load(index); + ExprHandle load_b = b_buf.load(index); + ExprHandle load_c = c_buf.load(index); + ExprHandle load_d = d_buf.load(index); + Stmt* store_e = e_buf.store({index}, load_a + load_b * load_c * load_d); Stmt* stmt = For::make(index, 0, kTotalSize, store_e); PaddedBuffer a_v(kTotalSize); @@ -328,19 +326,18 @@ void testATenaddcmulInt() { void testATenaddcmulFloat() { KernelScope kernel_scope; const int kTotalSize = 128; - Buffer a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); - Buffer b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); - Buffer c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kFloat)); - Buffer d_buf(BufHandle("D", {ExprHandle(kTotalSize)}, kFloat)); - Buffer e_buf(BufHandle("E", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder d_buf(BufHandle("D", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder e_buf(BufHandle("E", {ExprHandle(kTotalSize)}, kFloat)); VarHandle index = VarHandle("index", kInt); - ExprHandle load_a = Load::make(a_buf, {index}, 1); - ExprHandle load_b = Load::make(b_buf, {index}, 1); - ExprHandle load_c = Load::make(c_buf, {index}, 1); - ExprHandle load_d = Load::make(d_buf, {index}, 1); - Stmt* store_e = - Store::make(e_buf, {index}, load_a + load_b * load_c * load_d, 1); + ExprHandle load_a = a_buf.load(index); + ExprHandle load_b = b_buf.load(index); + ExprHandle load_c = c_buf.load(index); + ExprHandle load_d = d_buf.load(index); + Stmt* store_e = e_buf.store({index}, load_a + load_b * load_c * load_d); Stmt* stmt = For::make(index, 0, kTotalSize, store_e); PaddedBuffer 
a_v(kTotalSize); @@ -371,14 +368,14 @@ void testATenaddcmulFloat() { void testATenmulInt() { KernelScope kernel_scope; const int kTotalSize = 128; - Buffer a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kInt)); - Buffer b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kInt)); - Buffer c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kInt)); + Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kInt)); + Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kInt)); + Placeholder c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kInt)); VarHandle index = VarHandle("index", kInt); - ExprHandle load_a = Load::make(a_buf, {index}, 1); - ExprHandle load_b = Load::make(b_buf, {index}, 1); - Stmt* store_c = Store::make(c_buf, {index}, load_a * load_b, 1); + ExprHandle load_a = a_buf.load(index); + ExprHandle load_b = b_buf.load(index); + Stmt* store_c = c_buf.store({index}, load_a * load_b); Stmt* stmt = For::make(index, 0, kTotalSize, store_c); PaddedBuffer a_v(kTotalSize); @@ -403,14 +400,14 @@ void testATenmulInt() { void testATenmulFloat() { KernelScope kernel_scope; const int kTotalSize = 128; - Buffer a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); - Buffer b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); - Buffer c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kFloat)); VarHandle index = VarHandle("index", kInt); - ExprHandle load_a = Load::make(a_buf, {index}, 1); - ExprHandle load_b = Load::make(b_buf, {index}, 1); - Stmt* store_c = Store::make(c_buf, {index}, load_a * load_b, 1); + ExprHandle load_a = a_buf.load(index); + ExprHandle load_b = b_buf.load(index); + Stmt* store_c = c_buf.store({index}, load_a * load_b); Stmt* stmt = For::make(index, 0, kTotalSize, store_c); PaddedBuffer a_v(kTotalSize); @@ -435,14 +432,14 @@ void testATenmulFloat() { void testATendivInt() { KernelScope kernel_scope; const int kTotalSize = 128; - Buffer a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kInt)); - Buffer b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kInt)); - Buffer c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kInt)); + Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kInt)); + Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kInt)); + Placeholder c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kInt)); VarHandle index = VarHandle("index", kInt); - ExprHandle load_a = Load::make(a_buf, {index}, 1); - ExprHandle load_b = Load::make(b_buf, {index}, 1); - Stmt* store_c = Store::make(c_buf, {index}, load_a / load_b, 1); + ExprHandle load_a = a_buf.load(index); + ExprHandle load_b = b_buf.load(index); + Stmt* store_c = c_buf.store({index}, load_a / load_b); Stmt* stmt = For::make(index, 0, kTotalSize, store_c); PaddedBuffer a_v(kTotalSize); @@ -467,14 +464,14 @@ void testATendivInt() { void testATendivFloat() { KernelScope kernel_scope; const int kTotalSize = 128; - Buffer a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); - Buffer b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); - Buffer c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kFloat)); VarHandle index = VarHandle("index", kInt); - ExprHandle load_a = Load::make(a_buf, 
{index}, 1); - ExprHandle load_b = Load::make(b_buf, {index}, 1); - Stmt* store_c = Store::make(c_buf, {index}, load_a / load_b, 1); + ExprHandle load_a = a_buf.load(index); + ExprHandle load_b = b_buf.load(index); + Stmt* store_c = c_buf.store({index}, load_a / load_b); Stmt* stmt = For::make(index, 0, kTotalSize, store_c); PaddedBuffer a_v(kTotalSize); @@ -499,15 +496,14 @@ void testATendivFloat() { void testATenmaxInt() { KernelScope kernel_scope; const int kTotalSize = 128; - Buffer a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kInt)); - Buffer b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kInt)); - Buffer c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kInt)); + Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kInt)); + Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kInt)); + Placeholder c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kInt)); VarHandle index = VarHandle("index", kInt); - ExprHandle load_a = Load::make(a_buf, {index}, 1); - ExprHandle load_b = Load::make(b_buf, {index}, 1); - Stmt* store_c = - Store::make(c_buf, {index}, Max::make(load_a, load_b, true), 1); + ExprHandle load_a = a_buf.load(index); + ExprHandle load_b = b_buf.load(index); + Stmt* store_c = c_buf.store({index}, Max::make(load_a, load_b, true)); Stmt* stmt = For::make(index, 0, kTotalSize, store_c); PaddedBuffer a_v(kTotalSize); @@ -532,15 +528,14 @@ void testATenmaxInt() { void testATenmaxFloat() { KernelScope kernel_scope; const int kTotalSize = 128; - Buffer a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); - Buffer b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); - Buffer c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kFloat)); VarHandle index = VarHandle("index", kInt); - ExprHandle load_a = Load::make(a_buf, {index}, 1); - ExprHandle load_b = Load::make(b_buf, {index}, 1); - Stmt* store_c = - Store::make(c_buf, {index}, Max::make(load_a, load_b, true), 1); + ExprHandle load_a = a_buf.load(index); + ExprHandle load_b = b_buf.load(index); + Stmt* store_c = c_buf.store({index}, Max::make(load_a, load_b, true)); Stmt* stmt = For::make(index, 0, kTotalSize, store_c); PaddedBuffer a_v(kTotalSize); @@ -565,15 +560,14 @@ void testATenmaxFloat() { void testATenminInt() { KernelScope kernel_scope; const int kTotalSize = 128; - Buffer a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kInt)); - Buffer b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kInt)); - Buffer c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kInt)); + Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kInt)); + Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kInt)); + Placeholder c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kInt)); VarHandle index = VarHandle("index", kInt); - ExprHandle load_a = Load::make(a_buf, {index}, 1); - ExprHandle load_b = Load::make(b_buf, {index}, 1); - Stmt* store_c = - Store::make(c_buf, {index}, Min::make(load_a, load_b, true), 1); + ExprHandle load_a = a_buf.load(index); + ExprHandle load_b = b_buf.load(index); + Stmt* store_c = c_buf.store({index}, Min::make(load_a, load_b, true)); Stmt* stmt = For::make(index, 0, kTotalSize, store_c); PaddedBuffer a_v(kTotalSize); @@ -598,15 +592,14 @@ void testATenminInt() { void testATenminFloat() { KernelScope kernel_scope; const int kTotalSize = 128; - Buffer a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, 
kFloat)); - Buffer b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); - Buffer c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kFloat)); VarHandle index = VarHandle("index", kInt); - ExprHandle load_a = Load::make(a_buf, {index}, 1); - ExprHandle load_b = Load::make(b_buf, {index}, 1); - Stmt* store_c = - Store::make(c_buf, {index}, Min::make(load_a, load_b, true), 1); + ExprHandle load_a = a_buf.load(index); + ExprHandle load_b = b_buf.load(index); + Stmt* store_c = c_buf.store({index}, Min::make(load_a, load_b, true)); Stmt* stmt = For::make(index, 0, kTotalSize, store_c); PaddedBuffer a_v(kTotalSize); @@ -631,12 +624,12 @@ void testATenminFloat() { void __ubsan_ignore_float_divide_by_zero__ testATenreciprocal() { KernelScope kernel_scope; const int kTotalSize = 128; - Buffer a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); - Buffer b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); VarHandle index = VarHandle("index", kInt); - ExprHandle load_a = Load::make(a_buf, {index}, 1); - Stmt* store_b = Store::make(b_buf, {index}, FloatImm::make(1.0f) / load_a, 1); + ExprHandle load_a = a_buf.load(index); + Stmt* store_b = b_buf.store({index}, FloatImm::make(1.0f) / load_a); Stmt* stmt = For::make(index, 0, kTotalSize, store_b); PaddedBuffer a_v(kTotalSize); @@ -658,12 +651,12 @@ void __ubsan_ignore_float_divide_by_zero__ testATenreciprocal() { void testATenreluInt() { KernelScope kernel_scope; const int kTotalSize = 128; - Buffer a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kInt)); - Buffer b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kInt)); + Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kInt)); + Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kInt)); VarHandle index = VarHandle("index", kInt); - ExprHandle load_a = Load::make(a_buf, {index}, 1); - Stmt* store_b = Store::make(b_buf, {index}, Max::make(load_a, 0, false), 1); + ExprHandle load_a = a_buf.load(index); + Stmt* store_b = b_buf.store({index}, Max::make(load_a, 0, false)); Stmt* stmt = For::make(index, 0, kTotalSize, store_b); PaddedBuffer a_v(kTotalSize); @@ -685,16 +678,14 @@ void testATenreluInt() { void testATenreluFloat() { KernelScope kernel_scope; const int kTotalSize = 128; - Buffer a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); - Buffer b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); VarHandle index = VarHandle("index", kInt); - ExprHandle load_a = Load::make(a_buf, {index}, 1); - Stmt* store_b = Store::make( - b_buf, - {index}, - Max::make(load_a, 0, false), // relu does not propagate nans - 1); + ExprHandle load_a = a_buf.load(index); + Stmt* store_b = b_buf.store( + {index}, Max::make(load_a, 0, false) // relu does not propagate nans + ); Stmt* stmt = For::make(index, 0, kTotalSize, store_b); PaddedBuffer a_v(kTotalSize); @@ -716,12 +707,12 @@ void testATenreluFloat() { void testATenlogFloat() { KernelScope kernel_scope; const int kTotalSize = 128; - Buffer a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); - Buffer b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, 
kFloat)); + Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); VarHandle index = VarHandle("index", kInt); - ExprHandle load_a = Load::make(a_buf, {index}, 1); - Stmt* store_b = Store::make(b_buf, {index}, log(load_a), 1); + ExprHandle load_a = a_buf.load(index); + Stmt* store_b = b_buf.store({index}, log(load_a)); Stmt* stmt = For::make(index, 0, kTotalSize, store_b); PaddedBuffer a_v(kTotalSize); @@ -743,12 +734,12 @@ void testATenlogFloat() { void testATenlog10Float() { KernelScope kernel_scope; const int kTotalSize = 128; - Buffer a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); - Buffer b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); VarHandle index = VarHandle("index", kInt); - ExprHandle load_a = Load::make(a_buf, {index}, 1); - Stmt* store_b = Store::make(b_buf, {index}, log10(load_a), 1); + ExprHandle load_a = a_buf.load(index); + Stmt* store_b = b_buf.store({index}, log10(load_a)); Stmt* stmt = For::make(index, 0, kTotalSize, store_b); PaddedBuffer a_v(kTotalSize); @@ -770,12 +761,12 @@ void testATenlog10Float() { void testATenlog2Float() { KernelScope kernel_scope; const int kTotalSize = 128; - Buffer a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); - Buffer b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); VarHandle index = VarHandle("index", kInt); - ExprHandle load_a = Load::make(a_buf, {index}, 1); - Stmt* store_b = Store::make(b_buf, {index}, log2(load_a), 1); + ExprHandle load_a = a_buf.load(index); + Stmt* store_b = b_buf.store({index}, log2(load_a)); Stmt* stmt = For::make(index, 0, kTotalSize, store_b); PaddedBuffer a_v(kTotalSize); @@ -797,12 +788,12 @@ void testATenlog2Float() { void testATenexpFloat() { KernelScope kernel_scope; const int kTotalSize = 128; - Buffer a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); - Buffer b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); VarHandle index = VarHandle("index", kInt); - ExprHandle load_a = Load::make(a_buf, {index}, 1); - Stmt* store_b = Store::make(b_buf, {index}, exp(load_a), 1); + ExprHandle load_a = a_buf.load(index); + Stmt* store_b = b_buf.store({index}, exp(load_a)); Stmt* stmt = For::make(index, 0, kTotalSize, store_b); PaddedBuffer a_v(kTotalSize); @@ -824,12 +815,12 @@ void testATenexpFloat() { void testATenerfFloat() { KernelScope kernel_scope; const int kTotalSize = 128; - Buffer a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); - Buffer b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); VarHandle index = VarHandle("index", kInt); - ExprHandle load_a = Load::make(a_buf, {index}, 1); - Stmt* store_b = Store::make(b_buf, {index}, erf(load_a), 1); + ExprHandle load_a = a_buf.load(index); + Stmt* store_b = b_buf.store({index}, erf(load_a)); Stmt* stmt = For::make(index, 0, kTotalSize, store_b); PaddedBuffer a_v(kTotalSize); @@ -851,12 +842,12 @@ void testATenerfFloat() { void testATencosFloat() { KernelScope 
kernel_scope; const int kTotalSize = 128; - Buffer a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); - Buffer b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); VarHandle index = VarHandle("index", kInt); - ExprHandle load_a = Load::make(a_buf, {index}, 1); - Stmt* store_b = Store::make(b_buf, {index}, cos(load_a), 1); + ExprHandle load_a = a_buf.load(index); + Stmt* store_b = b_buf.store({index}, cos(load_a)); Stmt* stmt = For::make(index, 0, kTotalSize, store_b); PaddedBuffer a_v(kTotalSize); @@ -878,27 +869,22 @@ void testATencosFloat() { void testATeneqInt() { KernelScope kernel_scope; constexpr int N = 128; - Buffer a(BufHandle("A", {N}, kInt)); - Buffer b(BufHandle("B", {N}, kInt)); - Buffer c(BufHandle("C", {N}, kInt)); + Placeholder a(BufHandle("A", {N}, kInt)); + Placeholder b(BufHandle("B", {N}, kInt)); + Placeholder c(BufHandle("C", {N}, kInt)); std::vector a_buffer(N, 1); std::vector b_buffer(N, 1); std::vector c_buffer(N, 0); - auto mask = IntImm::make(1); VarHandle i("i", kInt); auto memcpy_expr = For::make( i, 0, N, - Store::make( - c, + c.store( {i}, CompareSelect::make( - Load::make(a, {i}, mask), - Load::make(b, {i}, mask), - CompareSelectOperation::kEQ), - mask)); + a.load(i), b.load(i), CompareSelectOperation::kEQ))); SimpleIREvaluator ir_eval(memcpy_expr, a, b, c); ir_eval(a_buffer, b_buffer, c_buffer); @@ -909,27 +895,22 @@ void testATeneqInt() { void testATengeInt() { KernelScope kernel_scope; constexpr int N = 128; - Buffer a(BufHandle("A", {N}, kInt)); - Buffer b(BufHandle("B", {N}, kInt)); - Buffer c(BufHandle("C", {N}, kInt)); + Placeholder a(BufHandle("A", {N}, kInt)); + Placeholder b(BufHandle("B", {N}, kInt)); + Placeholder c(BufHandle("C", {N}, kInt)); std::vector a_buffer(N, 5); std::vector b_buffer(N, 5); std::vector c_buffer(N, 0); - auto mask = IntImm::make(1); VarHandle i("i", kInt); auto memcpy_expr = For::make( i, 0, N, - Store::make( - c, + c.store( {i}, CompareSelect::make( - Load::make(a, {i}, mask), - Load::make(b, {i}, mask), - CompareSelectOperation::kGE), - mask)); + a.load(i), b.load(i), CompareSelectOperation::kGE))); SimpleIREvaluator ir_eval(memcpy_expr, a, b, c); ir_eval(a_buffer, b_buffer, c_buffer); @@ -940,27 +921,22 @@ void testATengeInt() { void testATengtInt() { KernelScope kernel_scope; constexpr int N = 128; - Buffer a(BufHandle("A", {N}, kInt)); - Buffer b(BufHandle("B", {N}, kInt)); - Buffer c(BufHandle("C", {N}, kInt)); + Placeholder a(BufHandle("A", {N}, kInt)); + Placeholder b(BufHandle("B", {N}, kInt)); + Placeholder c(BufHandle("C", {N}, kInt)); std::vector a_buffer(N, 6); std::vector b_buffer(N, 3); std::vector c_buffer(N, 0); - auto mask = IntImm::make(1); VarHandle i("i", kInt); auto memcpy_expr = For::make( i, 0, N, - Store::make( - c, + c.store( {i}, CompareSelect::make( - Load::make(a, {i}, mask), - Load::make(b, {i}, mask), - CompareSelectOperation::kGT), - mask)); + a.load(i), b.load(i), CompareSelectOperation::kGT))); SimpleIREvaluator ir_eval(memcpy_expr, a, b, c); ir_eval(a_buffer, b_buffer, c_buffer); @@ -971,27 +947,22 @@ void testATengtInt() { void testATenleInt() { KernelScope kernel_scope; constexpr int N = 128; - Buffer a(BufHandle("A", {N}, kInt)); - Buffer b(BufHandle("B", {N}, kInt)); - Buffer c(BufHandle("C", {N}, kInt)); + Placeholder a(BufHandle("A", {N}, kInt)); + Placeholder b(BufHandle("B", {N}, kInt)); + Placeholder c(BufHandle("C", {N}, kInt)); 
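Illustrative sketch (not part of the patch): the hunks in this file swap the old Buffer plus Load::make/Store::make calls, which carried an explicit IntImm mask, for Placeholder::load/store. A minimal standalone version of the comparison kernels above, built only from calls that appear in this diff (the helper name, sizes, and the tensor.h include are assumptions), could look like:

#include <vector>
#include "torch/csrc/jit/tensorexpr/eval.h"
#include "torch/csrc/jit/tensorexpr/ir.h"
#include "torch/csrc/jit/tensorexpr/tensor.h" // assumed home of Placeholder

using namespace torch::jit::tensorexpr;

// c[i] = (a[i] == b[i]) ? 1 : 0, written against the new Placeholder API:
// no explicit mask argument, loads and stores go through the Placeholder.
void examplePlaceholderCompareSelect() {
  KernelScope kernel_scope;
  constexpr int N = 128;
  Placeholder a(BufHandle("A", {N}, kInt));
  Placeholder b(BufHandle("B", {N}, kInt));
  Placeholder c(BufHandle("C", {N}, kInt));

  VarHandle i("i", kInt);
  Stmt* body = c.store(
      {i},
      CompareSelect::make(a.load(i), b.load(i), CompareSelectOperation::kEQ));
  Stmt* loop = For::make(i, 0, N, body);

  std::vector<int> a_buffer(N, 5), b_buffer(N, 5), c_buffer(N, 0);
  SimpleIREvaluator ir_eval(loop, a, b, c);
  ir_eval(a_buffer, b_buffer, c_buffer);
  // All inputs compare equal, so c_buffer is expected to hold 1 in every slot.
}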
std::vector a_buffer(N, 5); std::vector b_buffer(N, 5); std::vector c_buffer(N, 0); - auto mask = IntImm::make(1); VarHandle i("i", kInt); auto memcpy_expr = For::make( i, 0, N, - Store::make( - c, + c.store( {i}, CompareSelect::make( - Load::make(a, {i}, mask), - Load::make(b, {i}, mask), - CompareSelectOperation::kLE), - mask)); + a.load(i), b.load(i), CompareSelectOperation::kLE))); SimpleIREvaluator ir_eval(memcpy_expr, a, b, c); ir_eval(a_buffer, b_buffer, c_buffer); @@ -1002,27 +973,22 @@ void testATenleInt() { void testATenltInt() { KernelScope kernel_scope; constexpr int N = 128; - Buffer a(BufHandle("A", {N}, kInt)); - Buffer b(BufHandle("B", {N}, kInt)); - Buffer c(BufHandle("C", {N}, kInt)); + Placeholder a(BufHandle("A", {N}, kInt)); + Placeholder b(BufHandle("B", {N}, kInt)); + Placeholder c(BufHandle("C", {N}, kInt)); std::vector a_buffer(N, 5); std::vector b_buffer(N, 5); std::vector c_buffer(N, 1); - auto mask = IntImm::make(1); VarHandle i("i", kInt); auto memcpy_expr = For::make( i, 0, N, - Store::make( - c, + c.store( {i}, CompareSelect::make( - Load::make(a, {i}, mask), - Load::make(b, {i}, mask), - CompareSelectOperation::kLT), - mask)); + a.load(i), b.load(i), CompareSelectOperation::kLT))); SimpleIREvaluator ir_eval(memcpy_expr, a, b, c); ir_eval(a_buffer, b_buffer, c_buffer); diff --git a/test/cpp/tensorexpr/test_boundsinference.cpp b/test/cpp/tensorexpr/test_boundsinference.cpp index 98d3d4127da8..11c1c34f24a1 100644 --- a/test/cpp/tensorexpr/test_boundsinference.cpp +++ b/test/cpp/tensorexpr/test_boundsinference.cpp @@ -7,9 +7,7 @@ #include #include #include -#include #include -#include #include #include #include @@ -49,9 +47,9 @@ void testBoundsInference_1() { // {{b, kStore, 0, 99}, {a, kLoad, 0, 99}} KernelScope kernel_scope; ExprHandle n(100); - Buffer a(BufHandle("a", {n}, kFloat)); + Placeholder a(BufHandle("a", {n}, kFloat)); Tensor* b = - Compute("b", {{n, "i"}}, [&](const VarHandle& i) { return a(i); }); + Compute("b", {{n, "i"}}, [&](const VarHandle& i) { return a.load(i); }); LoopNest l({b}); auto bounds_info = inferBounds(l.root_stmt()); @@ -74,9 +72,9 @@ void testBoundsInference_2() { // {{b, kStore, 0, n-1}, {a, kLoad, 0, n-1}} KernelScope kernel_scope; VarHandle n("n", kInt); - Buffer a(BufHandle("a", {n}, kFloat)); + Placeholder a(BufHandle("a", {n}, kFloat)); Tensor* b = - Compute("b", {{n, "i"}}, [&](const VarHandle& i) { return a(i); }); + Compute("b", {{n, "i"}}, [&](const VarHandle& i) { return a.load(i); }); LoopNest l({b}); auto bounds_info = inferBounds(l.root_stmt()); @@ -99,9 +97,10 @@ void testBoundsInference_3() { // {{b, kStore, 0, 99}, {a, kLoad, 0, 109}} KernelScope kernel_scope; ExprHandle n(100); - Buffer a(BufHandle("a", {n + 10}, kFloat)); - Tensor* b = Compute( - "b", {{n, "i"}}, [&](const VarHandle& i) { return a(i) * a(i + 10); }); + Placeholder a(BufHandle("a", {n + 10}, kFloat)); + Tensor* b = Compute("b", {{n, "i"}}, [&](const VarHandle& i) { + return a.load(i) * a.load(i + 10); + }); LoopNest l({b}); auto bounds_info = inferBounds(l.root_stmt()); @@ -128,14 +127,14 @@ void testBoundsInference_4() { KernelScope kernel_scope; ExprHandle W(320); ExprHandle H(200); - Buffer a(BufHandle("a", {H, W}, kFloat)); + Placeholder a(BufHandle("a", {H, W}, kFloat)); Tensor* b = Compute( "b", {{H, "y"}, {W, "x"}}, [&](const VarHandle& y, const VarHandle& x) { return x * y; }); Tensor* c = Compute( "c", {{H, "y"}, {W, "x"}}, [&](const VarHandle& y, const VarHandle& x) { - return a(y, x) * b->call(y, x); + return a.load(y, x) * b->call(y, 
x); }); LoopNest l({c}); std::vector loops = l.getLoopStmtsFor(c); @@ -207,9 +206,9 @@ void testBoundsInference_5() { // b[i_tail + (100/16)*16] = a[i_tail + (100/16)*16]; KernelScope kernel_scope; ExprHandle n(100); - Buffer a(BufHandle("a", {n}, kFloat)); + Placeholder a(BufHandle("a", {n}, kFloat)); Tensor* b = - Compute("b", {{n, "i"}}, [&](const VarHandle& i) { return a(i); }); + Compute("b", {{n, "i"}}, [&](const VarHandle& i) { return a.load(i); }); LoopNest l({b}); For* outer; @@ -260,14 +259,14 @@ void testBoundsInference_6() { ExprHandle H(200); ExprHandle CW(32); ExprHandle CH(20); - Buffer a(BufHandle("a", {H, W}, kFloat)); + Placeholder a(BufHandle("a", {H, W}, kFloat)); Tensor* b = Compute( "b", {{H, "y"}, {W, "x"}}, [&](const VarHandle& y, const VarHandle& x) { return x * y; }); Tensor* c = Compute( "c", {{CH, "y"}, {CW, "x"}}, [&](const VarHandle& y, const VarHandle& x) { - return a(y + 100, x + 100) * b->call(y * 2, x * 5); + return a.load(y + 100, x + 100) * b->call(y * 2, x * 5); }); LoopNest l({c}); std::vector loops = l.getLoopStmtsFor(c); @@ -328,11 +327,11 @@ void testBoundsInference_6() { void testBoundsInferenceNonOverlapping() { KernelScope kernel_scope; ExprHandle H(3); - Buffer a(BufHandle("a", {10}, kFloat)); + Placeholder a(BufHandle("a", {10}, kFloat)); Tensor* b = - Compute("b", {{H, "x"}}, [&](const VarHandle& x) { return a(x); }); + Compute("b", {{H, "x"}}, [&](const VarHandle& x) { return a.load(x); }); Tensor* c = Compute( - "c", {{H, "x"}}, [&](const VarHandle& x) { return a(x + H + 1); }); + "c", {{H, "x"}}, [&](const VarHandle& x) { return a.load(x + H + 1); }); LoopNest l({b, c}); std::vector loops = NodeFinder::find(l.root_stmt()); @@ -389,11 +388,11 @@ void testBoundsInferenceNonOverlapping() { void testBoundsInferenceAdjacent() { KernelScope kernel_scope; ExprHandle H(6); - Buffer a(BufHandle("a", {20}, kFloat)); + Placeholder a(BufHandle("a", {20}, kFloat)); Tensor* b = - Compute("b", {{H, "x"}}, [&](const VarHandle& x) { return a(x); }); - Tensor* c = - Compute("c", {{H, "x"}}, [&](const VarHandle& x) { return a(x + H); }); + Compute("b", {{H, "x"}}, [&](const VarHandle& x) { return a.load(x); }); + Tensor* c = Compute( + "c", {{H, "x"}}, [&](const VarHandle& x) { return a.load(x + H); }); LoopNest l({b, c}); std::vector loops = NodeFinder::find(l.root_stmt()); @@ -448,7 +447,7 @@ void testBoundsInferenceAdjacent() { void testMergeInferredBounds() { KernelScope kernel_scope; - Buffer a(BufHandle("a", {10}, kFloat)); + Placeholder a(BufHandle("a", {10}, kFloat)); // There are seven cases to consider in mergeTensorAccesses(A, B) // * A is lower than B and does not overlap. @@ -518,7 +517,7 @@ void testMergeInferredBounds() { void testMergeInferredLoadStoreDiff() { KernelScope kernel_scope; - Buffer a(BufHandle("a", {10}, kFloat)); + Placeholder a(BufHandle("a", {10}, kFloat)); // Loads and Stores do not merge: BoundsInfo info; @@ -549,7 +548,7 @@ void testMergeInferredLoadStoreDiff() { void testMergeInferred2DBounds() { KernelScope kernel_scope; - Buffer a(BufHandle("a", {10, 10}, kFloat)); + Placeholder a(BufHandle("a", {10, 10}, kFloat)); // Non overlapping in both dimensions: BoundsInfo info; @@ -607,7 +606,7 @@ void testMergeInferred2DBounds() { void testMergeAdjacentBounds() { KernelScope kernel_scope; - Buffer a(BufHandle("a", {10}, kFloat)); + Placeholder a(BufHandle("a", {10}, kFloat)); // Adjacent but not overlapping bounds can be merged. // e.g. 
{1-4} | {5-9} => {1-9} @@ -647,7 +646,7 @@ std::pair boundAsStringPair( void testMergeSymbolicBounds() { KernelScope kernel_scope; - Buffer a(BufHandle("a", {10}, kFloat)); + Placeholder a(BufHandle("a", {10}, kFloat)); VarHandle W("W", kInt); VarHandle X("X", kInt); VarHandle Y("Y", kInt); @@ -757,7 +756,7 @@ void testMergeSymbolicBounds() { void testMergeSymbolicAdjacent() { KernelScope kernel_scope; - Buffer a(BufHandle("a", {10}, kFloat)); + Placeholder a(BufHandle("a", {10}, kFloat)); VarHandle X("X", kInt); VarHandle Y("Y", kInt); diff --git a/test/cpp/tensorexpr/test_cuda.cpp b/test/cpp/tensorexpr/test_cuda.cpp index 2ad70e158ebf..6dba8c574c57 100644 --- a/test/cpp/tensorexpr/test_cuda.cpp +++ b/test/cpp/tensorexpr/test_cuda.cpp @@ -8,7 +8,6 @@ #include #include "test/cpp/tensorexpr/padded_buffer.h" -#include "torch/csrc/jit/tensorexpr/buffer.h" #include "torch/csrc/jit/tensorexpr/cuda_codegen.h" #include "torch/csrc/jit/tensorexpr/ir_simplifier.h" #include "torch/csrc/jit/tensorexpr/loopnest.h" @@ -31,8 +30,8 @@ void testCudaTestVectorAdd01_impl() { const int block_count = 16; const int block_size = 128; Dtype dtype = ToDtype(); - Buffer a_buf("a", dtype, {num_iter, block_count, block_size}); - Buffer b_buf("b", dtype, {num_iter, block_count, block_size}); + Placeholder a_buf("a", dtype, {num_iter, block_count, block_size}); + Placeholder b_buf("b", dtype, {num_iter, block_count, block_size}); Tensor* c = Compute( "c", { @@ -41,7 +40,7 @@ void testCudaTestVectorAdd01_impl() { {block_size, "t_id"}, }, [&](const VarHandle& n, const VarHandle& b_id, const VarHandle& t_id) { - return a_buf(n, b_id, t_id) + b_buf(n, b_id, t_id); + return a_buf.load(n, b_id, t_id) + b_buf.load(n, b_id, t_id); }); LoopNest l({c}); std::vector loops = l.getLoopStmtsFor(c); @@ -97,7 +96,7 @@ void testCudaSigmoid() { const int block_count = 16; const int block_size = 128; Dtype dtype = ToDtype(); - Buffer a_buf("a", dtype, {num_iter, block_count, block_size}); + Placeholder a_buf("a", dtype, {num_iter, block_count, block_size}); Tensor* c = Compute( "c", { @@ -106,7 +105,7 @@ void testCudaSigmoid() { {block_size, "t_id"}, }, [&](const VarHandle& n, const VarHandle& b_id, const VarHandle& t_id) { - return sigmoid(sigmoid(a_buf(n, b_id, t_id))); + return sigmoid(sigmoid(a_buf.load(n, b_id, t_id))); }); LoopNest l({c}); std::vector loops = l.getLoopStmtsFor(c); @@ -162,14 +161,14 @@ void testCudaTestVectorAdd01() { static void testCudaTestVectorAdd02_impl(int N, int block_size) { KernelScope kernel_scope; - Buffer a_buf("a", kFloat, {N}); - Buffer b_buf("b", kFloat, {N}); + Placeholder a_buf("a", kFloat, {N}); + Placeholder b_buf("b", kFloat, {N}); Tensor* c = Compute( "c", { {N, "N"}, }, - [&](const VarHandle& n) { return a_buf(n) + b_buf(n); }); + [&](const VarHandle& n) { return a_buf.load(n) + b_buf.load(n); }); LoopNest l({c}); For* n_outer; For* n_inner; @@ -224,9 +223,9 @@ void testCudaTestVectorAdd02() { void testCudaHalfCast() { KernelScope ks; auto half = ToDtype(); - Buffer a("a", half, {4}); + Placeholder a("a", half, {4}); Tensor* b = Compute("b", {{4, "n"}}, [&](const VarHandle& i) { - return Cast::make(kFloat, a(i)); + return Cast::make(kFloat, a.load(i)); }); LoopNest l({b}); @@ -265,11 +264,11 @@ void testCudaDynamicShape2D() { auto testWithSize = [](int32_t M, int32_t N) { VarHandle m("m", kInt); VarHandle n("n", kInt); - Buffer a(BufHandle("a", {m, n}, kFloat)); - Buffer b(BufHandle("b", {m, n}, kFloat)); + Placeholder a(BufHandle("a", {m, n}, kFloat)); + Placeholder b(BufHandle("b", {m, n}, 
kFloat)); Tensor* c = Compute( "c", {{m, "m"}, {n, "n"}}, [&](const VarHandle& i, const VarHandle& j) { - return a(i, j) + b(i, j); + return a.load(i, j) + b.load(i, j); }); LoopNest l({c}); l.prepareForCodegen(); @@ -386,9 +385,9 @@ void testCudaDynamicShapeSplit() { KernelScope ks; constexpr int N = 4096; VarHandle n("n", kInt); - Buffer a(BufHandle("a", {n}, kFloat)); - Tensor* b = - Compute("b", {{n, "n"}}, [&](const VarHandle& i) { return a(i) * 2.0f; }); + Placeholder a(BufHandle("a", {n}, kFloat)); + Tensor* b = Compute( + "b", {{n, "n"}}, [&](const VarHandle& i) { return a.load(i) * 2.0f; }); LoopNest l({b}); For* outer; For* inner; @@ -436,8 +435,8 @@ void testCudaDynamicShapeSplit() { void testCudaOneBlockOneThreadGlobalReduce1() { const static int N = 1024; KernelScope kernel_scope; - Buffer data_buf("data", kFloat, {N}); - Buffer output_buf("output", kFloat, {1}); + Placeholder data_buf("data", kFloat, {N}); + Placeholder output_buf("output", kFloat, {1}); // The test adds the following code for trivial reduction: // for (int bidx = 0; bidx < 1; bidx++) { // blockIdx.x @@ -449,12 +448,12 @@ void testCudaOneBlockOneThreadGlobalReduce1() { // } // } - Store* init_store = Store::make(output_buf, {0}, 0.f, 1); + Store* init_store = output_buf.store({0}, 0.f); VarHandle i1("i1", kInt); - ExprHandle load_data = Load::make(data_buf, {i1}, 1); - ExprHandle load_output = Load::make(output_buf, {0}, 1); + ExprHandle load_data = Load::make(BufHandle(data_buf.data()), {i1}, 1); + ExprHandle load_output = Load::make(BufHandle(output_buf.data()), {0}, 1); ExprHandle add_value = load_output + load_data; - Store* store_output = Store::make(output_buf, {0}, add_value, 1); + Store* store_output = output_buf.store({0}, add_value); For* for_output = For::make(i1, 0, N, store_output); Stmt* reduce_block = Block::make({init_store, for_output}); VarHandle thread_idx("tidx", kInt); @@ -515,10 +514,10 @@ void testCudaOneBlockMultiThreadGlobalReduce1() { // b[0] = b[0] + a[t] // implied atomic // clang-format on - Buffer a_buf("a", kFloat, {N}); - Buffer b_buf("b", kFloat, {1}); + Placeholder a_buf("a", kFloat, {N}); + Placeholder b_buf("b", kFloat, {1}); - Store* init_store = Store::make(b_buf, {0}, 0.f, 1); + Store* init_store = b_buf.store({0}, 0.f); VarHandle t("t", kInt); VarHandle b("b", kInt); @@ -534,10 +533,10 @@ void testCudaOneBlockMultiThreadGlobalReduce1() { // for t in 0..1024: // thread-idx // b[0] = b[0] + a[t] // implied atomic - ExprHandle load_a = Load::make(a_buf, {t}, 1); - ExprHandle load_b = Load::make(b_buf, {0}, 1); + ExprHandle load_a = Load::make(BufHandle(a_buf.data()), {t}, 1); + ExprHandle load_b = Load::make(BufHandle(b_buf.data()), {0}, 1); ExprHandle add_value = load_b + load_a; - Store* store_b = Store::make(b_buf, {0}, add_value, 1); + Store* store_b = b_buf.store({0}, add_value); For* for_b = For::make(t, 0, N, store_b, thread_idx_options); Stmt* reduce_block = Block::make({for_init, for_b}); @@ -597,8 +596,8 @@ void testCudaNoThreadIdxWrite_1() { // covered by its own thread-idx const static int N = 1024; - Buffer a_buf("a", kFloat, {2}); - Buffer b_buf("b", kFloat, {N}); + Placeholder a_buf("a", kFloat, {2}); + Placeholder b_buf("b", kFloat, {N}); VarHandle k("k", kInt); VarHandle l("l", kInt); @@ -608,15 +607,15 @@ void testCudaNoThreadIdxWrite_1() { // a[0] = 0 // for n in 0..2: // a[0] = a[0] + n - Store* store_a0_0 = Store::make(a_buf, {0}, 0.f, 1); - ExprHandle load_a0 = Load::make(a_buf, {0}, 1); + Store* store_a0_0 = a_buf.store({0}, 0.f); + ExprHandle 
load_a0 = Load::make(BufHandle(a_buf.data()), {0}, 1); ExprHandle v1 = load_a0 + n; - Store* store_a0_v1 = Store::make(a_buf, {0}, v1, 1); + Store* store_a0_v1 = a_buf.store({0}, v1); For* loop_a_0 = For::make(n, 0, 2, store_a0_v1); // for m in 0..1024: // thread-idx // b[m] = m - Store* store_bm_m = Store::make(b_buf, {m}, m + 0.f, 1); + Store* store_bm_m = b_buf.store({m}, m + 0.f); LoopOptions thread_idx_options; thread_idx_options.set_gpu_thread_index(0); For* loop_b_1 = For::make(m, 0, N, store_bm_m, thread_idx_options); @@ -624,10 +623,10 @@ void testCudaNoThreadIdxWrite_1() { // a[1] = 1 // for l in 0..2: // a[1] = a[1] + l - Store* store_a1_1 = Store::make(a_buf, {1}, 1.f, 1); - ExprHandle load_a1 = Load::make(a_buf, {1}, 1); + Store* store_a1_1 = a_buf.store({1}, 1.f); + ExprHandle load_a1 = a_buf.load(1); ExprHandle v2 = load_a1 + l; - Store* store_a1_v2 = Store::make(a_buf, {1}, v2, 1); + Store* store_a1_v2 = a_buf.store({1}, v2); For* loop_a_1 = For::make(l, 0, 2, store_a1_v2); Stmt* reduce_block = @@ -699,8 +698,8 @@ void testCudaSharedMemReduce_1() { LoopOptions block_idx_opt; block_idx_opt.set_gpu_block_index(0); - Buffer a("a", kFloat, {1, M, N}); - Buffer b("b", kFloat, {1}); + Placeholder a("a", kFloat, {1, M, N}); + Placeholder b("b", kFloat, {1}); VarHandle k("k", kInt); VarHandle m("m", kInt); VarHandle n("n", kInt); @@ -729,7 +728,8 @@ void testCudaSharedMemReduce_1() { // for n in 0..64: // thread_idx // c(n) = c(n) + a(k, m, n) ExprHandle load_cn = Load::make(kFloat, c, {n}, 1); - ExprHandle a_kmn = Load::make(a, {k * (M * N) + m * N + n}, 1); + ExprHandle a_kmn = + Load::make(BufHandle(a.data()), {k * (M * N) + m * N + n}, 1); ExprHandle v_add = load_cn + a_kmn; Store* store_cn_v = Store::make(c, {n}, v_add); For* loop_n2 = For::make(n, 0, N, store_cn_v, thread_idx_opt); @@ -741,12 +741,12 @@ void testCudaSharedMemReduce_1() { // b(k) = 0 // for n in 0..64: // thread_idx // b(k) = b(k) + c(n) - Store* store_bk_0 = Store::make(b, {k}, 0.f, 1); + Store* store_bk_0 = b.store({k}, 0.f); block.push_back(store_bk_0); - ExprHandle load_bk = Load::make(b, {k}, 1); + ExprHandle load_bk = b.load(k); ExprHandle load_cn = Load::make(kFloat, c, {n}, 1); ExprHandle v_add = load_bk + load_cn; - Store* store_bk = Store::make(b, {k}, v_add, 1); + Store* store_bk = b.store({k}, v_add); For* loop_n3 = For::make(n, 0, N, store_bk, thread_idx_opt); block.push_back(loop_n3); } @@ -835,8 +835,8 @@ void testCudaLocalMemReduce_1() { LoopOptions block_idx_opt; block_idx_opt.set_gpu_block_index(0); - Buffer a("a", kFloat, {1, M, N}); - Buffer b("b", kFloat, {1}); + Placeholder a("a", kFloat, {1, M, N}); + Placeholder b("b", kFloat, {1}); VarHandle k("k", kInt); VarHandle m("m", kInt); VarHandle n("n", kInt); @@ -848,7 +848,7 @@ void testCudaLocalMemReduce_1() { std::vector block_k; { // b(k) = 0 - Store* store_bk_0 = Store::make(b, {k}, 0.f, 1); + Store* store_bk_0 = b.store({k}, 0.f); block_k.push_back(store_bk_0); } std::vector block_n; @@ -866,7 +866,7 @@ void testCudaLocalMemReduce_1() { // for m in 0..128: // c(0) = c(0) + a(k, m, n) ExprHandle load_c0 = Load::make(kFloat, c, {0}, 1); - ExprHandle a_kmn = Load::make(a, {k * (M * N) + m * N + n}, 1); + ExprHandle a_kmn = a.load(k * (M * N) + m * N + n); ExprHandle v_add = load_c0 + a_kmn; Store* store_c0_v = Store::make(c, {0}, v_add); For* loop_m = For::make(m, 0, M, store_c0_v); @@ -874,10 +874,10 @@ void testCudaLocalMemReduce_1() { } { // b(k) = b(k) + c(0) - ExprHandle load_bk = Load::make(b, {k}, 1); + ExprHandle load_bk = 
b.load(k); ExprHandle load_c0 = Load::make(kFloat, c, {0}, 1); ExprHandle v_add = load_bk + load_c0; - Store* store_bk = Store::make(b, {k}, v_add, 1); + Store* store_bk = b.store({k}, v_add); block_n.push_back(store_bk); } { @@ -930,9 +930,9 @@ void testCudaLocalMemReduce_1() { void testCudaHalfSupport() { KernelScope ks; auto half = ToDtype(); - Buffer a("a", half, {4}); + Placeholder a("a", half, {4}); Tensor* b = Compute("b", {{4, "n"}}, [&](const VarHandle& i) { - return Cast::make(half, ExprHandle(2.0f) * a(i)); + return Cast::make(half, ExprHandle(2.0f) * a.load(i)); }); Tensor* c = Compute("c", {{4, "n"}}, [&](const VarHandle& i) { @@ -985,11 +985,60 @@ void testCudaHalfSupport() { cudaFree(dDev); } +void testCudaHalfPropagation() { + KernelScope kernel_scope; + auto half = ToDtype(); + Placeholder a("a", half, {4}); + Tensor* relu = Compute("relu", {{4, "n"}}, [&](const VarHandle& i) { + return Max::make(a.load(i), ExprHandle(new HalfImm(0)), true); + }); + + LoopNest l({relu}); + l.prepareForCodegen(); + Stmt* s = l.root_stmt(); + CudaCodeGen cg(s, {a, relu}); + + std::ostringstream oss; + oss << *cg.stmt(); + + // Check the types used by the Max are Float. + const std::string& verification_pattern = + R"IR( +# CHECK: for ( +# CHECK: float v = float(a[n]); +# CHECK: relu[n] = half(Max(v, 0.f +# CHECK: })IR"; + + torch::jit::testing::FileCheck().run(verification_pattern, oss.str()); + + std::vector aData(4, 2.0f); + std::vector reluData(4, 0.0f); + at::Half* aDev = nullptr; + at::Half* reluDev = nullptr; + auto aSize = aData.size() * sizeof(aData[0]); + auto reluSize = reluData.size() * sizeof(reluData[0]); + + cudaMalloc(&aDev, aSize); + cudaMalloc(&reluDev, reluSize); + cudaMemcpy(aDev, aData.data(), aSize, cudaMemcpyHostToDevice); + cudaMemcpy(reluDev, reluData.data(), reluSize, cudaMemcpyHostToDevice); + cudaDeviceSynchronize(); + + cg.call({aDev, reluDev}); + cudaMemcpy(reluData.data(), reluDev, reluSize, cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); + + assertAllEqual(aData, reluData); + + cudaFree(aDev); + cudaFree(reluDev); +} + void testCudaPrioritizeDependents() { KernelScope kernel_scope; - Buffer a("a", kFloat, {10}); - Buffer b("b", kFloat, {12}); - Buffer c("c", kFloat, {12}); + Placeholder a("a", kFloat, {10}); + Placeholder b("b", kFloat, {12}); + Placeholder c("c", kFloat, {12}); LoopOptions block_idx_opt; block_idx_opt.set_gpu_block_index(0); @@ -1002,13 +1051,13 @@ void testCudaPrioritizeDependents() { * c[i] = (i < 10 ? 
a[i] + b[i] : b[i]); * } */ - ExprHandle load_a = Load::make(a, {i}, 1); - ExprHandle load_b = Load::make(b, {i}, 1); + ExprHandle load_a = Load::make(BufHandle(a.data()), {i}, 1); + ExprHandle load_b = Load::make(BufHandle(b.data()), {i}, 1); ExprHandle cmp = CompareSelect::make(i, 10, CompareSelectOperation::kLT); ExprHandle ite = IfThenElse::make(cmp, Add::make(load_a, load_b), load_b); - For* loop = For::make( - i, 0, 12, Block::make({Store::make(c, {i}, ite, 1)}), block_idx_opt); + For* loop = + For::make(i, 0, 12, Block::make({c.store({i}, ite)}), block_idx_opt); CudaCodeGen cuda_cg(loop, a, b, c); @@ -1063,12 +1112,13 @@ void testCudaMaskBlockDim() { KernelScope kernel_scope; int A_SIZE = 100; int B_SIZE = 50; - Buffer a_buf("a", kFloat, {A_SIZE}); - Buffer b_buf("b", kFloat, {B_SIZE}); - Tensor* c = Compute( - "c", {{A_SIZE, "i"}}, [&](const VarHandle& i) { return a_buf(i) + 10; }); + Placeholder a_buf("a", kFloat, {A_SIZE}); + Placeholder b_buf("b", kFloat, {B_SIZE}); + Tensor* c = Compute("c", {{A_SIZE, "i"}}, [&](const VarHandle& i) { + return a_buf.load(i) + 10; + }); Tensor* d = Compute("d", {{B_SIZE, "i"}}, [&](const VarHandle& i) { - return a_buf(i) + b_buf(i); + return a_buf.load(i) + b_buf.load(i); }); LoopNest l({c, d}); @@ -1155,12 +1205,13 @@ void testCudaMaskThreadDim() { KernelScope kernel_scope; int A_SIZE = 50; int B_SIZE = 100; - Buffer a_buf("a", kFloat, {A_SIZE}); - Buffer b_buf("b", kFloat, {B_SIZE}); - Tensor* c = Compute( - "c", {{A_SIZE, "i"}}, [&](const VarHandle& i) { return a_buf(i) + 10; }); + Placeholder a_buf("a", kFloat, {A_SIZE}); + Placeholder b_buf("b", kFloat, {B_SIZE}); + Tensor* c = Compute("c", {{A_SIZE, "i"}}, [&](const VarHandle& i) { + return a_buf.load(i) + 10; + }); Tensor* d = Compute("d", {{B_SIZE, "i"}}, [&](const VarHandle& i) { - return a_buf(i / 2) + b_buf(i); + return a_buf.load(i / 2) + b_buf.load(i); }); LoopNest l({c, d}); @@ -1249,12 +1300,13 @@ void testCudaMaskMultiBlockDim() { KernelScope kernel_scope; int A_SIZE = 100; int B_SIZE = 50; - Buffer a_buf("a", kFloat, {A_SIZE}); - Buffer b_buf("b", kFloat, {B_SIZE}); - Tensor* c = Compute( - "c", {{A_SIZE, "i"}}, [&](const VarHandle& i) { return a_buf(i) + 10; }); + Placeholder a_buf("a", kFloat, {A_SIZE}); + Placeholder b_buf("b", kFloat, {B_SIZE}); + Tensor* c = Compute("c", {{A_SIZE, "i"}}, [&](const VarHandle& i) { + return a_buf.load(i) + 10; + }); Tensor* d = Compute("d", {{B_SIZE, "i"}}, [&](const VarHandle& i) { - return a_buf(i) + b_buf(i); + return a_buf.load(i) + b_buf.load(i); }); LoopNest l({c, d}); @@ -1342,12 +1394,13 @@ void testCudaMaskBlockAndThreadDim() { KernelScope kernel_scope; int A_SIZE = 100; int B_SIZE = 50; - Buffer a_buf("a", kFloat, {A_SIZE}); - Buffer b_buf("b", kFloat, {B_SIZE}); - Tensor* c = Compute( - "c", {{A_SIZE, "i"}}, [&](const VarHandle& i) { return a_buf(i) + 10; }); + Placeholder a_buf("a", kFloat, {A_SIZE}); + Placeholder b_buf("b", kFloat, {B_SIZE}); + Tensor* c = Compute("c", {{A_SIZE, "i"}}, [&](const VarHandle& i) { + return a_buf.load(i) + 10; + }); Tensor* d = Compute("d", {{B_SIZE, "i"}}, [&](const VarHandle& i) { - return a_buf(i) + b_buf(i); + return a_buf.load(i) + b_buf.load(i); }); LoopNest l({c, d}); @@ -1434,19 +1487,19 @@ void testCudaMaskMultiDim() { int OUTER_SIZE = 10; int A_SIZE = 100; int B_SIZE = 50; - Buffer a_buf("a", kFloat, {OUTER_SIZE, A_SIZE}); - Buffer b_buf("b", kFloat, {OUTER_SIZE, B_SIZE}); + Placeholder a_buf("a", kFloat, {OUTER_SIZE, A_SIZE}); + Placeholder b_buf("b", kFloat, {OUTER_SIZE, B_SIZE}); Tensor* 
c = Compute( "C", {{OUTER_SIZE, "i"}, {A_SIZE, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return ExprHandle(2) * a_buf(i, j); + return ExprHandle(2) * a_buf.load(i, j); }); Tensor* d = Compute( "D", {{OUTER_SIZE, "i"}, {B_SIZE, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return c->call(i, j * 2) + b_buf(i, j); + return c->call(i, j * 2) + b_buf.load(i, j); }); LoopNest l({c, d}); @@ -1564,19 +1617,19 @@ void testCudaMaskMultiDimSymbolic() { VarHandle OUTER_SIZE("OUTER_SIZE", kInt); VarHandle A_SIZE("A_SIZE", kInt); VarHandle B_SIZE("B_SIZE", kInt); - Buffer a_buf("a", kFloat, {OUTER_SIZE, A_SIZE}); - Buffer b_buf("b", kFloat, {OUTER_SIZE, B_SIZE}); + Placeholder a_buf("a", kFloat, {OUTER_SIZE, A_SIZE}); + Placeholder b_buf("b", kFloat, {OUTER_SIZE, B_SIZE}); Tensor* c = Compute( "C", {{OUTER_SIZE, "i"}, {A_SIZE, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return ExprHandle(2) * a_buf(i, j); + return ExprHandle(2) * a_buf.load(i, j); }); Tensor* d = Compute( "D", {{OUTER_SIZE, "i"}, {B_SIZE, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return c->call(i, j * 2) + b_buf(i, j); + return c->call(i, j * 2) + b_buf.load(i, j); }); LoopNest l({c, d}); @@ -1700,10 +1753,10 @@ void testCudaMaskCompoundInnerLoop() { int OUTER_SIZE = 10; int A_SIZE = 100; int B_SIZE = 50; - Buffer a_buf("a", kFloat, {OUTER_SIZE, A_SIZE}); - Buffer b_buf("b", kFloat, {OUTER_SIZE, B_SIZE}); - Buffer c_buf("c", kFloat, {OUTER_SIZE, A_SIZE}); - Buffer d_buf("d", kFloat, {OUTER_SIZE, B_SIZE}); + Placeholder a_buf("a", kFloat, {OUTER_SIZE, A_SIZE}); + Placeholder b_buf("b", kFloat, {OUTER_SIZE, B_SIZE}); + Placeholder c_buf("c", kFloat, {OUTER_SIZE, A_SIZE}); + Placeholder d_buf("d", kFloat, {OUTER_SIZE, B_SIZE}); // Can't build this using Compute and transforms yet. LoopOptions blockBound; @@ -1723,13 +1776,13 @@ void testCudaMaskCompoundInnerLoop() { j, 0, A_SIZE, - Store::make(c_buf, {i, j}, ExprHandle(2) * a_buf(i, j), 1), + c_buf.store({i, j}, ExprHandle(2) * a_buf.load(i, j)), threadBound), For::make( k, 0, B_SIZE, - Store::make(d_buf, {i, k}, c_buf(i, k * 2) + b_buf(i, k), 1), + d_buf.store({i, k}, c_buf.load(i, k * 2) + b_buf.load(i, k)), threadBound)}), blockBound); @@ -1839,10 +1892,10 @@ void testCudaMaskInnerLoopOneBlock() { int OUTER_SIZE = 10; int A_SIZE = 100; int B_SIZE = 50; - Buffer a_buf("a", kFloat, {OUTER_SIZE, A_SIZE}); - Buffer b_buf("b", kFloat, {OUTER_SIZE, B_SIZE}); - Buffer c_buf("c", kFloat, {OUTER_SIZE, A_SIZE}); - Buffer d_buf("d", kFloat, {OUTER_SIZE, B_SIZE}); + Placeholder a_buf("a", kFloat, {OUTER_SIZE, A_SIZE}); + Placeholder b_buf("b", kFloat, {OUTER_SIZE, B_SIZE}); + Placeholder c_buf("c", kFloat, {OUTER_SIZE, A_SIZE}); + Placeholder d_buf("d", kFloat, {OUTER_SIZE, B_SIZE}); // Can't build this using Compute and transforms yet. 
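Illustrative sketch (not part of the patch): the masking tests around this point hand-build loops with LoopOptions axis bindings because Compute and the loop transforms cannot express them yet. A pared-down single-block version using only calls visible in this diff might look like the following; it assumes a CUDA-enabled build, and the helper name, sizes, and include list are invented for the example.

#include <sstream>
#include "torch/csrc/jit/tensorexpr/cuda_codegen.h"
#include "torch/csrc/jit/tensorexpr/ir.h"
#include "torch/csrc/jit/tensorexpr/tensor.h" // assumed home of Placeholder

using namespace torch::jit::tensorexpr;

// One block of N threads computing b[t] = a[t] + 1, with the loops bound to
// blockIdx.x / threadIdx.x through LoopOptions, as the tests above do.
void exampleGpuBoundLoop() {
  KernelScope kernel_scope;
  const int N = 128;
  Placeholder a_buf("a", kFloat, {N});
  Placeholder b_buf("b", kFloat, {N});

  LoopOptions blockBound;
  blockBound.set_gpu_block_index(0);   // outer loop -> blockIdx.x
  LoopOptions threadBound;
  threadBound.set_gpu_thread_index(0); // inner loop -> threadIdx.x

  VarHandle blk("blk", kInt);
  VarHandle t("t", kInt);
  Stmt* inner =
      For::make(t, 0, N, b_buf.store({t}, a_buf.load(t) + 1.f), threadBound);
  Stmt* outer = For::make(blk, 0, 1, inner, blockBound);

  CudaCodeGen cuda_cg(outer, a_buf, b_buf);
  std::ostringstream oss;
  oss << *cuda_cg.stmt();  // inspect the generated kernel body
}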
LoopOptions blockBound; @@ -1862,13 +1915,13 @@ void testCudaMaskInnerLoopOneBlock() { j, 0, A_SIZE, - Store::make(c_buf, {i, j}, ExprHandle(2) * a_buf(i, j), 1), + c_buf.store({i, j}, ExprHandle(2) * a_buf.load(i, j)), threadBound), For::make( k, 0, B_SIZE, - Store::make(d_buf, {i, k}, c_buf(i, k * 2) + b_buf(i, k), 1), + d_buf.store({i, k}, c_buf.load(i, k * 2) + b_buf.load(i, k)), threadBound)})); stmt = FlattenIndexes(stmt); @@ -1978,19 +2031,19 @@ void testCudaMaskMultiDimMultiAxis() { int OUTER_SIZE = 10; int A_SIZE = 30; int B_SIZE = 15; - Buffer a_buf("a", kFloat, {OUTER_SIZE, A_SIZE}); - Buffer b_buf("b", kFloat, {OUTER_SIZE, B_SIZE}); + Placeholder a_buf("a", kFloat, {OUTER_SIZE, A_SIZE}); + Placeholder b_buf("b", kFloat, {OUTER_SIZE, B_SIZE}); Tensor* c = Compute( "C", {{OUTER_SIZE, "i"}, {A_SIZE, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return ExprHandle(2) * a_buf(i, j); + return ExprHandle(2) * a_buf.load(i, j); }); Tensor* d = Compute( "D", {{OUTER_SIZE, "i"}, {B_SIZE, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return c->call(i, j * 2) + b_buf(i, j); + return c->call(i, j * 2) + b_buf.load(i, j); }); LoopNest l({c, d}); @@ -2109,19 +2162,19 @@ void testCudaMaskMultiDimMultiLevel() { int OUTER_B_SIZE = 5; int A_SIZE = 30; int B_SIZE = 15; - Buffer a_buf("a", kFloat, {OUTER_A_SIZE, A_SIZE}); - Buffer b_buf("b", kFloat, {OUTER_B_SIZE, B_SIZE}); + Placeholder a_buf("a", kFloat, {OUTER_A_SIZE, A_SIZE}); + Placeholder b_buf("b", kFloat, {OUTER_B_SIZE, B_SIZE}); Tensor* c = Compute( "C", {{OUTER_A_SIZE, "i"}, {A_SIZE, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return ExprHandle(2) * a_buf(i, j); + return ExprHandle(2) * a_buf.load(i, j); }); Tensor* d = Compute( "D", {{OUTER_B_SIZE, "i"}, {B_SIZE, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - return c->call(i, j * 2) + b_buf(i, j); + return c->call(i, j * 2) + b_buf.load(i, j); }); LoopNest l({c, d}); diff --git a/test/cpp/tensorexpr/test_expr.cpp b/test/cpp/tensorexpr/test_expr.cpp index c1386a85764b..6fafb2813902 100644 --- a/test/cpp/tensorexpr/test_expr.cpp +++ b/test/cpp/tensorexpr/test_expr.cpp @@ -2,9 +2,7 @@ #include "test/cpp/tensorexpr/padded_buffer.h" #include "test/cpp/tensorexpr/test_utils.h" -#include "torch/csrc/jit/tensorexpr/buffer.h" #include "torch/csrc/jit/tensorexpr/eval.h" -#include "torch/csrc/jit/tensorexpr/function.h" #include "torch/csrc/jit/tensorexpr/ir.h" #include "torch/csrc/jit/tensorexpr/ir_printer.h" #include "torch/csrc/jit/tensorexpr/loopnest.h" @@ -64,13 +62,13 @@ void testExprLetTest02() { void testExprLetStmtTest01() { KernelScope kernel_scope; - Buffer a_buf("a", kFloat, {1}); - Buffer b_buf("b", kFloat, {1}); + Placeholder a_buf("a", kFloat, {1}); + Placeholder b_buf("b", kFloat, {1}); - ExprHandle load_a = Load::make(a_buf, {0}, 1); + ExprHandle load_a = a_buf.load(0); VarHandle var = VarHandle("v", kFloat); Stmt* let_store = Let::make(var, load_a); - Stmt* store_b = Store::make(b_buf, {0}, var, 1); + Stmt* store_b = b_buf.store({0}, var); Block* block = Block::make({let_store, store_b}); SimpleIREvaluator eval(block, a_buf, b_buf); @@ -164,15 +162,33 @@ void testExprDoubleTest() { ASSERT_EQ(eval.value(), 2 + (3 * 3 + 4)); } +void testExprDisallowBoolArithmetic() { + KernelScope kernel_scope; + VarHandle x("x", kBool); + VarHandle y("y", kBool); + std::string error{"arithmetic binary operations on Bool not supported"}; + ASSERT_THROWS_WITH((x + y), error); + ASSERT_THROWS_WITH((x - y), error); + ASSERT_THROWS_WITH((x * y), error); + 
ASSERT_THROWS_WITH((x / y), error); + ASSERT_THROWS_WITH((x & y), error); + ASSERT_THROWS_WITH((x | y), error); + ASSERT_THROWS_WITH((x ^ y), error); + ASSERT_THROWS_WITH((x << y), error); + ASSERT_THROWS_WITH((x >> y), error); + ASSERT_THROWS_WITH(Max::make(x, y, /*propagate_nans=*/true), error); + ASSERT_THROWS_WITH(Min::make(x, y, /*propagate_nans=*/true), error); +} + void testExprVectorAdd01() { KernelScope kernel_scope; const int kVectorSize = 8; const int kVectorCount = 128; const int kTotalSize = kVectorSize * kVectorCount; - Buffer a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); - Buffer b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); - Buffer c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kFloat)); /* Build the following: @@ -183,17 +199,14 @@ void testExprVectorAdd01() { } */ VarHandle index = VarHandle("index", kInt); - ExprHandle load_a = Load::make( - a_buf, + ExprHandle load_a = a_buf.loadWithMask( {Ramp::make(index * kVectorSize, 1, kVectorSize)}, Broadcast::make(1, kVectorSize)); - ExprHandle load_b = Load::make( - b_buf, + ExprHandle load_b = b_buf.loadWithMask( {Ramp::make(index * kVectorSize, 1, kVectorSize)}, Broadcast::make(1, kVectorSize)); ExprHandle value = load_a + load_b; - Stmt* store_c = Store::make( - c_buf, + Stmt* store_c = c_buf.storeWithMask( {Ramp::make(index * kVectorSize, 1, kVectorSize)}, value, Broadcast::make(1, kVectorSize)); @@ -220,28 +233,23 @@ void testExprVectorAdd01() { void testExprCompareSelectEQ() { KernelScope kernel_scope; constexpr int N = 1024; - Buffer a(BufHandle("A", {N}, kInt)); - Buffer b(BufHandle("B", {N}, kInt)); - Buffer c(BufHandle("C", {N}, kInt)); + Placeholder a(BufHandle("A", {N}, kInt)); + Placeholder b(BufHandle("B", {N}, kInt)); + Placeholder c(BufHandle("C", {N}, kInt)); std::vector a_buffer(N, 1); std::vector b_buffer(N, 1); std::vector c_buffer(N, 0); std::vector c_ref(N, 0); - auto mask = IntImm::make(1); VarHandle i("i", kInt); auto memcpy_expr = For::make( i, 0, N, - Store::make( - c, + c.store( {i}, CompareSelect::make( - Load::make(a, {i}, mask), - Load::make(b, {i}, mask), - CompareSelectOperation::kEQ), - mask)); + a.load(i), b.load(i), CompareSelectOperation::kEQ))); SimpleIREvaluator ir_eval(memcpy_expr, a, b, c); ir_eval(a_buffer, b_buffer, c_buffer); @@ -264,15 +272,14 @@ void testExprCompareSelectDtypes() { // result = ((int)lhs == (int)rhs) ? (float)retval1 : (float)retval2 KernelScope kernel_scope; constexpr int N = 1024; - Buffer a(BufHandle("A", {N}, kInt)); - Buffer b(BufHandle("B", {N}, kInt)); - Buffer c(BufHandle("C", {N}, kFloat)); + Placeholder a(BufHandle("A", {N}, kInt)); + Placeholder b(BufHandle("B", {N}, kInt)); + Placeholder c(BufHandle("C", {N}, kFloat)); std::vector a_buffer(N, 1); std::vector b_buffer(N, 1); std::vector c_buffer(N, 0.0f); std::vector c_ref(N, 3.14f); - auto mask = IntImm::make(1); VarHandle i("i", kInt); // C[i] = (A[i] == B[i]) ? 3.14f : 2.78f // A and B are int, C is float. 
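Illustrative sketch (not part of the patch): testExprVectorAdd01 above shows that explicitly masked, vectorized accesses now go through loadWithMask/storeWithMask, while plain load/store cover the unmasked case. A minimal vector add built from the same calls (the helper name and sizes are invented for the example) could be:

#include <vector>
#include "torch/csrc/jit/tensorexpr/eval.h"
#include "torch/csrc/jit/tensorexpr/ir.h"
#include "torch/csrc/jit/tensorexpr/tensor.h" // assumed home of Placeholder

using namespace torch::jit::tensorexpr;

// c = a + b in chunks of kVectorSize lanes, using Ramp indices and an
// all-ones Broadcast mask passed through the *WithMask entry points.
void examplePlaceholderVectorAdd() {
  KernelScope kernel_scope;
  const int kVectorSize = 8;
  const int kVectorCount = 16;
  const int kTotalSize = kVectorSize * kVectorCount;
  Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat));
  Placeholder b_buf(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat));
  Placeholder c_buf(BufHandle("C", {ExprHandle(kTotalSize)}, kFloat));

  VarHandle index("index", kInt);
  ExprHandle lanes = Ramp::make(index * kVectorSize, 1, kVectorSize);
  ExprHandle mask = Broadcast::make(1, kVectorSize);
  ExprHandle load_a = a_buf.loadWithMask({lanes}, mask);
  ExprHandle load_b = b_buf.loadWithMask({lanes}, mask);
  Stmt* store_c = c_buf.storeWithMask({lanes}, load_a + load_b, mask);
  Stmt* stmt = For::make(index, 0, kVectorCount, store_c);

  std::vector<float> a_v(kTotalSize, 1.f), b_v(kTotalSize, 2.f),
      c_v(kTotalSize, 0.f);
  SimpleIREvaluator ir_eval(stmt, a_buf, b_buf, c_buf);
  ir_eval(a_v, b_v, c_v);  // each element of c_v should come out as 3.f
}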
@@ -280,16 +287,14 @@ void testExprCompareSelectDtypes() { i, 0, N, - Store::make( - c, + c.store( {i}, CompareSelect::make( - Load::make(a, {i}, mask), - Load::make(b, {i}, mask), + a.load(i), + b.load(i), FloatImm::make(3.14f), FloatImm::make(2.78f), - CompareSelectOperation::kEQ), - mask)); + CompareSelectOperation::kEQ))); SimpleIREvaluator ir_eval(select_expr, a, b, c); ir_eval(a_buffer, b_buffer, c_buffer); @@ -306,16 +311,14 @@ void testExprCompareSelectDtypes() { void testExprIntrinsicsDtypes() { KernelScope kernel_scope; constexpr int N = 256; - Buffer a(BufHandle("A", {N}, kDouble)); - Buffer b(BufHandle("B", {N}, kDouble)); + Placeholder a(BufHandle("A", {N}, kDouble)); + Placeholder b(BufHandle("B", {N}, kDouble)); std::vector a_buffer(N, -10.0); std::vector b_buffer(N, 0.0); std::vector b_ref(N, 10.0); - auto mask = IntImm::make(1); VarHandle i("i", kInt); - auto fabs_expr = For::make( - i, 0, N, Store::make(b, {i}, fabs(Load::make(a, {i}, mask)), mask)); + auto fabs_expr = For::make(i, 0, N, b.store({i}, fabs(a.load(i)))); SimpleIREvaluator ir_eval(fabs_expr, a, b); ir_eval(a_buffer, b_buffer); @@ -463,11 +466,11 @@ void testExprDynamicShapeAdd() { KernelScope kernel_scope; auto testWithSize = [](int32_t size) { VarHandle n("n", kInt); - Buffer a(BufHandle("a", {n}, kFloat)); - Buffer b(BufHandle("b", {n}, kFloat)); - Buffer c(BufHandle("c", {n}, kFloat)); + Placeholder a(BufHandle("a", {n}, kFloat)); + Placeholder b(BufHandle("b", {n}, kFloat)); + Placeholder c(BufHandle("c", {n}, kFloat)); VarHandle i("i", kInt); - Stmt* s = For::make(i, 0, n, Store::make(c, {i}, a(i) + b(i), 1)); + Stmt* s = For::make(i, 0, n, c.store({i}, a.load(i) + b.load(i))); std::vector aData(size, 1.0f); std::vector bData(size, 2.0f); std::vector cData(size, 0.0f); @@ -483,12 +486,10 @@ void testCond01() { KernelScope kernel_scope; const int N = 16; PaddedBuffer a_v(N); - Buffer a_buf("a", kFloat, {N}); + Placeholder a_buf("a", kFloat, {N}); VarHandle index = VarHandle("index", kInt); - Stmt* assign_x2 = - Store::make(BufHandle(a_buf.data()), {index}, cast(index) * 2, 1); - Stmt* assign_x3 = - Store::make(BufHandle(a_buf.data()), {index}, cast(index) * 3, 1); + Stmt* assign_x2 = a_buf.store({index}, cast(index) * 2); + Stmt* assign_x3 = a_buf.store({index}, cast(index) * 3); ExprHandle even_cond = CompareSelect::make(Mod::make(index, 2), 0, kEQ); Stmt* assign = Cond::make(even_cond, assign_x2, assign_x3); Stmt* for_stmt = For::make(index, 0, N, assign); @@ -546,9 +547,9 @@ void testStmtClone() { KernelScope kernel_scope; const int N = 16; - Buffer a_buf("a", kInt, {N}); + Placeholder a_buf("a", kInt, {N}); VarHandle index = VarHandle("index", kInt); - Stmt* body = Store::make(BufHandle(a_buf.data()), {index}, 5, 1); + Stmt* body = a_buf.store({index}, 5); Stmt* loop = For::make(index, 0, N, body); Stmt* cloned_loop = Stmt::clone(loop); @@ -562,7 +563,7 @@ void testStmtClone() { // Let's add another assign to the body in the cloned loop and verify that the // original statement hasn't changed while the cloned one has. 
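// Sketch only (not from the patch), reusing the headers above: how the Cond01
// hunk reads with the fluent store API. The cast<float>(...) template argument
// is an assumption (the plain-text rendering of this diff drops angle-bracket
// contents); everything else mirrors the hunk.
void sketchCondStore() {
  KernelScope kernel_scope;
  const int N = 16;
  Placeholder a_buf("a", kFloat, {N});
  VarHandle index("index", kInt);
  Stmt* assign_x2 = a_buf.store({index}, cast<float>(index) * 2);
  Stmt* assign_x3 = a_buf.store({index}, cast<float>(index) * 3);
  ExprHandle even_cond = CompareSelect::make(Mod::make(index, 2), 0, kEQ);
  Stmt* for_stmt =
      For::make(index, 0, N, Cond::make(even_cond, assign_x2, assign_x3));
  std::vector<float> a_v(N, 0.0f);
  SimpleIREvaluator eval(for_stmt, a_buf);
  eval(a_v);  // even slots end up 2*index, odd slots 3*index
}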
- Stmt* body_addition = Store::make(BufHandle(a_buf.data()), {index}, 33, 1); + Stmt* body_addition = a_buf.store({index}, 33); Block* cloned_body = static_cast(static_cast(cloned_loop)->body()); cloned_body->append_stmt(body_addition); diff --git a/test/cpp/tensorexpr/test_kernel.cpp b/test/cpp/tensorexpr/test_kernel.cpp index ab916d370e82..d80710fa732b 100644 --- a/test/cpp/tensorexpr/test_kernel.cpp +++ b/test/cpp/tensorexpr/test_kernel.cpp @@ -2,7 +2,6 @@ #include #include #include -#include #include #include #include diff --git a/test/cpp/tensorexpr/test_llvm.cpp b/test/cpp/tensorexpr/test_llvm.cpp index ee8540eb58c4..7f4e1a0afc24 100644 --- a/test/cpp/tensorexpr/test_llvm.cpp +++ b/test/cpp/tensorexpr/test_llvm.cpp @@ -3,9 +3,7 @@ #include "test/cpp/tensorexpr/padded_buffer.h" #include "test/cpp/tensorexpr/test_utils.h" -#include "torch/csrc/jit/tensorexpr/buffer.h" #include "torch/csrc/jit/tensorexpr/eval.h" -#include "torch/csrc/jit/tensorexpr/function.h" #include "torch/csrc/jit/tensorexpr/ir.h" #include "torch/csrc/jit/tensorexpr/ir_printer.h" #include "torch/csrc/jit/tensorexpr/ir_simplifier.h" @@ -163,17 +161,13 @@ void testLLVMByteToDoubleCastTest() { void testLLVMLetTest01() { KernelScope kernel_scope; - Buffer a(BufHandle("A", {1}, kFloat)); + Placeholder a(BufHandle("A", {1}, kFloat)); std::vector v = {1, 0}; std::vector args({v.data()}); VarHandle x("x", kFloat); auto block = Block::make({ Let::make(x, 3.f), - Store::make( - a, - {IntImm::make(0)}, - ExprHandle(2.f) + (x * ExprHandle(3.f) + ExprHandle(4.f)), - IntImm::make(1)), + a.store({0}, ExprHandle(2.f) + (x * ExprHandle(3.f) + ExprHandle(4.f))), }); LLVMCodeGen cg(block, {a}); @@ -184,20 +178,17 @@ void testLLVMLetTest01() { void testLLVMLetTest02() { KernelScope kernel_scope; - Buffer a(BufHandle("A", {1}, kFloat)); + Placeholder a(BufHandle("A", {1}, kFloat)); std::vector v = {1, 0}; std::vector args({v.data()}); VarHandle x("x", kFloat); VarHandle y("y", kFloat); - auto block = Block::make({ - Let::make(x, 3.f), - Let::make(y, 6.f), - Store::make( - a, - {IntImm::make(0)}, - ExprHandle(2.f) + (x * ExprHandle(3.f) + y * ExprHandle(4.f)), - IntImm::make(1)), - }); + auto block = Block::make( + {Let::make(x, 3.f), + Let::make(y, 6.f), + a.store( + {IntImm::make(0)}, + ExprHandle(2.f) + (x * ExprHandle(3.f) + y * ExprHandle(4.f)))}); LLVMCodeGen cg(block, {a}); ASSERT_EQ(cg.value(args), 0); @@ -207,22 +198,20 @@ void testLLVMLetTest02() { void testLLVMLetTestMultitype() { KernelScope kernel_scope; - Buffer a(BufHandle("A", {1}, kDouble)); + Placeholder a(BufHandle("A", {1}, kDouble)); std::vector v = {1, 0}; std::vector args({v.data()}); VarHandle x("x", kByte); VarHandle y("y", kHalf); - auto block = Block::make({ - Let::make(x, 3), - Let::make(y, 6.f), - Store::make( - a, - {IntImm::make(0)}, - Cast::make( - kDouble, - ExprHandle(2.f) + (x * ExprHandle(3.f) + y * ExprHandle(4.f))), - IntImm::make(1)), - }); + auto block = + Block::make({Let::make(x, 3), + Let::make(y, 6.f), + a.store( + {0}, + Cast::make( + kDouble, + ExprHandle(2.f) + + (x * ExprHandle(3.f) + y * ExprHandle(4.f))))}); LLVMCodeGen cg(block, {a}); ASSERT_EQ(cg.value(args), 0); @@ -231,7 +220,7 @@ void testLLVMLetTestMultitype() { void testLLVMBufferTest() { KernelScope kernel_scope; - Buffer a(BufHandle("A", {32}, kFloat)); + Placeholder a(BufHandle("A", {32}, kFloat)); std::vector v(5); std::vector args({v.data()}); auto rv = IntImm::make(0); @@ -241,14 +230,14 @@ void testLLVMBufferTest() { void testLLVMBlockTest() { KernelScope kernel_scope; - 
Buffer a(BufHandle("A", {32}, kInt)); + Placeholder a(BufHandle("A", {32}, kInt)); std::vector v = {1, 2}; std::vector args({v.data()}); auto block = Block::make({ - Store::make(a, {IntImm::make(0)}, IntImm::make(3), IntImm::make(1)), - Store::make(a, {IntImm::make(1)}, IntImm::make(4), IntImm::make(1)), - Store::make(a, {IntImm::make(0)}, IntImm::make(4), IntImm::make(1)), + a.store({0}, 3), + a.store({1}, 4), + a.store({0}, 4), }); LLVMCodeGen cg(block, {a}); @@ -259,16 +248,12 @@ void testLLVMBlockTest() { void testLLVMLoadStoreTest() { KernelScope kernel_scope; - Buffer a(BufHandle("A", {1}, kInt)); - Buffer b(BufHandle("B", {1}, kInt)); + Placeholder a(BufHandle("A", {1}, kInt)); + Placeholder b(BufHandle("B", {1}, kInt)); std::vector a_buffer = {42}; std::vector b_buffer = {-11}; - auto store = Store::make( - b, - {IntImm::make(0)}, - Load::make(a, {IntImm::make(0)}, IntImm::make(1)), - IntImm::make(1)); + auto store = b.store({0}, a.load(0)); LLVMCodeGen cg(store, {a, b}); std::vector args({a_buffer.data(), b_buffer.data()}); ASSERT_EQ(cg.value(args), 0); @@ -278,21 +263,14 @@ void testLLVMLoadStoreTest() { void testLLVMIfThenElseTest() { KernelScope kernel_scope; - Buffer a(BufHandle("A", {1}, kInt)); - Buffer b(BufHandle("B", {1}, kInt)); - Buffer c(BufHandle("C", {1}, kInt)); + Placeholder a(BufHandle("A", {1}, kInt)); + Placeholder b(BufHandle("B", {1}, kInt)); + Placeholder c(BufHandle("C", {1}, kInt)); std::vector a_buffer = {42}; std::vector b_buffer = {-11}; std::vector c_buffer = {1}; - auto store = Store::make( - b, - {IntImm::make(0)}, - IfThenElse::make( - Load::make(c, {IntImm::make(0)}, IntImm::make(1)), // cond - Load::make(a, {IntImm::make(0)}, IntImm::make(1)), // then - IntImm::make(0)), // else - IntImm::make(1)); + auto store = b.store({0}, IfThenElse::make(c.load(0), a.load(0), 0)); LLVMCodeGen cg(store, {a, b, c}); std::vector args({a_buffer.data(), b_buffer.data(), c_buffer.data()}); ASSERT_EQ(cg.value(args), 0); @@ -302,15 +280,15 @@ void testLLVMIfThenElseTest() { void testLLVMVecLoadStoreTest() { KernelScope kernel_scope; - Buffer a(BufHandle("A", {1}, kInt)); - Buffer b(BufHandle("B", {1}, kInt)); + Placeholder a(BufHandle("A", {1}, kInt)); + Placeholder b(BufHandle("B", {1}, kInt)); std::vector a_buffer = {1, 1, 1, 1}; std::vector b_buffer = {2, 2, 2, 2}; - auto store = Store::make( - b, + auto store = b.storeWithMask( {Ramp::make(0, 1, 4)}, - Load::make(a, {Ramp::make(0, 1, 4)}, Broadcast::make(IntImm::make(1), 4)), + a.loadWithMask( + {Ramp::make(0, 1, 4)}, Broadcast::make(IntImm::make(1), 4)), Broadcast::make(IntImm::make(1), 4)); LLVMCodeGen cg(store, {a, b}); std::vector args({a_buffer.data(), b_buffer.data()}); @@ -328,16 +306,14 @@ void testLLVMVecLoadStoreTest() { #define FLOAT_INTRINSICS_TEST(Name, Lanes) \ void testLLVMVecFloat_##Name##Lane##Lanes##Test() { \ KernelScope kernel_scope; \ - Buffer a(BufHandle("A", {1}, kFloat)); \ - Buffer b(BufHandle("B", {1}, kFloat)); \ + Placeholder a(BufHandle("A", {1}, kFloat)); \ + Placeholder b(BufHandle("B", {1}, kFloat)); \ float val = 0.5f; \ std::vector a_buffer(Lanes, val); \ std::vector b_buffer(Lanes, val); \ - auto store = Store::make( \ - b, \ + auto store = b.storeWithMask( \ {Ramp::make(0, 1, Lanes)}, \ - Name(Load::make( \ - a, \ + Name(a.loadWithMask( \ {Ramp::make(0, 1, Lanes)}, \ Broadcast::make(IntImm::make(1), Lanes))), \ Broadcast::make(IntImm::make(1), Lanes)); \ @@ -373,16 +349,14 @@ FLOAT_INTRINSICS_TEST(lgamma, 8) #define DOUBLE_INTRINSICS_TEST(Name, Lanes) \ void 
testLLVMVecDouble_##Name##Lane##Lanes##Test() { \ KernelScope kernel_scope; \ - Buffer a(BufHandle("A", {1}, kDouble)); \ - Buffer b(BufHandle("B", {1}, kDouble)); \ + Placeholder a(BufHandle("A", {1}, kDouble)); \ + Placeholder b(BufHandle("B", {1}, kDouble)); \ float val = 0.5f; \ std::vector a_buffer(Lanes, val); \ std::vector b_buffer(Lanes, val); \ - auto store = Store::make( \ - b, \ + auto store = b.storeWithMask( \ {Ramp::make(0, 1, Lanes)}, \ - Name(Load::make( \ - a, \ + Name(a.loadWithMask( \ {Ramp::make(0, 1, Lanes)}, \ Broadcast::make(IntImm::make(1), Lanes))), \ Broadcast::make(IntImm::make(1), Lanes)); \ @@ -417,13 +391,12 @@ DOUBLE_INTRINSICS_TEST(lgamma, 4) void testLLVMVectorizerLoadStoreTest() { KernelScope kernel_scope; - Buffer a(BufHandle("A", {1}, kInt)); + Placeholder a(BufHandle("A", {1}, kInt)); - Tensor* c = Compute("c", {{4, "i"}}, [&](const VarHandle& i) { - return Load::make(a, {i}, 1); - }); + Tensor* c = + Compute("c", {{4, "i"}}, [&](const VarHandle& i) { return a.load(i); }); - Buffer c_buf(BufHandle(c->func_var())); + Placeholder c_buf(BufHandle(c->buf())); LoopNest l({c}); Stmt* s = l.root_stmt(); l.vectorize(dynamic_cast(s)->front()); @@ -442,15 +415,13 @@ void testLLVMVectorizerLoadStoreTest() { void testLLVMMemcpyTest() { KernelScope kernel_scope; constexpr int N = 32; - Buffer a(BufHandle("A", {N}, kInt)); - Buffer b(BufHandle("B", {N}, kInt)); + Placeholder a(BufHandle("A", {N}, kInt)); + Placeholder b(BufHandle("B", {N}, kInt)); std::vector a_buffer(N, 42); std::vector b_buffer(N, 0); - auto mask = IntImm::make(1); VarHandle i("i", kInt); - auto expr = - For::make(i, 0, N, Store::make(b, {i}, Load::make(a, {i}, mask), mask)); + auto expr = For::make(i, 0, N, b.store({i}, a.load(i))); LLVMCodeGen cg(expr, {a, b}); @@ -466,12 +437,11 @@ void testLLVMMemcpyTest() { void testLLVMBzeroTest() { KernelScope kernel_scope; constexpr int N = 32; - Buffer b(BufHandle("B", {N}, kInt)); + Placeholder b(BufHandle("B", {N}, kInt)); std::vector b_buffer(N, 11); - auto mask = IntImm::make(1); VarHandle i("i", kInt); - auto expr = For::make(i, 0, N, Store::make(b, {i}, IntImm::make(0), mask)); + auto expr = For::make(i, 0, N, b.store({i}, 0)); LLVMCodeGen cg(expr, {b}); @@ -485,24 +455,15 @@ void testLLVMBzeroTest() { void testLLVMElemwiseAdd() { KernelScope kernel_scope; constexpr int N = 1024; - Buffer a(BufHandle("A", {N}, kInt)); - Buffer b(BufHandle("B", {N}, kInt)); - Buffer c(BufHandle("C", {N}, kInt)); + Placeholder a(BufHandle("A", {N}, kInt)); + Placeholder b(BufHandle("B", {N}, kInt)); + Placeholder c(BufHandle("C", {N}, kInt)); std::vector a_buffer(N, 41); std::vector b_buffer(N, 1); std::vector c_buffer(N, 1); - auto mask = IntImm::make(1); VarHandle i("i", kInt); - auto expr = For::make( - i, - 0, - N, - Store::make( - c, - {i}, - Add::make(Load::make(a, {i}, mask), Load::make(b, {i}, mask)), - mask)); + auto expr = For::make(i, 0, N, c.store({i}, Add::make(a.load(i), b.load(i)))); LLVMCodeGen cg(expr, {a, b, c}); @@ -520,21 +481,15 @@ void testLLVMElemwiseAdd() { void testLLVMElemwiseAddFloat() { KernelScope kernel_scope; constexpr int N = 1024; - Buffer a(BufHandle("A", {N}, kFloat)); - Buffer b(BufHandle("B", {N}, kFloat)); - Buffer c(BufHandle("C", {N}, kFloat)); + Placeholder a(BufHandle("A", {N}, kFloat)); + Placeholder b(BufHandle("B", {N}, kFloat)); + Placeholder c(BufHandle("C", {N}, kFloat)); std::vector a_buffer(N, 41); std::vector b_buffer(N, 1); std::vector c_buffer(N, 1); - auto mask = IntImm::make(1); VarHandle i("i", kInt); - auto expr 
= For::make( - i, - 0, - N, - Store::make( - c, {i}, Load::make(a, {i}, mask) + Load::make(b, {i}, mask), mask)); + auto expr = For::make(i, 0, N, c.store({i}, a.load(i) + b.load(i))); LLVMCodeGen cg(expr, {a, b, c}); @@ -552,8 +507,8 @@ void testLLVMElemwiseAddFloat() { void testLLVMElemwiseLog10Float() { KernelScope kernel_scope; constexpr int N = 1024; - Buffer a(BufHandle("A", {N}, kFloat)); - Buffer b(BufHandle("B", {N}, kFloat)); + Placeholder a(BufHandle("A", {N}, kFloat)); + Placeholder b(BufHandle("B", {N}, kFloat)); std::vector a_buffer(N, 10.0f); std::vector b_buffer(N, 2.0f); @@ -563,10 +518,9 @@ void testLLVMElemwiseLog10Float() { i, 0, N / 4, - Store::make( - b, + b.storeWithMask( {Ramp::make(i * 4, 1, 4)}, - log10(Load::make(a, {Ramp::make(i * 4, 1, 4)}, mask)), + log10(a.loadWithMask({Ramp::make(i * 4, 1, 4)}, mask)), mask)); LLVMCodeGen cg(expr, {a, b}); @@ -583,8 +537,8 @@ void testLLVMElemwiseLog10Float() { void testLLVMElemwiseLog1pFloat() { KernelScope kernel_scope; constexpr int N = 1024; - Buffer a(BufHandle("A", {N}, kFloat)); - Buffer b(BufHandle("B", {N}, kFloat)); + Placeholder a(BufHandle("A", {N}, kFloat)); + Placeholder b(BufHandle("B", {N}, kFloat)); std::vector a_buffer(N, expf(3.0f) - 1); std::vector b_buffer(N, 42.0f); @@ -594,10 +548,9 @@ void testLLVMElemwiseLog1pFloat() { i, 0, N / 4, - Store::make( - b, + b.storeWithMask( {Ramp::make(i * 4, 1, 4)}, - log1p(Load::make(a, {Ramp::make(i * 4, 1, 4)}, mask)), + log1p(a.loadWithMask({Ramp::make(i * 4, 1, 4)}, mask)), mask)); LLVMCodeGen cg(expr, {a, b}); @@ -614,24 +567,16 @@ void testLLVMElemwiseLog1pFloat() { void testLLVMElemwiseMaxInt() { KernelScope kernel_scope; constexpr int N = 1024; - Buffer a(BufHandle("A", {N}, kInt)); - Buffer b(BufHandle("B", {N}, kInt)); - Buffer c(BufHandle("C", {N}, kInt)); + Placeholder a(BufHandle("A", {N}, kInt)); + Placeholder b(BufHandle("B", {N}, kInt)); + Placeholder c(BufHandle("C", {N}, kInt)); std::vector a_buffer(N, 41); std::vector b_buffer(N, 1); std::vector c_buffer(N, 1); - auto mask = IntImm::make(1); VarHandle i("i", kInt); - auto expr = For::make( - i, - 0, - N, - Store::make( - c, - {i}, - Max::make(Load::make(a, {i}, mask), Load::make(b, {i}, mask), false), - mask)); + auto expr = + For::make(i, 0, N, c.store({i}, Max::make(a.load(i), b.load(i), false))); LLVMCodeGen cg(expr, {a, b, c}); @@ -649,24 +594,16 @@ void testLLVMElemwiseMaxInt() { void testLLVMElemwiseMinInt() { KernelScope kernel_scope; constexpr int N = 1024; - Buffer a(BufHandle("A", {N}, kInt)); - Buffer b(BufHandle("B", {N}, kInt)); - Buffer c(BufHandle("C", {N}, kInt)); + Placeholder a(BufHandle("A", {N}, kInt)); + Placeholder b(BufHandle("B", {N}, kInt)); + Placeholder c(BufHandle("C", {N}, kInt)); std::vector a_buffer(N, 41); std::vector b_buffer(N, 1); std::vector c_buffer(N, 1); - auto mask = IntImm::make(1); VarHandle i("i", kInt); - auto expr = For::make( - i, - 0, - N, - Store::make( - c, - {i}, - Min::make(Load::make(a, {i}, mask), Load::make(b, {i}, mask), false), - mask)); + auto expr = + For::make(i, 0, N, c.store({i}, Min::make(a.load(i), b.load(i), false))); LLVMCodeGen cg(expr, {a, b, c}); @@ -684,24 +621,16 @@ void testLLVMElemwiseMinInt() { void testLLVMElemwiseMaxFloat() { KernelScope kernel_scope; constexpr int N = 1024; - Buffer a(BufHandle("A", {N}, kFloat)); - Buffer b(BufHandle("B", {N}, kFloat)); - Buffer c(BufHandle("C", {N}, kFloat)); + Placeholder a(BufHandle("A", {N}, kFloat)); + Placeholder b(BufHandle("B", {N}, kFloat)); + Placeholder c(BufHandle("C", {N}, 
kFloat)); std::vector a_buffer(N, 41); std::vector b_buffer(N, 1); std::vector c_buffer(N, 1); - auto mask = IntImm::make(1); VarHandle i("i", kInt); - auto expr = For::make( - i, - 0, - N, - Store::make( - c, - {i}, - Max::make(Load::make(a, {i}, mask), Load::make(b, {i}, mask), false), - mask)); + auto expr = + For::make(i, 0, N, c.store({i}, Max::make(a.load(i), b.load(i), false))); LLVMCodeGen cg(expr, {a, b, c}); @@ -719,24 +648,16 @@ void testLLVMElemwiseMaxFloat() { void testLLVMElemwiseMaxNaNFloat() { KernelScope kernel_scope; constexpr int N = 1024; - Buffer a(BufHandle("A", {N}, kFloat)); - Buffer b(BufHandle("B", {N}, kFloat)); - Buffer c(BufHandle("C", {N}, kFloat)); + Placeholder a(BufHandle("A", {N}, kFloat)); + Placeholder b(BufHandle("B", {N}, kFloat)); + Placeholder c(BufHandle("C", {N}, kFloat)); std::vector a_buffer(N, NAN); std::vector b_buffer(N, 1); std::vector c_buffer(N, 1); - auto mask = IntImm::make(1); VarHandle i("i", kInt); - auto expr = For::make( - i, - 0, - N, - Store::make( - c, - {i}, - Max::make(Load::make(a, {i}, mask), Load::make(b, {i}, mask), false), - mask)); + auto expr = + For::make(i, 0, N, c.store({i}, Max::make(a.load(i), b.load(i), false))); LLVMCodeGen cg(expr, {a, b, c}); @@ -755,24 +676,16 @@ void testLLVMElemwiseMaxNaNFloat() { void testLLVMElemwiseMinFloat() { KernelScope kernel_scope; constexpr int N = 1024; - Buffer a(BufHandle("A", {N}, kFloat)); - Buffer b(BufHandle("B", {N}, kFloat)); - Buffer c(BufHandle("C", {N}, kFloat)); + Placeholder a(BufHandle("A", {N}, kFloat)); + Placeholder b(BufHandle("B", {N}, kFloat)); + Placeholder c(BufHandle("C", {N}, kFloat)); std::vector a_buffer(N, 41); std::vector b_buffer(N, 1); std::vector c_buffer(N, 1); - auto mask = IntImm::make(1); VarHandle i("i", kInt); - auto expr = For::make( - i, - 0, - N, - Store::make( - c, - {i}, - Min::make(Load::make(a, {i}, mask), Load::make(b, {i}, mask), false), - mask)); + auto expr = + For::make(i, 0, N, c.store({i}, Min::make(a.load(i), b.load(i), false))); LLVMCodeGen cg(expr, {a, b, c}); @@ -790,24 +703,16 @@ void testLLVMElemwiseMinFloat() { void testLLVMElemwiseMinNaNFloat() { KernelScope kernel_scope; constexpr int N = 1024; - Buffer a(BufHandle("A", {N}, kFloat)); - Buffer b(BufHandle("B", {N}, kFloat)); - Buffer c(BufHandle("C", {N}, kFloat)); + Placeholder a(BufHandle("A", {N}, kFloat)); + Placeholder b(BufHandle("B", {N}, kFloat)); + Placeholder c(BufHandle("C", {N}, kFloat)); std::vector a_buffer(N, NAN); std::vector b_buffer(N, 1); std::vector c_buffer(N, 1); - auto mask = IntImm::make(1); VarHandle i("i", kInt); - auto expr = For::make( - i, - 0, - N, - Store::make( - c, - {i}, - Min::make(Load::make(a, {i}, mask), Load::make(b, {i}, mask), false), - mask)); + auto expr = + For::make(i, 0, N, c.store({i}, Min::make(a.load(i), b.load(i), false))); LLVMCodeGen cg(expr, {a, b, c}); @@ -826,24 +731,15 @@ void testLLVMElemwiseMinNaNFloat() { void testLLVMElemwiseMod() { KernelScope kernel_scope; constexpr int N = 1024; - Buffer a(BufHandle("A", {N}, kInt)); - Buffer b(BufHandle("B", {N}, kInt)); - Buffer c(BufHandle("C", {N}, kInt)); + Placeholder a(BufHandle("A", {N}, kInt)); + Placeholder b(BufHandle("B", {N}, kInt)); + Placeholder c(BufHandle("C", {N}, kInt)); std::vector a_buffer(N, 41); std::vector b_buffer(N, 23); std::vector c_buffer(N, 18); - auto mask = IntImm::make(1); VarHandle i("i", kInt); - auto expr = For::make( - i, - 0, - N, - Store::make( - c, - {i}, - Mod::make(Load::make(a, {i}, mask), Load::make(b, {i}, mask)), - mask)); + auto expr 
= For::make(i, 0, N, c.store({i}, Mod::make(a.load(i), b.load(i)))); LLVMCodeGen cg(expr, {a, b, c}); @@ -861,9 +757,9 @@ void testLLVMElemwiseMod() { void testLLVMCompareSelectIntEQ() { KernelScope kernel_scope; constexpr int N = 1024; - Buffer a(BufHandle("A", {N}, kInt)); - Buffer b(BufHandle("B", {N}, kInt)); - Buffer c(BufHandle("C", {N}, kInt)); + Placeholder a(BufHandle("A", {N}, kInt)); + Placeholder b(BufHandle("B", {N}, kInt)); + Placeholder c(BufHandle("C", {N}, kInt)); std::vector a_buffer(N, 1); std::vector b_buffer(N, 1); std::vector c_buffer(N, 0); @@ -874,20 +770,15 @@ void testLLVMCompareSelectIntEQ() { c_ref[i] = 0; } - auto mask = IntImm::make(1); VarHandle i("i", kInt); auto expr = For::make( i, 0, N, - Store::make( - c, + c.store( {i}, CompareSelect::make( - Load::make(a, {i}, mask), - Load::make(b, {i}, mask), - CompareSelectOperation::kEQ), - mask)); + a.load(i), b.load(i), CompareSelectOperation::kEQ))); LLVMCodeGen cg(expr, {a, b, c}); @@ -907,27 +798,22 @@ void testLLVMCompareSelectIntEQ() { void testLLVMCompareSelectFloatEQ() { KernelScope kernel_scope; constexpr int N = 1024; - Buffer a(BufHandle("A", {N}, kFloat)); - Buffer b(BufHandle("B", {N}, kFloat)); - Buffer c(BufHandle("C", {N}, kInt)); + Placeholder a(BufHandle("A", {N}, kFloat)); + Placeholder b(BufHandle("B", {N}, kFloat)); + Placeholder c(BufHandle("C", {N}, kInt)); std::vector a_buffer(N, 1.0f); std::vector b_buffer(N, 1.0f); std::vector c_buffer(N, 0); - auto mask = IntImm::make(1); VarHandle i("i", kInt); auto expr = For::make( i, 0, N, - Store::make( - c, + c.store( {i}, CompareSelect::make( - Load::make(a, {i}, mask), - Load::make(b, {i}, mask), - CompareSelectOperation::kEQ), - mask)); + a.load(i), b.load(i), CompareSelectOperation::kEQ))); LLVMCodeGen cg(expr, {a, b, c}); @@ -946,9 +832,9 @@ void testLLVMCompareSelectFloatEQ() { void testLLVMCompareSelectByteGT() { KernelScope kernel_scope; constexpr int N = 1024; - Buffer a(BufHandle("A", {N}, kByte)); - Buffer b(BufHandle("B", {N}, kByte)); - Buffer c(BufHandle("C", {N}, kInt)); + Placeholder a(BufHandle("A", {N}, kByte)); + Placeholder b(BufHandle("B", {N}, kByte)); + Placeholder c(BufHandle("C", {N}, kInt)); std::vector a_buffer(N, 0); std::vector b_buffer(N, 0); std::vector c_buffer(N, 0); @@ -959,20 +845,15 @@ void testLLVMCompareSelectByteGT() { c_ref[i] = 1; } - auto mask = IntImm::make(1); VarHandle i("i", kInt); auto expr = For::make( i, 0, N, - Store::make( - c, + c.store( {i}, CompareSelect::make( - Load::make(a, {i}, mask), - Load::make(b, {i}, mask), - CompareSelectOperation::kGT), - mask)); + a.load(i), b.load(i), CompareSelectOperation::kGT))); LLVMCodeGen cg(expr, {a, b, c}); @@ -992,28 +873,23 @@ void testLLVMCompareSelectByteGT() { void testLLVMCompareSelectByteGE() { KernelScope kernel_scope; constexpr int N = 1024; - Buffer a(BufHandle("A", {N}, kByte)); - Buffer b(BufHandle("B", {N}, kByte)); - Buffer c(BufHandle("C", {N}, kInt)); + Placeholder a(BufHandle("A", {N}, kByte)); + Placeholder b(BufHandle("B", {N}, kByte)); + Placeholder c(BufHandle("C", {N}, kInt)); std::vector a_buffer(N, 0); std::vector b_buffer(N, 0); std::vector c_buffer(N, 0); std::vector c_ref(N, 1); - auto mask = IntImm::make(1); VarHandle i("i", kInt); auto expr = For::make( i, 0, N, - Store::make( - c, + c.store( {i}, CompareSelect::make( - Load::make(a, {i}, mask), - Load::make(b, {i}, mask), - CompareSelectOperation::kGE), - mask)); + a.load(i), b.load(i), CompareSelectOperation::kGE))); LLVMCodeGen cg(expr, {a, b, c}); @@ -1033,9 +909,9 @@ void 
testLLVMCompareSelectByteGE() { void testLLVMCompareSelectByteLT() { KernelScope kernel_scope; constexpr int N = 1024; - Buffer a(BufHandle("A", {N}, kByte)); - Buffer b(BufHandle("B", {N}, kByte)); - Buffer c(BufHandle("C", {N}, kInt)); + Placeholder a(BufHandle("A", {N}, kByte)); + Placeholder b(BufHandle("B", {N}, kByte)); + Placeholder c(BufHandle("C", {N}, kInt)); std::vector a_buffer(N, 0); std::vector b_buffer(N, 128); std::vector c_buffer(N, 0); @@ -1046,20 +922,15 @@ void testLLVMCompareSelectByteLT() { c_ref[i] = 0; } - auto mask = IntImm::make(1); VarHandle i("i", kInt); auto expr = For::make( i, 0, N, - Store::make( - c, + c.store( {i}, CompareSelect::make( - Load::make(a, {i}, mask), - Load::make(b, {i}, mask), - CompareSelectOperation::kLT), - mask)); + a.load(i), b.load(i), CompareSelectOperation::kLT))); LLVMCodeGen cg(expr, {a, b, c}); @@ -1079,28 +950,23 @@ void testLLVMCompareSelectByteLT() { void testLLVMCompareSelectByteLE() { KernelScope kernel_scope; constexpr int N = 1024; - Buffer a(BufHandle("A", {N}, kByte)); - Buffer b(BufHandle("B", {N}, kByte)); - Buffer c(BufHandle("C", {N}, kInt)); + Placeholder a(BufHandle("A", {N}, kByte)); + Placeholder b(BufHandle("B", {N}, kByte)); + Placeholder c(BufHandle("C", {N}, kInt)); std::vector a_buffer(N, 0); std::vector b_buffer(N, 128); std::vector c_buffer(N, 0); std::vector c_ref(N, 1); - auto mask = IntImm::make(1); VarHandle i("i", kInt); auto expr = For::make( i, 0, N, - Store::make( - c, + c.store( {i}, CompareSelect::make( - Load::make(a, {i}, mask), - Load::make(b, {i}, mask), - CompareSelectOperation::kLE), - mask)); + a.load(i), b.load(i), CompareSelectOperation::kLE))); LLVMCodeGen cg(expr, {a, b, c}); @@ -1119,10 +985,9 @@ void testLLVMCompareSelectByteLE() { void testLLVMStoreFloat() { KernelScope kernel_scope; - Buffer result(BufHandle("result", {1}, kFloat)); + Placeholder result(BufHandle("result", {1}, kFloat)); std::vector result_buffer = {0.0f}; - auto expr = Store::make( - result, {IntImm::make(0)}, FloatImm::make(3.14f), IntImm::make(1)); + auto expr = result.store({0}, FloatImm::make(3.14f)); LLVMCodeGen cg(expr, {result}); std::vector args({result_buffer.data()}); ASSERT_EQ(cg.value(args), 0); @@ -1137,7 +1002,7 @@ void testLLVMSimpleMath01() { }); LoopNest l({tensor}); Stmt* stmt = l.root_stmt(); - Buffer f_buf(BufHandle(tensor->func_var())); + Placeholder f_buf(BufHandle(tensor->buf())); LLVMCodeGen cg(stmt, {f_buf}); PaddedBuffer f_v(N, "f_v"); @@ -1154,13 +1019,13 @@ void testLLVMSimpleMath01() { void testLLVMComputeMul() { KernelScope kernel_scope; const int N = 1024; - Buffer a(BufHandle("a", {N}, kFloat)); - Buffer b(BufHandle("b", {N}, kFloat)); + Placeholder a(BufHandle("a", {N}, kFloat)); + Placeholder b(BufHandle("b", {N}, kFloat)); Tensor* c = Compute("c", {{N, "i"}}, [&](const VarHandle& i) { - return Load::make(a, {i}, 1) * Load::make(b, {i}, 1); + return a.load(i) * b.load(i); }); - Buffer c_buf(BufHandle(c->func_var())); + Placeholder c_buf(BufHandle(c->buf())); LoopNest l({c}); Stmt* s = l.root_stmt(); @@ -1178,15 +1043,14 @@ void testLLVMBroadcastAdd() { KernelScope kernel_scope; const int M = 32; const int N = 1024; - Buffer a(BufHandle("a", {M, N}, kFloat)); - Buffer b(BufHandle("b", {N}, kFloat)); + Placeholder a(BufHandle("a", {M, N}, kFloat)); + Placeholder b(BufHandle("b", {N}, kFloat)); Tensor* c = Compute( "c", {{M, "i"}, {N, "j"}}, [&](const VarHandle& i, const VarHandle& j) { - ExprHandle mask(1); - return Load::make(a, {i, j}, mask) + Load::make(b, {j}, mask); + return 
a.load(i, j) + b.load(j); }); - Buffer c_buf(BufHandle(c->func_var())); + Placeholder c_buf(BufHandle(c->buf())); LoopNest l({c}); l.prepareForCodegen(); Stmt* s = l.root_stmt(); @@ -1225,11 +1089,11 @@ void testLLVMDynamicShapeAdd() { KernelScope kernel_scope; auto testWithSize = [](int32_t size) { VarHandle n("n", kInt); - Buffer a(BufHandle("a", {n}, kFloat)); - Buffer b(BufHandle("b", {n}, kFloat)); - Buffer c(BufHandle("c", {n}, kFloat)); + Placeholder a(BufHandle("a", {n}, kFloat)); + Placeholder b(BufHandle("b", {n}, kFloat)); + Placeholder c(BufHandle("c", {n}, kFloat)); VarHandle i("i", kInt); - Stmt* s = For::make(i, 0, n, Store::make(c, {i}, a(i) + b(i), 1)); + Stmt* s = For::make(i, 0, n, c.store({i}, a.load(i) + b.load(i))); std::vector aData(size, 1.0f); std::vector bData(size, 2.0f); std::vector cData(size, 0.0f); @@ -1247,11 +1111,11 @@ void testLLVMBindDynamicShapeAdd() { KernelScope kernel_scope; auto testWithSize = [](int32_t size) { VarHandle n("n", kInt); - Buffer a(BufHandle("a", {n}, kFloat)); - Buffer b(BufHandle("b", {n}, kFloat)); - Buffer c(BufHandle("c", {n}, kFloat)); + Placeholder a(BufHandle("a", {n}, kFloat)); + Placeholder b(BufHandle("b", {n}, kFloat)); + Placeholder c(BufHandle("c", {n}, kFloat)); VarHandle i("i", kInt); - Stmt* s = For::make(i, 0, n, Store::make(c, {i}, a(i) + b(i), 1)); + Stmt* s = For::make(i, 0, n, c.store({i}, a.load(i) + b.load(i))); std::vector aData(size, 1.0f); std::vector bData(size, 2.0f); std::vector cData(size, 0.0f); @@ -1268,10 +1132,11 @@ void testLLVMTensorDynamicShapeAdd() { KernelScope kernel_scope; auto testWithSize = [](int32_t size) { VarHandle n("n", kInt); - Buffer a(BufHandle("a", {n}, kFloat)); - Buffer b(BufHandle("b", {n}, kFloat)); - Tensor* c = Compute( - "c", {{n, "n"}}, [&](const VarHandle& i) { return a(i) + b(i); }); + Placeholder a(BufHandle("a", {n}, kFloat)); + Placeholder b(BufHandle("b", {n}, kFloat)); + Tensor* c = Compute("c", {{n, "n"}}, [&](const VarHandle& i) { + return a.load(i) + b.load(i); + }); LoopNest l({c}); Stmt* s = l.root_stmt(); LLVMCodeGen cg(s, {a, b, c, n}); @@ -1291,11 +1156,11 @@ void testLLVMDynamicShape2D() { auto testWithSize = [](int32_t M, int32_t N) { VarHandle m("m", kInt); VarHandle n("n", kInt); - Buffer a(BufHandle("a", {m, n}, kFloat)); - Buffer b(BufHandle("b", {m, n}, kFloat)); + Placeholder a(BufHandle("a", {m, n}, kFloat)); + Placeholder b(BufHandle("b", {m, n}, kFloat)); Tensor* c = Compute( "c", {{m, "m"}, {n, "n"}}, [&](const VarHandle& i, const VarHandle& j) { - return a(i, j) + b(i, j); + return a.load(i, j) + b.load(i, j); }); LoopNest l({c}); l.prepareForCodegen(); @@ -1323,7 +1188,7 @@ void testLLVMEmptyStmt() { void testLLVMEliminatedStmt() { KernelScope kernel_scope; - Buffer a(BufHandle("a", {1}, kFloat)); + Placeholder a(BufHandle("a", {1}, kFloat)); Tensor* c = Compute("c", {{0, "m"}}, [&](const VarHandle& m) { return m; }); @@ -1344,7 +1209,7 @@ void testLLVMSimpleReduction() { int N = 64; const int kTotalSize = M * N; - Buffer a("a", kFloat, {1, M, N}); + Placeholder a("a", kFloat, {1, M, N}); // TODO: why doesn't implicit vector work? std::vector axis = {DimArg(1)}; @@ -1383,7 +1248,7 @@ void testLLVMRFactorReduction() { int N = 64; const int kTotalSize = M * N; - Buffer a("a", kFloat, {1, M, N}); + Placeholder a("a", kFloat, {1, M, N}); // TODO: why doesn't implicit vector work? 
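// Sketch (not part of the patch), same setup as the sketches above: the
// Tensor::func_var() to Tensor::buf() change in these test_llvm.cpp hunks. To
// pass a computed tensor's storage to a codegen, the tests now wrap the Buf
// returned by buf() in a BufHandle-backed Placeholder. Assumes the LLVM
// codegen header that test_llvm.cpp already includes.
void sketchTensorBufArg() {
  KernelScope kernel_scope;
  Placeholder a(BufHandle("a", {1024}, kFloat));
  Tensor* c = Compute("c", {{1024, "i"}}, [&](const VarHandle& i) {
    return a.load(i) * a.load(i);
  });
  Placeholder c_buf(BufHandle(c->buf()));  // was: BufHandle(c->func_var())
  LoopNest l({c});
  l.prepareForCodegen();
  Stmt* s = l.root_stmt();
  LLVMCodeGen cg(s, {a, c_buf});
  (void)cg;
}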
std::vector axis = {DimArg(1)}; @@ -1433,7 +1298,7 @@ void DISABLED_testLLVMRFactorVectorizedReduction() { int N = 64; const int kTotalSize = M * N; - Buffer a("a", kFloat, {1, M, N}); + Placeholder a("a", kFloat, {1, M, N}); // TODO: why doesn't implicit vector work? std::vector axis = {DimArg(1)}; diff --git a/test/cpp/tensorexpr/test_loopnest.cpp b/test/cpp/tensorexpr/test_loopnest.cpp index 602eb116e7b9..60c8fb1d62c0 100644 --- a/test/cpp/tensorexpr/test_loopnest.cpp +++ b/test/cpp/tensorexpr/test_loopnest.cpp @@ -7,9 +7,7 @@ #include #include #include -#include #include -#include #include #include #include @@ -577,11 +575,11 @@ void testExprSplitWithMask01() { KernelScope kernel_scope; const int M = 26; const int N = 5; - Buffer a_buf("a", kFloat, {M, N}); - Buffer b_buf("b", kFloat, {M, N}); + Placeholder a_buf("a", kFloat, {M, N}); + Placeholder b_buf("b", kFloat, {M, N}); Tensor* tensor = Compute( "f", {{M, "m"}, {N, "n"}}, [&](const ExprHandle& m, const ExprHandle& n) { - return a_buf(m, n) + b_buf(m, n) + 1.0f; + return a_buf.load(m, n) + b_buf.load(m, n) + 1.0f; }); For* n_outer; For* n_inner; @@ -609,13 +607,47 @@ void testExprSplitWithMask01() { ExpectAllNear(c_v, c_ref, 1e-5); } +// Tests the case where we split a loop cleanly multiple times, we should not +// insert any masks. +void testExprSplitWithMaskRepeatedNoMask() { + KernelScope kernel_scope; + const int M = 64; + Placeholder a_buf("a", kFloat, {M}); + Placeholder b_buf("b", kFloat, {M}); + Tensor* tensor = Compute("f", {{M, "m"}}, [&](const ExprHandle& m) { + return a_buf.load(m) + b_buf.load(m) + 1.0f; + }); + + LoopNest l({tensor}); + std::vector loops = l.getLoopStmtsFor(tensor); + For *outer, *mid, *inner; + l.splitWithMask(loops[0], 4, &outer, &inner); + l.splitWithMask(outer, 4, &outer, &mid); + + Stmt* stmt1 = IRSimplifier::simplify(l.root_stmt()); + std::ostringstream oss; + oss << *stmt1; + + // Two splits mean 3 loops, but should need no masks in this case. 
+ const std::string& verification_pattern = + R"IR( +# CHECK: for ( +# CHECK-NOT: if ( +# CHECK: for ( +# CHECK-NOT: if ( +# CHECK: for ( +# CHECK-NOT: if ( +# CHECK: f[)IR"; + torch::jit::testing::FileCheck().run(verification_pattern, oss.str()); +} + void testSplitWithTailWithLoopOptions() { KernelScope kernel_scope; const int M = 21; - Buffer a_buf("a", kFloat, {M}); - Buffer b_buf("b", kFloat, {M}); + Placeholder a_buf("a", kFloat, {M}); + Placeholder b_buf("b", kFloat, {M}); Tensor* tensor = Compute("f", {{M, "m"}}, [&](const ExprHandle& m) { - return a_buf(m) + b_buf(m) + 1.0f; + return a_buf.load(m) + b_buf.load(m) + 1.0f; }); For *outer, *inner, *tail; @@ -642,10 +674,10 @@ void testSplitWithTailWithLoopOptions() { void testSplitWithMaskWithLoopOptions() { KernelScope kernel_scope; const int M = 21; - Buffer a_buf("a", kFloat, {M}); - Buffer b_buf("b", kFloat, {M}); + Placeholder a_buf("a", kFloat, {M}); + Placeholder b_buf("b", kFloat, {M}); Tensor* tensor = Compute("f", {{M, "m"}}, [&](const ExprHandle& m) { - return a_buf(m) + b_buf(m) + 1.0f; + return a_buf.load(m) + b_buf.load(m) + 1.0f; }); For *outer, *inner; @@ -667,13 +699,13 @@ void testScheduleBroadcastAddBuffer() { const int M = 4; const int N = 5; const int K = 6; - Buffer a_buf("a", kFloat, {M, N}); - Buffer b_buf("b", kFloat, {N, K}); + Placeholder a_buf("a", kFloat, {M, N}); + Placeholder b_buf("b", kFloat, {N, K}); Tensor* c = Compute( "broadcast_add", {{M, "m"}, {N, "n"}, {K, "k"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { - return a_buf(m, n) + b_buf(n, k); + return a_buf.load(m, n) + b_buf.load(n, k); }); LoopNest l({c}); Stmt* stmt = l.root_stmt(); @@ -716,13 +748,13 @@ void testScheduleFunctionCall01() { const int M = 4; const int N = 5; const int K = 6; - Buffer a_buf("a", kFloat, {M, N}); - Buffer b_buf("b", kFloat, {N, K}); + Placeholder a_buf("a", kFloat, {M, N}); + Placeholder b_buf("b", kFloat, {N, K}); Tensor* c = Compute( "broadcast_add", {{M, "m"}, {N, "n"}, {K, "k"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { - return a_buf(m, n) + b_buf(n, k); + return a_buf.load(m, n) + b_buf.load(n, k); }); Tensor* d = Compute( "d", @@ -773,22 +805,22 @@ void testScheduleInlineSimple() { const int M = 4; const int N = 5; const int K = 6; - Buffer a_buf("a", kFloat, {M, N}); - Buffer b_buf("b", kFloat, {N, K}); - Buffer c_buf("c", kFloat, {M, N}); - Buffer d_buf("d", kFloat, {M, K}); + Placeholder a_buf("a", kFloat, {M, N}); + Placeholder b_buf("b", kFloat, {N, K}); + Placeholder c_buf("c", kFloat, {M, N}); + Placeholder d_buf("d", kFloat, {M, K}); Tensor* x = Compute( "x", {{M, "m1"}, {N, "n1"}, {K, "k1"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { - return a_buf(m, n) * b_buf(n, k); + return a_buf.load(m, n) * b_buf.load(n, k); }); Tensor* y = Compute( "y", {{M, "m2"}, {N, "n2"}, {K, "k2"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { - return c_buf(m, n) * d_buf(m, k) + x->call(m, n, k); + return c_buf.load(m, n) * d_buf.load(m, k) + x->call(m, n, k); }); LoopNest l1({y}); @@ -854,22 +886,22 @@ void InlineFunc01Helper(const std::vector& inline_order) { const int M = 4; const int N = 5; const int K = 6; - Buffer a_buf("a", kFloat, {M, N}); - Buffer b_buf("b", kFloat, {N, K}); - Buffer c_buf("c", kFloat, {M, N}); - Buffer d_buf("d", kFloat, {M, K}); + Placeholder a_buf("a", kFloat, {M, N}); + Placeholder b_buf("b", kFloat, {N, K}); + Placeholder c_buf("c", kFloat, {M, N}); + Placeholder d_buf("d", kFloat, {M, K}); 
Tensor* x = Compute( "x", {{M, "m1"}, {N, "n1"}, {K, "k1"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { - return a_buf(m, n) * b_buf(n, k); + return a_buf.load(m, n) * b_buf.load(n, k); }); Tensor* y = Compute( "y", {{M, "m2"}, {N, "n2"}, {K, "k2"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { - return c_buf(m, n) * d_buf(m, k) + x->call(m, n, k); + return c_buf.load(m, n) * d_buf.load(m, k) + x->call(m, n, k); }); Tensor* z = Compute( "z", @@ -942,8 +974,9 @@ void InlineFunc01Helper(const std::vector& inline_order) { "z", {{M, "m3"}, {N, "n3"}, {K, "k3"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { - return a_buf(m, n) * b_buf(n, k) + - (c_buf(m, n) * d_buf(m, k) + a_buf(m, n) * b_buf(n, k)); + return a_buf.load(m, n) * b_buf.load(n, k) + + (c_buf.load(m, n) * d_buf.load(m, k) + + a_buf.load(m, n) * b_buf.load(n, k)); }); LoopNest l2({z2}); l2.prepareForCodegen(); @@ -1090,14 +1123,14 @@ void testScheduleInlineIntrinsics() { const int M = 4; const int N = 5; const int K = 6; - Buffer a_buf("a", kFloat, {M, N}); - Buffer b_buf("b", kFloat, {N, K}); + Placeholder a_buf("a", kFloat, {M, N}); + Placeholder b_buf("b", kFloat, {N, K}); Tensor* x = Compute( "x", {{M, "m1"}, {N, "n1"}, {K, "k1"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { - return a_buf(m, n) * b_buf(n, k); + return a_buf.load(m, n) * b_buf.load(n, k); }); Tensor* y = Compute( "y", @@ -1455,11 +1488,11 @@ void testScheduleFuserStyle() { const int kVectorCount = 128; const int kTotalSize = kVectorSize * kVectorCount; - Buffer a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); Tensor* b = Compute( "f", {{kTotalSize, "i"}}, [&](const std::vector& axes) { - return a_buf(axes[0]) + 11.0f; + return a_buf.load(axes[0]) + 11.0f; }); Tensor* c = Compute( @@ -1488,19 +1521,19 @@ void testScheduleFuserThreeArg() { const int kVectorCount = 128; const int kTotalSize = kVectorSize * kVectorCount; - Buffer a(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); - Buffer b(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); - Buffer c(BufHandle("C", {ExprHandle(kTotalSize)}, kFloat)); - Buffer d(BufHandle("D", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder a(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder b(BufHandle("B", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder c(BufHandle("C", {ExprHandle(kTotalSize)}, kFloat)); + Placeholder d(BufHandle("D", {ExprHandle(kTotalSize)}, kFloat)); Tensor* e = Compute("e", {{kTotalSize, "i"}}, [&](const VarHandle& i) { - return a(i) + b(i); + return a.load(i) + b.load(i); }); Tensor* f = Compute("f", {{kTotalSize, "i"}}, [&](const VarHandle& i) { - return (*e)(i) + c(i); + return e->call(i) + c.load(i); }); Tensor* g = Compute("g", {{kTotalSize, "i"}}, [&](const VarHandle& i) { - return (*f)(i) + d(i); + return f->call(i) + d.load(i); }); LoopNest l({g}); @@ -1526,11 +1559,11 @@ void testScheduleDynamicShape2D() { auto testWithSize = [](int32_t M, int32_t N) { VarHandle m("m", kInt); VarHandle n("n", kInt); - Buffer a(BufHandle("a", {m, n}, kFloat)); - Buffer b(BufHandle("b", {m, n}, kFloat)); + Placeholder a(BufHandle("a", {m, n}, kFloat)); + Placeholder b(BufHandle("b", {m, n}, kFloat)); Tensor* c = Compute( "c", {{m, "m"}, {n, "n"}}, [&](const VarHandle& i, const VarHandle& j) { - return a(i, j) + b(i, j); + return a.load(i, j) + b.load(i, j); }); LoopNest l({c}); Stmt* s = l.root_stmt(); @@ -2050,16 +2083,19 @@ void 
testLoopNestReorderExtraStatements() { }); LoopNest l({tensor}); - Buffer extra(BufHandle("res", {6, 3}, kFloat)); + Placeholder extra(BufHandle("res", {6, 3}, kFloat)); auto loops = l.getLoopStmtsFor(tensor); VarHandle i = VarHandle(loops[0]->var()); - Stmt* store_1 = Store::make(extra, {i, 0}, ExprHandle(1.f), 1); - Stmt* store_2 = Store::make(extra, {i, 1}, ExprHandle(2.f), 1); + Stmt* store_1 = + Store::make(BufHandle(extra.data()), {i, 0}, ExprHandle(1.f), 1); + Stmt* store_2 = + Store::make(BufHandle(extra.data()), {i, 1}, ExprHandle(2.f), 1); // stmt 3 is the Function body. - Stmt* store_3 = Store::make(extra, {i, 2}, ExprHandle(4.f), 1); + Stmt* store_3 = + Store::make(BufHandle(extra.data()), {i, 2}, ExprHandle(4.f), 1); loops[0]->body()->prepend_stmt(store_1); loops[1]->body()->prepend_stmt(store_2); @@ -2190,16 +2226,16 @@ void LoopNestReorderTestHelper( [](const std::vector&) { return -1; }); LoopNest l({c}); - Buffer extra(BufHandle("extra", {5}, kInt)); + Placeholder extra(BufHandle("extra", {5}, kInt)); auto loops = l.getLoopStmtsFor(c); int j = 0; for (auto* l : loops) { // Add an increment at each layer of the loop which counts the number of // times the loop executes. - Load* load = new Load(extra, {new IntImm(j)}, new IntImm(1)); + Load* load = new Load(extra.data(), {new IntImm(j)}, new IntImm(1)); Add* add = new Add(load, new IntImm(1)); - Stmt* store = Store::make(extra, {j}, ExprHandle(add), 1); + Stmt* store = new Store(extra.data(), {new IntImm(j)}, add, new IntImm(1)); if (prepend) { l->body()->prepend_stmt(store); } @@ -2301,22 +2337,22 @@ void testLoopNestReorderInternalLoopNest() { const int M = 4; const int N = 5; const int K = 6; - Buffer a_buf("a", kFloat, {M, N}); - Buffer b_buf("b", kFloat, {N, K}); - Buffer c_buf("c", kFloat, {M, N}); - Buffer d_buf("d", kFloat, {M, K}); + Placeholder a_buf("a", kFloat, {M, N}); + Placeholder b_buf("b", kFloat, {N, K}); + Placeholder c_buf("c", kFloat, {M, N}); + Placeholder d_buf("d", kFloat, {M, K}); Tensor* x = Compute( "x", {{M, "m1"}, {N, "n1"}, {K, "k1"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { - return a_buf(m, n) * b_buf(n, k); + return a_buf.load(m, n) * b_buf.load(n, k); }); Tensor* y = Compute( "y", {{M, "m2"}, {N, "n2"}, {K, "k2"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { - return c_buf(m, n) * d_buf(m, k) + x->call(m, n, k); + return c_buf.load(m, n) * d_buf.load(m, k) + x->call(m, n, k); }); Tensor* z = Compute( "z", @@ -2824,9 +2860,9 @@ void testNormalizeAndSplitWithTail() { // Create a dummy tensor to construct LoopNest. 
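// Sketch (not from the patch): when a statement is spliced into an existing
// loop body by hand, the reorder tests above fall back to raw Store/Load node
// construction and reach the Placeholder's underlying Buf through .data().
// Names below are illustrative; the calls mirror the hunks.
void sketchRawNodesFromPlaceholder() {
  KernelScope kernel_scope;
  Placeholder counters(BufHandle("counters", {5}, kInt));

  // Handle level: wrap the Buf* in a BufHandle and keep the explicit mask.
  Stmt* reset = Store::make(BufHandle(counters.data()), {0}, 0, 1);

  // Node level: build Load/Add/Store IR nodes directly, as the reorder helper
  // does to count how many times each loop level runs.
  Load* load = new Load(counters.data(), {new IntImm(0)}, new IntImm(1));
  Add* add = new Add(load, new IntImm(1));
  Stmt* bump = new Store(counters.data(), {new IntImm(0)}, add, new IntImm(1));

  Block* body = Block::make({reset, bump});
  (void)body;
}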
ExprHandle n(100); - Buffer a(BufHandle("a", {n}, kFloat)); + Placeholder a(BufHandle("a", {n}, kFloat)); Tensor* b = - Compute("b", {{n, "i"}}, [&](const VarHandle& i) { return a(i); }); + Compute("b", {{n, "i"}}, [&](const VarHandle& i) { return a.load(i); }); LoopNest l({b}); // Input IR: @@ -2872,9 +2908,10 @@ void testDetectInlineRankMismatch() { KernelScope kernel_scope; const int kTotalSize = 8; - Buffer a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); - Tensor* a = Compute( - "a", {{kTotalSize, "i"}}, [&](const VarHandle& i) { return a_buf(i); }); + Placeholder a_buf(BufHandle("A", {ExprHandle(kTotalSize)}, kFloat)); + Tensor* a = Compute("a", {{kTotalSize, "i"}}, [&](const VarHandle& i) { + return a_buf.load(i); + }); Tensor* reshape = Compute( "reshape", {{kTotalSize / 2, "i"}, {2, "j"}}, @@ -2882,7 +2919,7 @@ void testDetectInlineRankMismatch() { LoopNest l({reshape}); ASSERT_THROWS_WITH( l.computeInline(l.getLoopBodyFor(a)), - "Buffer indexed access is inconsistent with its rank"); + "Placeholder indexed access is inconsistent with its rank"); } } // namespace jit diff --git a/test/cpp/tensorexpr/test_reductions.cpp b/test/cpp/tensorexpr/test_reductions.cpp index 2af8e33d3981..8436388f0d6b 100644 --- a/test/cpp/tensorexpr/test_reductions.cpp +++ b/test/cpp/tensorexpr/test_reductions.cpp @@ -7,9 +7,7 @@ #include "test/cpp/tensorexpr/padded_buffer.h" #include "torch/csrc/jit/tensorexpr/analysis.h" -#include "torch/csrc/jit/tensorexpr/buffer.h" #include "torch/csrc/jit/tensorexpr/eval.h" -#include "torch/csrc/jit/tensorexpr/function.h" #include "torch/csrc/jit/tensorexpr/ir.h" #include "torch/csrc/jit/tensorexpr/ir_printer.h" #include "torch/csrc/jit/tensorexpr/ir_simplifier.h" @@ -25,7 +23,7 @@ using namespace torch::jit::tensorexpr; void testReduceSum1D() { KernelScope kernel_scope; - Buffer b(BufHandle("b", {10}, kFloat)); + Placeholder b(BufHandle("b", {10}, kFloat)); std::vector in(10); for (int j = 0; j < 10; ++j) { in[j] = j; @@ -54,7 +52,7 @@ void testReduceSum2D() { VarHandle m("m", kInt); VarHandle n("n", kInt); - Buffer b(BufHandle("b", {m, n}, kFloat)); + Placeholder b(BufHandle("b", {m, n}, kFloat)); std::vector in(M * N); for (int i = 0; i < M; ++i) { for (int j = 0; j < N; ++j) { @@ -92,7 +90,7 @@ void testReduceSum3D() { const int M = 10; VarHandle m("m", kInt); - Buffer b(BufHandle("b", {2, 3, m}, kFloat)); + Placeholder b(BufHandle("b", {2, 3, m}, kFloat)); Tensor* c = Reduce("sum", {{2, "l"}, {3, "n"}}, Sum(), b, {{m, "m"}}); LoopNest loop({c}); @@ -140,7 +138,7 @@ void testReduceSum3D() { } // This is the same as just reducing the original result across that axis. 
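// Sketch (illustrative, not part of the patch): Reduce now consumes a
// Placeholder directly, and reducing an already-computed Tensor is expressed
// by wrapping that tensor's buf() in a fresh Placeholder, exactly as
// testReduceSum3D does in the hunk that follows.
void sketchChainedReduce() {
  KernelScope kernel_scope;
  VarHandle m("m", kInt);
  Placeholder b(BufHandle("b", {2, 3, m}, kFloat));

  // First reduction: sum out the trailing m axis.
  Tensor* c = Reduce("sum", {{2, "l"}, {3, "n"}}, Sum(), b, {{m, "m"}});

  // Second reduction: treat c's storage as an input and sum out the n axis.
  Placeholder c_buf(BufHandle(c->buf()));
  Tensor* e = Reduce("sum2", {{2, "l"}}, Sum(), c_buf, {{3, "n"}});

  LoopNest loop({e});
  loop.prepareForCodegen();
  Stmt* s = IRSimplifier::simplify(loop.root_stmt());
  (void)s;
}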
- Buffer c_buf(BufHandle(c->func_var())); + Placeholder c_buf(BufHandle(c->buf())); Tensor* e = Reduce("sum3", {{2, "l"}}, Sum(), c_buf, {{3, "m"}}); LoopNest loop3({e}); loop3.prepareForCodegen(); @@ -159,9 +157,9 @@ void testReduceSum3D() { void testReduceSum10D() { KernelScope kernel_scope; - Buffer in_(BufHandle("in_", {2, 3, 2, 3, 2, 3, 2, 3, 2, 3}, kFloat)); + Placeholder in_(BufHandle("in_", {2, 3, 2, 3, 2, 3, 2, 3, 2, 3}, kFloat)); const int InputSize = 2 * 3 * 2 * 3 * 2 * 3 * 2 * 3 * 2 * 3; - Buffer out_(BufHandle("out_", {2, 3, 2, 3, 2}, kFloat)); + Placeholder out_(BufHandle("out_", {2, 3, 2, 3, 2}, kFloat)); const int OutputSize = 2 * 3 * 2 * 3 * 2; std::vector in(InputSize, 1.f); @@ -195,7 +193,7 @@ void testReduceProduct() { const int M = 4; const int N = 4; - Buffer b(BufHandle("b", {M, N}, kFloat)); + Placeholder b(BufHandle("b", {M, N}, kFloat)); std::vector in(M * N); for (int i = 0; i < M; ++i) { for (int j = 0; j < N; ++j) { @@ -232,7 +230,7 @@ void testReduceProduct() { void testReduceMax() { KernelScope kernel_scope; - Buffer in_(BufHandle("b", {10}, kFloat)); + Placeholder in_(BufHandle("b", {10}, kFloat)); std::vector in(10); std::vector out(1, -1.f); @@ -252,7 +250,7 @@ void testReduceMax() { ASSERT_EQ(out[0], 9); - Buffer in2_(BufHandle("b", {2, 5}, kFloat)); + Placeholder in2_(BufHandle("b", {2, 5}, kFloat)); std::vector out2(2, -1.f); Tensor* m2d = Reduce("max", {{2, "n"}}, Maximum(kFloat), in2_, {{5, "m"}}); @@ -274,7 +272,7 @@ void testReduceMinCustomInitializer() { KernelScope kernel_scope; VarHandle minInit("minInit", kFloat); - Buffer in_(BufHandle("b", {10}, kFloat)); + Placeholder in_(BufHandle("b", {10}, kFloat)); std::vector in(10); std::vector out(1, -1.f); @@ -286,7 +284,7 @@ void testReduceMinCustomInitializer() { "min", {}, Minimum(ExprHandle(minInit)), - [&](ParameterList& v) { return in_.call(v); }, + [&](ParameterList& v) { return in_.load(v); }, {{10, "m"}}); LoopNest loop({min}); @@ -312,7 +310,7 @@ void testReduceAnyAll() { KernelScope kernel_scope; VarHandle searchValue("searchValue", kInt); - Buffer b(BufHandle("b", {4, 10}, kInt)); + Placeholder b(BufHandle("b", {4, 10}, kInt)); Reducer anyEqSV(ExprHandle(0), [](ExprHandle a, ExprHandle b) { return CompareSelect::make(a, 1, 1, b, kEQ); @@ -323,7 +321,7 @@ void testReduceAnyAll() { {{4, "i"}}, anyEqSV, [&](const auto& i, const auto& j) { - return CompareSelect::make(b(i, j), searchValue, kEQ); + return CompareSelect::make(b.load(i, j), searchValue, kEQ); }, {{10, "j"}}); @@ -366,7 +364,7 @@ void testReduceAnyAll() { {{4, "i"}}, allGTSV, [&](const auto& i, const auto& j) { - return CompareSelect::make(b(i, j), searchValue, kGT); + return CompareSelect::make(b.load(i, j), searchValue, kGT); }, {{10, "j"}}); @@ -397,8 +395,8 @@ void testReduceAnyAll() { void testReduceMatmul2D() { KernelScope kernel_scope; - Buffer tA(BufHandle("tA", {3, 2}, kFloat)); - Buffer tB(BufHandle("tB", {2, 3}, kFloat)); + Placeholder tA(BufHandle("tA", {3, 2}, kFloat)); + Placeholder tB(BufHandle("tB", {2, 3}, kFloat)); std::vector tA_(6); std::vector tB_(6); @@ -416,7 +414,7 @@ void testReduceMatmul2D() { {{3, "m"}, {3, "n"}}, Sum(), [&](const ExprHandle& m, const ExprHandle& n, const ExprHandle& k) { - return tA(m, k) * tB(k, n); + return tA.load(m, k) * tB.load(k, n); }, {{2, "k"}}); @@ -439,7 +437,7 @@ void testReduceMatmul2D() { void testReduceRfactorLike() { KernelScope kernel_scope; - Buffer in(BufHandle("in", {10, 10}, kFloat)); + Placeholder in(BufHandle("in", {10, 10}, kFloat)); std::vector in_(100); for 
(int i = 0; i < 100; ++i) { in_[i] = i; @@ -448,7 +446,7 @@ void testReduceRfactorLike() { std::vector out(1, -1.f); Tensor* l1 = Reduce("l1", {{10, "i"}}, Sum(), in, {{10, "j"}}); - Buffer in_rf(BufHandle(l1->func_var())); + Placeholder in_rf(BufHandle(l1->buf())); Tensor* l2 = Reduce("l2", {}, Sum(), in_rf, {{10, "i"}}); @@ -469,15 +467,15 @@ void testReduceAsProducer() { const int M = 10; VarHandle m("m", kInt); - Buffer a(BufHandle("a", {2, 3}, kFloat)); - Buffer b(BufHandle("b", {2, 3, m}, kFloat)); + Placeholder a(BufHandle("a", {2, 3}, kFloat)); + Placeholder b(BufHandle("b", {2, 3, m}, kFloat)); Tensor* c = Reduce("sum", {{2, "l1"}, {3, "n1"}}, Sum(), b, {{m, "m1"}}); Tensor* d = Compute( "scale", {{2, "l2"}, {3, "n1"}}, [&](const VarHandle& l, const VarHandle& n) { - return c->call(l, n) * a(l, n); + return c->call(l, n) * a.load(l, n); }); LoopNest loop({d}); loop.prepareForCodegen(); @@ -513,14 +511,14 @@ void testReduceAsConsumer() { const int M = 10; VarHandle m("m", kInt); - Buffer a(BufHandle("a", {2, 3, m}, kFloat)); - Buffer b(BufHandle("b", {2, 3, m}, kFloat)); + Placeholder a(BufHandle("a", {2, 3, m}, kFloat)); + Placeholder b(BufHandle("b", {2, 3, m}, kFloat)); Tensor* c = Compute( "scale", {{2, "l2"}, {3, "n1"}, {m, "m1"}}, [&](const VarHandle& l, const VarHandle& n, const VarHandle& m) { - return b(l, n, m) * a(l, n, m); + return b.load(l, n, m) * a.load(l, n, m); }); Tensor* d = Reduce("sum", {{2, "l1"}}, Sum(), c, {{3, "n1"}, {m, "m1"}}); LoopNest loop({d}); @@ -559,7 +557,7 @@ void testReduceAsConsumer() { void testSplitReduceAxis() { KernelScope kernel_scope; - Buffer in(BufHandle("in", {16, 8}, kFloat)); + Placeholder in(BufHandle("in", {16, 8}, kFloat)); std::vector in_(16 * 8); for (int i = 0; i < 16; ++i) { @@ -593,7 +591,7 @@ void testSplitReduceAxis() { void testSplitNonReduceAxis() { KernelScope kernel_scope; - Buffer in(BufHandle("in", {16, 8}, kFloat)); + Placeholder in(BufHandle("in", {16, 8}, kFloat)); std::vector in_(16 * 8); for (int i = 0; i < 16; ++i) { @@ -637,7 +635,7 @@ void testReorderedReductionInitializer() { SumOp(c(k, n), 0, a(k, m, n), {m}) */ - Buffer in(BufHandle("in", {1, 12, 6}, kFloat)); + Placeholder in(BufHandle("in", {1, 12, 6}, kFloat)); std::vector in_(12 * 6, 1.f); Tensor* tensor_ = Reduce("sum", {{1, "k"}, {12, "n"}}, Sum(), in, {{6, "m"}}); @@ -685,7 +683,7 @@ void testReduceRfactor() { VarHandle m("m", kInt); VarHandle n("n", kInt); - Buffer b(BufHandle("b", {m, n}, kFloat)); + Placeholder b(BufHandle("b", {m, n}, kFloat)); std::vector in(M * N); for (int j = 0; j < M * N; ++j) { in[j] = j; @@ -720,7 +718,7 @@ void testReduce3DRfactorInternal() { VarHandle n("n", kInt); VarHandle k("k", kInt); - Buffer b(BufHandle("b", {m, n, k}, kFloat)); + Placeholder b(BufHandle("b", {m, n, k}, kFloat)); std::vector in(M * N * K); for (int j = 0; j < M * N * K; ++j) { in[j] = j; @@ -755,7 +753,7 @@ void testReduce3DRfactorInner() { VarHandle n("n", kInt); VarHandle k("k", kInt); - Buffer b(BufHandle("b", {m, n, k}, kFloat)); + Placeholder b(BufHandle("b", {m, n, k}, kFloat)); std::vector in(M * N * K); for (int j = 0; j < M * N * K; ++j) { in[j] = j; @@ -790,7 +788,7 @@ void testReduce3DRfactorOuter() { VarHandle n("n", kInt); VarHandle k("k", kInt); - Buffer b(BufHandle("b", {m, n, k}, kFloat)); + Placeholder b(BufHandle("b", {m, n, k}, kFloat)); std::vector in(M * N * K); for (int j = 0; j < M * N * K; ++j) { in[j] = j; @@ -826,7 +824,7 @@ void testReduce3DRfactorWithOuter() { VarHandle n("n", kInt); VarHandle k("k", kInt); - Buffer 
b(BufHandle("b", {l, m, n, k}, kFloat)); + Placeholder b(BufHandle("b", {l, m, n, k}, kFloat)); std::vector in(L * M * N * K); for (int j = 0; j < M * N * K; ++j) { in[j] = j; @@ -861,7 +859,7 @@ void testReduce3DRfactorRepeated() { VarHandle n("n", kInt); VarHandle k("k", kInt); - Buffer b(BufHandle("b", {m, n, k}, kFloat)); + Placeholder b(BufHandle("b", {m, n, k}, kFloat)); std::vector in(M * N * K); for (int j = 0; j < M * N * K; ++j) { in[j] = j; @@ -907,7 +905,7 @@ void testReduceRfactorInsertionPoint() { VarHandle m("m", kInt); VarHandle n("n", kInt); - Buffer b(BufHandle("b", {m, n}, kFloat)); + Placeholder b(BufHandle("b", {m, n}, kFloat)); std::vector in(M * N); for (int j = 0; j < M * N; ++j) { in[j] = j; @@ -942,7 +940,7 @@ void testReduce3DRfactorInsertionPoint() { VarHandle n("n", kInt); VarHandle k("k", kInt); - Buffer b(BufHandle("b", {m, n, k}, kFloat)); + Placeholder b(BufHandle("b", {m, n, k}, kFloat)); std::vector in(M * N * K); for (int j = 0; j < M * N * K; ++j) { in[j] = j; @@ -969,7 +967,7 @@ void testReduce3DRfactorInsertionPoint() { void testReduceRepeatedInternalRfactor() { KernelScope kernel_scope; - Buffer in_(BufHandle("in_", {2, 3, 4, 5, 6}, kFloat)); + Placeholder in_(BufHandle("in_", {2, 3, 4, 5, 6}, kFloat)); const int InputSize = 2 * 3 * 4 * 5 * 6; std::vector in(InputSize, 1.f); @@ -1020,7 +1018,7 @@ void testReduceSplitTail() { const int N = 10; const int K = 10; - Buffer b(BufHandle("b", {M, N, K}, kFloat)); + Placeholder b(BufHandle("b", {M, N, K}, kFloat)); std::vector in(M * N * K); for (int j = 0; j < M * N * K; ++j) { in[j] = j; @@ -1053,7 +1051,7 @@ void testReduceSplitNoTail() { const int M = 10; const int N = 10; const int K = 10; - Buffer b(BufHandle("b", {M, N, K}, kFloat)); + Placeholder b(BufHandle("b", {M, N, K}, kFloat)); std::vector in(M * N * K); for (int j = 0; j < M * N * K; ++j) { in[j] = j; @@ -1088,7 +1086,7 @@ void testReduceOverSplitTail() { const int N = 10; const int K = 10; - Buffer b(BufHandle("b", {M, N, K}, kFloat)); + Placeholder b(BufHandle("b", {M, N, K}, kFloat)); std::vector in(M * N * K); for (int j = 0; j < M * N * K; ++j) { in[j] = j; @@ -1122,7 +1120,7 @@ void testReduceSplitMask() { const int N = 10; const int K = 10; - Buffer b(BufHandle("b", {M, N, K}, kFloat)); + Placeholder b(BufHandle("b", {M, N, K}, kFloat)); std::vector in(M * N * K); for (int j = 0; j < M * N * K; ++j) { in[j] = j; @@ -1155,7 +1153,7 @@ void testReduceSplitNoMask() { const int M = 10; const int N = 10; const int K = 10; - Buffer b(BufHandle("b", {M, N, K}, kFloat)); + Placeholder b(BufHandle("b", {M, N, K}, kFloat)); std::vector in(M * N * K); for (int j = 0; j < M * N * K; ++j) { in[j] = j; @@ -1189,7 +1187,7 @@ void testReduceOverSplitMask() { const int N = 10; const int K = 10; - Buffer b(BufHandle("b", {M, N, K}, kFloat)); + Placeholder b(BufHandle("b", {M, N, K}, kFloat)); std::vector in(M * N * K); for (int j = 0; j < M * N * K; ++j) { in[j] = j; @@ -1225,7 +1223,7 @@ void testReduceSplitRfactor() { const int K = 10; const int SPLIT_FACTOR = 4; - Buffer b(BufHandle("b", {M, N, K}, kFloat)); + Placeholder b(BufHandle("b", {M, N, K}, kFloat)); std::vector in(M * N * K); for (int m = 0; m < M; ++m) { for (int j = 0; j < N * K; ++j) { @@ -1264,7 +1262,7 @@ void testReduceOverSplitRfactor() { const int K = 10; const int SPLIT_FACTOR = 16; - Buffer b(BufHandle("b", {N, K}, kFloat)); + Placeholder b(BufHandle("b", {N, K}, kFloat)); std::vector in(N * K); for (int j = 0; j < N * K; ++j) { in[j] = j; @@ -1314,12 +1312,12 @@ void 
testReduceInlineReduction() { const int N = 5; const int K = 6; - Buffer a_buf("a", kFloat, {M}); - Buffer b_buf("b", kFloat, {M, N, K}); + Placeholder a_buf("a", kFloat, {M}); + Placeholder b_buf("b", kFloat, {M, N, K}); Tensor* x = Reduce("x", {{M, "m1"}}, Sum(), b_buf, {{N, "n1"}, {K, "k1"}}); Tensor* y = Compute("y", {{M, "m2"}}, [&](const VarHandle& m) { - return a_buf(m) + x->call(m); + return a_buf.load(m) + x->call(m); }); PaddedBuffer a_v(M); @@ -1347,14 +1345,14 @@ void testReduceInlineConsumer() { const int N = 5; const int K = 6; - Buffer a_buf("a", kFloat, {M, N, K}); - Buffer b_buf("b", kFloat, {M, N, K}); + Placeholder a_buf("a", kFloat, {M, N, K}); + Placeholder b_buf("b", kFloat, {M, N, K}); Tensor* x = Compute( "x", {{M, "m1"}, {N, "n1"}, {K, "k1"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { - return a_buf(m, n, k) + b_buf(m, n, k); + return a_buf.load(m, n, k) + b_buf.load(m, n, k); }); Tensor* y = Reduce("y", {{M, "m2"}}, Sum(), x, {{N, "n2"}, {K, "k2"}}); @@ -1401,14 +1399,14 @@ void testReduceInlineReducerInternal() { const int N = 5; const int K = 6; - Buffer a_buf("a", kFloat, {M, N, K}); - Buffer b_buf("b", kFloat, {M, N, K}); + Placeholder a_buf("a", kFloat, {M, N, K}); + Placeholder b_buf("b", kFloat, {M, N, K}); Tensor* x = Compute( "x", {{M, "m1"}, {N, "n1"}, {K, "k1"}}, [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { - return a_buf(m, n, k) + b_buf(m, n, k); + return a_buf.load(m, n, k) + b_buf.load(m, n, k); }); Reducer minimum(ExprHandle(0.f), [&](ExprHandle a, ExprHandle b) { diff --git a/test/cpp/tensorexpr/test_registerizer.cpp b/test/cpp/tensorexpr/test_registerizer.cpp index e7a28f1fb277..b286ab7b8151 100644 --- a/test/cpp/tensorexpr/test_registerizer.cpp +++ b/test/cpp/tensorexpr/test_registerizer.cpp @@ -13,7 +13,7 @@ using namespace torch::jit::tensorexpr; // Can replace a simple scalar access with a local variable. void testRegisterizerSimple() { KernelScope kernel_scope; - Buffer a(BufHandle("A", {1}, kInt)); + BufHandle a("A", {1}, kInt); VarHandle x("x", kInt); Stmt* stmt = Block::make({Store::make(a, {0}, 0, 1), @@ -58,7 +58,7 @@ void testRegisterizerSimple() { // Won't do replacement of a loop access. void testRegisterizerLoop() { KernelScope kernel_scope; - Buffer a(BufHandle("A", {10}, kInt)); + BufHandle a("A", {10}, kInt); VarHandle x("x", kInt); Stmt* stmt = Block::make({Store::make(a, {0}, 0, 1), @@ -105,7 +105,7 @@ void testRegisterizerLoop() { // invalidate it. void testRegisterizerLoopFixedLoad() { KernelScope kernel_scope; - Buffer a(BufHandle("A", {1}, kInt)); + BufHandle a("A", {1}, kInt); VarHandle x("x", kInt); Stmt* stmt = Block::make({Store::make(a, {0}, 0, 1), @@ -151,7 +151,7 @@ void testRegisterizerLoopFixedLoad() { // Will registerize multiple accesses of different items of the same buffer. void testRegisterizerMultiVar() { KernelScope kernel_scope; - Buffer a(BufHandle("A", {2}, kInt)); + BufHandle a("A", {2}, kInt); VarHandle x("x", kInt); Stmt* stmt = Block::make({ Store::make(a, {0}, 0, 1), @@ -207,8 +207,8 @@ void testRegisterizerMultiVar() { // Will registerize the valid accesses while skipping invalid replacements. 
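// Sketch (not from the patch): in these registerizer hunks a bare BufHandle
// replaces Buffer, and accesses stay in the explicit Store::make / Load::make
// form with a mask of 1. This snippet only builds the kind of repeated scalar
// access those tests hand to the pass; running the pass itself is left to the
// surrounding tests.
void sketchRegisterizerInput() {
  KernelScope kernel_scope;
  BufHandle a("A", {1}, kInt);
  VarHandle x("x", kInt);
  Stmt* stmt = Block::make(
      {Store::make(a, {0}, 0, 1),
       For::make(
           x,
           0,
           10,
           Block::make({Store::make(
               a, {0}, Add::make(Load::make(a, {0}, 1), x), 1)}))});
  (void)stmt;
}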
void testRegisterizerVariableLoad() { KernelScope kernel_scope; - Buffer a(BufHandle("A", {1}, kInt)); - Buffer b(BufHandle("B", {10}, kInt)); + BufHandle a("A", {1}, kInt); + BufHandle b("B", {10}, kInt); VarHandle x("x", kInt); VarHandle x2("x", kInt); Stmt* stmt = Block::make( @@ -268,7 +268,7 @@ void testRegisterizerSymbolicIndices() { KernelScope kernel_scope; VarHandle i("i", kInt); VarHandle N("N", kInt); - Buffer a(BufHandle("A", {N}, kInt)); + BufHandle a("A", {N}, kInt); VarHandle x("x", kInt); Stmt* stmt = Block::make({Store::make(a, {i}, 0, 1), @@ -317,7 +317,7 @@ void testRegisterizerSymbolicIndices() { // yet. Will have to fix soon though. void testRegisterizerEarlyStop() { KernelScope kernel_scope; - Buffer a(BufHandle("A", {1}, kInt)); + BufHandle a("A", {1}, kInt); VarHandle x("x", kInt); Stmt* stmt = Block::make( {Store::make(a, {0}, 0, 1), @@ -344,7 +344,7 @@ void testRegisterizerEarlyStop() { // Can registerize accesses dependent on multiple loop vars. void testRegisterizerMultiLoop() { KernelScope kernel_scope; - Buffer a(BufHandle("A", {1}, kInt)); + BufHandle a("A", {1}, kInt); VarHandle x("x", kInt); VarHandle y("y", kInt); Stmt* stmt = Block::make( @@ -402,7 +402,7 @@ void testRegisterizerMultiLoop() { // Can registerize correctly if scalars already exist in the program. void testRegisterizerRepeated() { KernelScope kernel_scope; - Buffer a(BufHandle("A", {2}, kInt)); + BufHandle a("A", {2}, kInt); VarHandle x("x", kInt); Stmt* stmt = Block::make({ Store::make(a, {0}, 0, 1), @@ -458,7 +458,7 @@ void testRegisterizerRepeated() { // Can registerize rthe load of A. void testRegisterizerNoLoads() { KernelScope kernel_scope; - Buffer a(BufHandle("A", {1}, kInt)); + BufHandle a("A", {1}, kInt); VarHandle x("x", kInt); Stmt* stmt = Block::make( {Store::make(a, {0}, 0, 1), @@ -499,8 +499,8 @@ void testRegisterizerNoLoads() { // Can registerize the load of A but not the store of B. void testRegisterizerNoRepeatedStores() { KernelScope kernel_scope; - Buffer a(BufHandle("A", {1}, kInt)); - Buffer b(BufHandle("B", {10}, kInt)); + BufHandle a("A", {1}, kInt); + BufHandle b("B", {10}, kInt); VarHandle x("x", kInt); Stmt* stmt = Block::make({Store::make(a, {0}, 0, 1), @@ -548,7 +548,7 @@ void testRegisterizerNoRepeatedStores() { // Won't registerize if there are multiple accesses which may overlap. 
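In the test_registerizer.cpp hunks around this point, the tests now build statements directly on a BufHandle instead of wrapping it in a Buffer. A rough sketch of that pattern (sizes assumed; the registerizer run and the FileCheck verification the real tests perform are omitted here):

  KernelScope kernel_scope;
  BufHandle a("A", {1}, kInt);
  VarHandle x("x", kInt);
  // A[0] is initialized and then repeatedly updated at a fixed index,
  // which is the access pattern the registerizer can turn into a scalar.
  Stmt* stmt = Block::make(
      {Store::make(a, {0}, 0, 1),
       For::make(
           x,
           0,
           10,
           Block::make({Store::make(
               a, {0}, Add::make(Load::make(a, {0}, 1), x), 1)}))});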
void testRegisterizerMultiVarOverlap() { KernelScope kernel_scope; - Buffer a(BufHandle("A", {2}, kInt)); + BufHandle a("A", {2}, kInt); VarHandle x("x", kInt); Stmt* stmt = Block::make({ Store::make(a, {0}, 0, 1), @@ -578,12 +578,12 @@ void testRegisterizerMultiVarOverlap() { void testRegisterizerAllocs() { KernelScope kernel_scope; - Buffer a(BufHandle("A", {2}, kInt)); - Buffer b(BufHandle("B", {1}, kInt)); - Buffer c(BufHandle("C", {1}, kInt)); + BufHandle a("A", {2}, kInt); + BufHandle b("B", {1}, kInt); + BufHandle c("C", {1}, kInt); VarHandle x("x", kInt); - VarHandle b_(b.data()->base_handle()); + VarHandle b_(b.node()->base_handle()); Stmt* stmt = Block::make( {Allocate::make(b_, kInt, {Load::make(c, {0}, 1)}), @@ -646,7 +646,7 @@ void testRegisterizerAllocs() { void testRegisterizerNoInitializer() { KernelScope kernel_scope; - Buffer a(BufHandle("A", {1}, kInt)); + BufHandle a("A", {1}, kInt); VarHandle x("x", kInt); Stmt* stmt = Block::make({For::make( x, @@ -687,8 +687,8 @@ void testRegisterizerNoInitializer() { void testRegisterizerLoadThenStore() { KernelScope kernel_scope; - Buffer a(BufHandle("A", {1}, kInt)); - Buffer b(BufHandle("B", {1}, kInt)); + BufHandle a("A", {1}, kInt); + BufHandle b("B", {1}, kInt); VarHandle x("x", kInt); Stmt* stmt = Block::make({For::make( x, @@ -737,7 +737,7 @@ void testRegisterizerLoadThenStore() { void testRegisterizerParallelized() { KernelScope kernel_scope; - Buffer a(BufHandle("A", {1}, kInt)); + BufHandle a("A", {1}, kInt); VarHandle x("x", kInt); LoopOptions loopOpts; loopOpts.set_gpu_block_index(0); @@ -765,7 +765,7 @@ void testRegisterizerParallelized() { void testRegisterizerConditions() { KernelScope kernel_scope; - Buffer a(BufHandle("A", {5}, kInt)); + BufHandle a("A", {5}, kInt); VarHandle x("x", kInt); Stmt* stmt = Block::make({For::make( x, diff --git a/test/cpp/tensorexpr/test_simplify.cpp b/test/cpp/tensorexpr/test_simplify.cpp index b88aa17efd3e..22cd89a33c30 100644 --- a/test/cpp/tensorexpr/test_simplify.cpp +++ b/test/cpp/tensorexpr/test_simplify.cpp @@ -504,9 +504,9 @@ void testHashDifferenceTypes() { void testHashLargeExpression() { KernelScope kernel_scope; constexpr int N = 1024; - Buffer a(BufHandle("A", {N}, kInt)); - Buffer b(BufHandle("B", {N}, kInt)); - Buffer c(BufHandle("C", {N}, kInt)); + BufHandle a("A", {N}, kInt); + BufHandle b("B", {N}, kInt); + BufHandle c("C", {N}, kInt); auto mask = IntImm::make(1); VarHandle i("i", kInt); auto memcpy_stmt = For::make( @@ -522,8 +522,8 @@ void testHashLargeExpression() { CompareSelectOperation::kEQ), mask)); - Buffer d(BufHandle("D", {1}, kInt)); - Buffer e(BufHandle("E", {1}, kInt)); + BufHandle d("D", {1}, kInt); + BufHandle e("E", {1}, kInt); auto store_ramp_stmt = Store::make( e, {Ramp::make(0, 1, 4)}, @@ -555,9 +555,9 @@ void testHashLargeExpression() { void testHashForLoopOptions() { KernelScope kernel_scope; constexpr int N = 1024; - Buffer a(BufHandle("A", {N}, kInt)); - Buffer b(BufHandle("B", {N}, kInt)); - Buffer c(BufHandle("C", {N}, kInt)); + BufHandle a("A", {N}, kInt); + BufHandle b("B", {N}, kInt); + BufHandle c("C", {N}, kInt); auto mask = IntImm::make(1); VarHandle i("i", kInt); auto for_stmt = For::make( @@ -2632,8 +2632,8 @@ void testSimplifyConstantCond() { { // If the condition is constant true then take the true_value. // 1 ? 
A[0] = 1 : B[0] = 1 => A[0] = 1 - Buffer a(BufHandle("A", {1}, kInt)); - Buffer b(BufHandle("B", {1}, kInt)); + BufHandle a("A", {1}, kInt); + BufHandle b("B", {1}, kInt); ExprHandle condition(1); Stmt* true_val = Store::make(a, {0}, 1, 1); Stmt* false_val = Store::make(b, {0}, 1, 1); @@ -2648,8 +2648,8 @@ void testSimplifyConstantCond() { { // If the condition is constant false then take the false_value. // 0 ? A[0] = 1 : B[0] = 1 => B[0] = 1 - Buffer a(BufHandle("A", {1}, kInt)); - Buffer b(BufHandle("B", {1}, kInt)); + BufHandle a("A", {1}, kInt); + BufHandle b("B", {1}, kInt); ExprHandle condition(0); Stmt* true_val = Store::make(a, {0}, 1, 1); Stmt* false_val = Store::make(b, {0}, 1, 1); @@ -2665,8 +2665,8 @@ void testSimplifyConstantCond() { // condition is simplified before checking. // (x-x) ? A[0] = 1 : B[0] = 1 => B[0] = 1 VarHandle x("x", kInt); - Buffer a(BufHandle("A", {1}, kInt)); - Buffer b(BufHandle("B", {1}, kInt)); + BufHandle a("A", {1}, kInt); + BufHandle b("B", {1}, kInt); ExprHandle condition(x - x); Stmt* true_val = Store::make(a, {0}, 1, 1); Stmt* false_val = Store::make(b, {0}, 1, 1); @@ -2682,7 +2682,7 @@ void testSimplifyConstantCond() { // If both branches are the same then don't do the condition. // x ? A[0] = x : A[0] = x => A[0] = x VarHandle x("x", kInt); - Buffer a(BufHandle("A", {1}, kInt)); + BufHandle a("A", {1}, kInt); ExprHandle condition(x - x); Stmt* true_val = Store::make(a, {0}, x, 1); Stmt* false_val = Store::make(a, {0}, x, 1); @@ -2698,7 +2698,7 @@ void testSimplifyConstantCond() { // If both branches simplify to the same thing it still works. // x ? (x + x) : (2 * x) => x VarHandle x("x", kInt); - Buffer a(BufHandle("A", {1}, kInt)); + BufHandle a("A", {1}, kInt); ExprHandle condition(x - x); Stmt* true_val = Store::make(a, {0}, ExprHandle(2) * x, 1); Stmt* false_val = Store::make(a, {0}, x + x, 1); @@ -2714,7 +2714,7 @@ void testSimplifyConstantCond() { // But not if they dont // x ? x : (2 * x) => x ? x : (2 * x) VarHandle x("x", kInt); - Buffer a(BufHandle("A", {1}, kInt)); + BufHandle a("A", {1}, kInt); ExprHandle condition(x); Stmt* true_val = Store::make(a, {0}, x, 1); Stmt* false_val = Store::make(a, {0}, ExprHandle(2) * x, 1); @@ -2771,8 +2771,8 @@ void testSimplifyEliminateZeroLengthFor() { { // Will eliminate zero loop For. - Buffer a(BufHandle("A", {4}, kInt)); - Buffer c(BufHandle("C", {4}, kInt)); + BufHandle a("A", {4}, kInt); + BufHandle c("C", {4}, kInt); auto mask = IntImm::make(1); VarHandle i("i", kInt); auto body = @@ -2784,8 +2784,8 @@ void testSimplifyEliminateZeroLengthFor() { { // still works if start is not zero. - Buffer a(BufHandle("A", {4}, kInt)); - Buffer c(BufHandle("C", {4}, kInt)); + BufHandle a("A", {4}, kInt); + BufHandle c("C", {4}, kInt); auto mask = IntImm::make(1); VarHandle i("i", kInt); auto body = @@ -2798,8 +2798,8 @@ void testSimplifyEliminateZeroLengthFor() { { // works if both terms are variable. VarHandle x("x", kInt); - Buffer a(BufHandle("A", {4}, kInt)); - Buffer c(BufHandle("C", {4}, kInt)); + BufHandle a("A", {4}, kInt); + BufHandle c("C", {4}, kInt); auto mask = IntImm::make(1); VarHandle i("i", kInt); auto body = @@ -2812,8 +2812,8 @@ void testSimplifyEliminateZeroLengthFor() { { // works if one term simplifies down. 
VarHandle x("x", kInt); - Buffer a(BufHandle("A", {4}, kInt)); - Buffer c(BufHandle("C", {4}, kInt)); + BufHandle a("A", {4}, kInt); + BufHandle c("C", {4}, kInt); auto mask = IntImm::make(1); VarHandle i("i", kInt); auto body = For::make( @@ -2825,8 +2825,8 @@ void testSimplifyEliminateZeroLengthFor() { { // Sanity check does nothing if the condition is not met. - Buffer a(BufHandle("A", {4}, kInt)); - Buffer c(BufHandle("C", {4}, kInt)); + BufHandle a("A", {4}, kInt); + BufHandle c("C", {4}, kInt); auto mask = IntImm::make(1); VarHandle i("i", kInt); auto body = @@ -2841,8 +2841,8 @@ void testSimplifyOneLoopFor() { { // Will remove the loop if the body is run once. - Buffer a(BufHandle("A", {4}, kInt)); - Buffer c(BufHandle("C", {4}, kInt)); + BufHandle a("A", {4}, kInt); + BufHandle c("C", {4}, kInt); auto mask = IntImm::make(1); VarHandle i("i", kInt); auto body = @@ -2856,8 +2856,8 @@ void testSimplifyOneLoopFor() { { // still works if start is not zero. - Buffer a(BufHandle("A", {4}, kInt)); - Buffer c(BufHandle("C", {4}, kInt)); + BufHandle a("A", {4}, kInt); + BufHandle c("C", {4}, kInt); auto mask = IntImm::make(1); VarHandle i("i", kInt); auto body = @@ -2872,8 +2872,8 @@ void testSimplifyOneLoopFor() { { // works if both terms are variable. VarHandle x("x", kInt); - Buffer a(BufHandle("A", {4}, kInt)); - Buffer c(BufHandle("C", {4}, kInt)); + BufHandle a("A", {4}, kInt); + BufHandle c("C", {4}, kInt); auto mask = IntImm::make(1); VarHandle i("i", kInt); auto body = For::make( @@ -2888,8 +2888,8 @@ void testSimplifyOneLoopFor() { { // works if one term simplifies down. VarHandle x("x", kInt); - Buffer a(BufHandle("A", {4}, kInt)); - Buffer c(BufHandle("C", {4}, kInt)); + BufHandle a("A", {4}, kInt); + BufHandle c("C", {4}, kInt); auto mask = IntImm::make(1); VarHandle i("i", kInt); auto body = For::make( @@ -2903,8 +2903,8 @@ void testSimplifyOneLoopFor() { { // Sanity check does nothing if the condition is not met. - Buffer a(BufHandle("A", {4}, kInt)); - Buffer c(BufHandle("C", {4}, kInt)); + BufHandle a("A", {4}, kInt); + BufHandle c("C", {4}, kInt); auto mask = IntImm::make(1); VarHandle i("i", kInt); auto body = @@ -2919,8 +2919,8 @@ void testSimplifyForWontLoseLoopOptions() { { // Sanity check does nothing if the condition is not met. - Buffer a(BufHandle("A", {4}, kInt)); - Buffer c(BufHandle("C", {4}, kInt)); + BufHandle a("A", {4}, kInt); + BufHandle c("C", {4}, kInt); auto mask = IntImm::make(1); VarHandle i("i", kInt); LoopOptions options; @@ -2939,8 +2939,8 @@ void testSimplifyMultilevelFor() { { // Multiple layers of For will be simplified out. - Buffer a(BufHandle("A", {4}, kInt)); - Buffer c(BufHandle("C", {4}, kInt)); + BufHandle a("A", {4}, kInt); + BufHandle c("C", {4}, kInt); auto mask = IntImm::make(1); VarHandle i("i", kInt); VarHandle j("j", kInt); @@ -2956,8 +2956,8 @@ void testSimplifyMultilevelFor() { { // Will maintain an outer loop if the inner loop is eliminated. - Buffer a(BufHandle("A", {4}, kInt)); - Buffer c(BufHandle("C", {4}, kInt)); + BufHandle a("A", {4}, kInt); + BufHandle c("C", {4}, kInt); auto mask = IntImm::make(1); VarHandle i("i", kInt); VarHandle j("j", kInt); @@ -2979,8 +2979,8 @@ void testSimplifyMultilevelFor() { { // Will maintain inner loop if outer loops is eliminated. 
- Buffer a(BufHandle("A", {4}, kInt)); - Buffer c(BufHandle("C", {4}, kInt)); + BufHandle a("A", {4}, kInt); + BufHandle c("C", {4}, kInt); auto mask = IntImm::make(1); VarHandle i("i", kInt); VarHandle j("j", kInt); @@ -3003,7 +3003,7 @@ void testSimplifyForCleansUp() { KernelScope kernel_scope; { - Buffer a("a", kFloat, {1, 12, 1}); + Placeholder a("a", kFloat, {1, 12, 1}); VarHandle x("x", kInt); Tensor* b = Compute( "x", @@ -3051,7 +3051,7 @@ void testSimplifyFlattenBlock() { { // Flatten multiple blocks down to one. // { { { stmt1, stmt2 } } } => { stmt1, stmt2 } - Buffer a(BufHandle("A", {1}, kInt)); + BufHandle a("A", {1}, kInt); Store* store1 = Store::make(a, {0}, 1, 1); Store* store2 = Store::make(a, {0}, 0, 1); @@ -3074,7 +3074,7 @@ void testSimplifyFlattenBlock() { { // Flatten multiple sub blocks containing statements. // { { stmt1 }, { stmt2 } } => { stmt1, stmt2 } - Buffer a(BufHandle("A", {1}, kInt)); + BufHandle a("A", {1}, kInt); Store* store1 = Store::make(a, {0}, 1, 1); Store* store2 = Store::make(a, {0}, 0, 1); @@ -3097,7 +3097,7 @@ void testSimplifyFlattenBlock() { { // Flatten sub blocks with different depths. // { stmt1 , { { stmt2 } } } => { stmt1, stmt2 } - Buffer a(BufHandle("A", {1}, kInt)); + BufHandle a("A", {1}, kInt); Store* store1 = Store::make(a, {0}, 1, 1); Store* store2 = Store::make(a, {0}, 0, 1); @@ -3240,9 +3240,9 @@ void testDontSimplifyRand() { void testSimplifyReorderForCond() { KernelScope kernel_scope; - Buffer a(BufHandle("A", {4}, kInt)); - Buffer b(BufHandle("B", {1}, kInt)); - Buffer c(BufHandle("C", {4}, kInt)); + BufHandle a("A", {4}, kInt); + BufHandle b("B", {1}, kInt); + BufHandle c("C", {4}, kInt); auto mask = IntImm::make(1); VarHandle i("i", kInt); VarHandle j("j", kInt); @@ -3440,8 +3440,8 @@ void testSimplifyReorderForCond() { void testSimplifyFuseConditions() { KernelScope kernel_scope; - Buffer a(BufHandle("A", {2}, kInt)); - Buffer b(BufHandle("B", {2}, kInt)); + BufHandle a("A", {2}, kInt); + BufHandle b("B", {2}, kInt); auto mask = IntImm::make(1); VarHandle i("i", kInt); VarHandle j("j", kInt); @@ -3858,7 +3858,7 @@ void testSimplifyFuseConditions() { void testSimplifySyncThreads() { KernelScope kernel_scope; - Buffer a(BufHandle("A", {4}, kInt)); + BufHandle a("A", {4}, kInt); auto mask = IntImm::make(1); VarHandle i("i", kInt); @@ -3950,5 +3950,45 @@ void testSimplifySyncThreads() { } } +void testSimplifyRampSubBroadcast() { + KernelScope kernel_scope; + int num_lanes = 4; + ExprHandle ramp = Ramp::make(ExprHandle(0), ExprHandle(6), num_lanes); + ExprHandle broadcast = Broadcast::make(ExprHandle(-5), num_lanes); + ExprHandle simplified = IRSimplifier::simplify(ramp - broadcast); + Ramp* newRamp = simplified.AsNode(); + IS_NODE_WITH_NAME(IntImm, newRamp->base(), base); + ASSERT_EQ(base->value(), 5); + IS_NODE_WITH_NAME(IntImm, newRamp->stride(), stride); + ASSERT_EQ(stride->value(), 6); + ASSERT_EQ(newRamp->lanes(), num_lanes); +} + +void testSimplifyBroadcastTermExpander() { + KernelScope kernel_scope; + int num_lanes = 8; + ExprHandle bc0 = Broadcast::make(ExprHandle(0), num_lanes); + ExprHandle bc1 = Broadcast::make(ExprHandle(1), num_lanes); + ExprHandle bc2 = Broadcast::make(ExprHandle(2), num_lanes); + // NB: We need a term in the middle which isn't simplified to trigger the + // relevant path in TermExpander::mutate. The two bc1 terms are brought + // together and simplified to 2 * bc1, which then needs to make 2 multi-lane. 
+ ExprHandle simplified = IRSimplifier::simplify(bc1 + (bc0 / bc2) + bc1); + BufHandle buf("buf", {num_lanes}, kInt); + // The result isn't fully simplified currently and thus would be brittle to + // match. Observe its value instead. + auto store = Store::make( + buf, + {Ramp::make(0, 1, num_lanes)}, + simplified, + Broadcast::make(ExprHandle(1), num_lanes)); + SimpleIREvaluator eval(store, buf); + std::vector output(num_lanes); + eval(output); + for (int i = 0; i < num_lanes; ++i) { + ASSERT_EQ(output[i], 2); + } +} + } // namespace jit } // namespace torch diff --git a/test/cpp/tensorexpr/test_te_fuser_pass.cpp b/test/cpp/tensorexpr/test_te_fuser_pass.cpp index 680311685375..826cf7209346 100644 --- a/test/cpp/tensorexpr/test_te_fuser_pass.cpp +++ b/test/cpp/tensorexpr/test_te_fuser_pass.cpp @@ -151,7 +151,7 @@ void testFuserPass_UnknownShapes() { %y : Tensor): %a : Tensor = aten::mul(%x, %y) %b : Tensor = aten::mul(%x, %a) - return (%a))IR"; + return (%b))IR"; auto g = std::make_shared(); torch::jit::parseIR(graph_string, g.get()); @@ -311,5 +311,24 @@ void testFuserPass_MergeGroups() { ->run(*g); } +void testFuserPass_UnknownShapesIgnored() { + WithCPUFuser cf; + KernelScope kernel_scope; + const auto graph_string = R"IR( + graph(%x : Float(device=cpu), + %y : Float(device=cpu)): + %a : Float(device=cpu) = aten::mul(%x, %y) + %b : Float(device=cpu) = aten::mul(%x, %a) + return (%b))IR"; + auto g = std::make_shared(); + torch::jit::parseIR(graph_string, g.get()); + + g->lint(); + FuseTensorExprs(g, /* min_group_size= */ 2, /* disable_shape_checks= */ true); + + // Test that we are generating fusion groups even though shapes are not known + testing::FileCheck().check("prim::TensorExprGroup")->run(*g); +} + } // namespace jit } // namespace torch diff --git a/test/cpp/tensorexpr/test_train.cpp b/test/cpp/tensorexpr/test_train.cpp index aa2426050324..755d482dc2b4 100644 --- a/test/cpp/tensorexpr/test_train.cpp +++ b/test/cpp/tensorexpr/test_train.cpp @@ -2,9 +2,7 @@ #include "test/cpp/tensorexpr/padded_buffer.h" #include "test/cpp/tensorexpr/test_base.h" #include "test/cpp/tensorexpr/test_utils.h" -#include "torch/csrc/jit/tensorexpr/buffer.h" #include "torch/csrc/jit/tensorexpr/eval.h" -#include "torch/csrc/jit/tensorexpr/function.h" #include "torch/csrc/jit/tensorexpr/ir.h" #include "torch/csrc/jit/tensorexpr/ir_printer.h" #include "torch/csrc/jit/tensorexpr/loopnest.h" @@ -56,7 +54,7 @@ void testTrainBasic() { auto C = call("mul", {A, B})[0]; Stmt* s; - std::map inputs; + std::map inputs; std::map bindings; std::map vbindings; @@ -85,7 +83,7 @@ void testTrainBasic() { auto dA = grad(D, A, ones); Stmt* s; - std::map inputs; + std::map inputs; std::map bindings; std::map vbindings; @@ -117,7 +115,7 @@ void testTrainBasic() { auto C = A + B; Stmt* s; - std::map inputs; + std::map inputs; std::map bindings; std::map vbindings; @@ -146,7 +144,7 @@ void testTrainBasic() { auto dA = D.grad(A, ones); Stmt* s; - std::map inputs; + std::map inputs; std::map bindings; std::map vbindings; @@ -181,7 +179,7 @@ void testTrainBasic() { auto dC = (C * C).grad(B, ones); Stmt* s; - std::map inputs; + std::map inputs; std::map bindings; std::map vbindings; @@ -209,7 +207,7 @@ void testTrainBasic() { auto X = T(g, {"K"}); auto Y = X.sum(); Stmt* s; - std::map inputs; + std::map inputs; std::map bindings; std::map vbindings; @@ -229,7 +227,7 @@ void testTrainBasic() { auto Y = X.sum(); auto Z = Y.broadcast_like(X); Stmt* s; - std::map inputs; + std::map inputs; std::map bindings; std::map vbindings; @@ 
-266,7 +264,7 @@ void testTrainBasic() { auto new_W = W - W_grad; Stmt* s; - std::map inputs; + std::map inputs; std::map bindings; std::map vbindings; diff --git a/test/cpp/tensorexpr/test_train.h b/test/cpp/tensorexpr/test_train.h index 39674933aa9c..16ff667860d0 100644 --- a/test/cpp/tensorexpr/test_train.h +++ b/test/cpp/tensorexpr/test_train.h @@ -37,7 +37,7 @@ VTensor* grad(VTensor* y, VTensor* x, VTensor* j); std::string dot(const VGraph& g); std::tuple< torch::jit::tensorexpr::Stmt*, - std::map, + std::map, std::map, std::map> to_tensorexpr(const VGraph& graph, std::vector outputs = {}); diff --git a/test/cpp/tensorexpr/test_train_impl.cpp b/test/cpp/tensorexpr/test_train_impl.cpp index 1636b583cef9..b9b7d33b129b 100644 --- a/test/cpp/tensorexpr/test_train_impl.cpp +++ b/test/cpp/tensorexpr/test_train_impl.cpp @@ -1,8 +1,6 @@ #include "test/cpp/tensorexpr/test_train.h" #include "test/cpp/tensorexpr/test_utils.h" -#include "torch/csrc/jit/tensorexpr/buffer.h" #include "torch/csrc/jit/tensorexpr/eval.h" -#include "torch/csrc/jit/tensorexpr/function.h" #include "torch/csrc/jit/tensorexpr/ir.h" #include "torch/csrc/jit/tensorexpr/ir_printer.h" #include "torch/csrc/jit/tensorexpr/loopnest.h" @@ -408,7 +406,7 @@ std::string dot(const VGraph& g) { std::tuple< Stmt*, - std::map, + std::map, std::map, std::map> to_tensorexpr(const VGraph& graph, std::vector outputs) { @@ -458,7 +456,7 @@ to_tensorexpr(const VGraph& graph, std::vector outputs) { return order; }; - std::map inputs; + std::map inputs; std::map bindings; std::map vbindings; @@ -481,10 +479,10 @@ to_tensorexpr(const VGraph& graph, std::vector outputs) { if (vars.size() == 0) { vars.emplace_back(IntImm::make(1)); } - Buffer inpB(BufHandle(get_name(id), exprs, kFloat)); + Placeholder inpB(BufHandle(get_name(id), exprs, kFloat)); auto inpT = Compute("input" + get_name(id), vars, [&](const VarHandle& i) { - return Load::make(inpB, {i}, 1); + return Load::make(BufHandle(inpB.data()), {i}, 1); }); inputs.emplace(&t, inpB); bindings.emplace(&t, inpT); diff --git a/test/cpp/tensorexpr/tests.h b/test/cpp/tensorexpr/tests.h index 20206a348d25..dc21373f241f 100644 --- a/test/cpp/tensorexpr/tests.h +++ b/test/cpp/tensorexpr/tests.h @@ -23,6 +23,7 @@ namespace jit { _(ExprLongTest) \ _(ExprHalfTest) \ _(ExprDoubleTest) \ + _(ExprDisallowBoolArithmetic) \ _(ExprVectorAdd01) \ _(ExprCompareSelectEQ) \ _(ExprCompareSelectDtypes) \ @@ -55,6 +56,7 @@ namespace jit { _(ExprSplitWithTail) \ _(ExprSplitWithTailNone) \ _(ExprSplitWithMask01) \ + _(ExprSplitWithMaskRepeatedNoMask) \ _(SplitWithTailWithLoopOptions) \ _(SplitWithMaskWithLoopOptions) \ _(ScheduleBroadcastAddBuffer) \ @@ -216,6 +218,8 @@ namespace jit { _(SimplifyReorderForCond) \ _(SimplifyFuseConditions) \ _(SimplifySyncThreads) \ + _(SimplifyRampSubBroadcast) \ + _(SimplifyBroadcastTermExpander) \ _(RegisterizerSimple) \ _(RegisterizerLoop) \ _(RegisterizerLoopFixedLoad) \ @@ -291,6 +295,7 @@ namespace jit { _(FuserPass_0DimInput) \ _(FuserPass_UnfusibleDevice) \ _(FuserPass_UnknownShapes) \ + _(FuserPass_UnknownShapesIgnored) \ _(FuserPass_Multidevice) \ _(FuserPass_MergeGroups) \ _(TrainBasic) @@ -440,6 +445,7 @@ namespace jit { _(CudaSigmoid) \ _(CudaHalfCast) \ _(CudaHalfSupport) \ + _(CudaHalfPropagation) \ _(CudaPrioritizeDependents) \ _(CudaMaskBlockDim) \ _(CudaMaskThreadDim) \ diff --git a/test/cpp/tensorexpr/tutorial.cpp b/test/cpp/tensorexpr/tutorial.cpp new file mode 100644 index 000000000000..f0bcfc4c2485 --- /dev/null +++ b/test/cpp/tensorexpr/tutorial.cpp @@ -0,0 +1,426 
@@ +// *** Tensor Expressions *** +// +// This tutorial covers basics of NNC's tensor expressions, shows basic APIs to +// work with them, and outlines how they are used in the overall TorchScript +// compilation pipeline. This doc is permanently a "work in progress" since NNC +// is under active development and things change fast. +// +// This tutorial's code is compiled in the standard pytorch build, and the +// executable can be found in `build/bin/tutorial_tensorexpr`. +// +// *** What is NNC *** +// +// NNC stands for Neural Net Compiler. It is a component of TorchScript JIT +// and it performs on-the-fly code generation for kernels, which are often a +// combination of multiple aten (torch) operators. +// +// When the JIT interpreter executes a torchscript model, it automatically +// extracts subgraphs from the torchscript IR graph for which specialized code +// can be JIT generated. This usually improves performance as the 'combined' +// kernel created from the subgraph could avoid unnecessary memory traffic that +// is unavoidable when the subgraph is interpreted as-is, operator by operator. +// This optimization is often referred to as 'fusion'. Relatedly, the process of +// finding and extracting subgraphs suitable for NNC code generation is done by +// a JIT pass called 'fuser'. +// +// *** What is TE *** +// +// TE stands for Tensor Expressions. TE is a commonly used approach for +// compiling kernels performing tensor (~matrix) computation. The idea behind it +// is that operators are represented as a mathematical formula describing what +// computation they do (as TEs) and then the TE engine can perform mathematical +// simplification and other optimizations using those formulas and eventually +// generate executable code that would produce the same results as the original +// sequence of operators, but more efficiently. +// +// NNC's design and implementation of TE was heavily inspired by the Halide and TVM +// projects. +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +using namespace torch::jit::tensorexpr; + +int main(int argc, char* argv[]) { + // Memory management for tensor expressions is currently done with memory + // arenas. That is, whenever an object is created it registers itself in an + // arena and the object is kept alive as long as the arena is alive. When the + // arena gets destructed, it deletes all objects registered in it. + // + // The easiest way to set up a memory arena is to use the `KernelScope` class - it + // is a resource guard that creates a new arena on construction and restores + // the previously set arena on destruction. + // + // We will create a kernel scope here, and thus we'll set up a mem arena for + // the entire tutorial. + KernelScope kernel_scope; + + std::cout << "*** Structure of tensor expressions ***" << std::endl; + { + // A tensor expression is a tree of expressions. Each expression has a type, + // and that type defines what sub-expressions the current expression has. + // For instance, an expression of type 'Mul' would have type 'kMul' and + // two subexpressions: LHS and RHS. Each of these two sub-expressions could + // also be a 'Mul' or some other expression.
+ // + // Let's construct a simple TE: + Expr* lhs = new IntImm(5); + Expr* rhs = new Var("x", kInt); + Expr* mul = new Mul(lhs, rhs); + std::cout << "Tensor expression: " << *mul << std::endl; + // Prints: Tensor expression: 5 * x + + // Here we created an expression representing a 5*x computation, where x is + // an int variable. + + // Another, probably a more convenient, way to construct tensor expressions + // is to use so-called expression handles (as opposed to raw expressions + // like we did in the previous example). Expression handles overload common + // operations and allow us to express the same semantics in a more natural + // way: + ExprHandle l = 1; + ExprHandle r = Var::make("x", kInt); + ExprHandle m = l * r; + std::cout << "Tensor expression: " << *m.node() << std::endl; + // Prints: Tensor expression: 1 * x + + // In a similar fashion we could construct arbitrarily complex expressions + // using mathematical and logical operations, casts between various data + // types, and a bunch of intrinsics. + ExprHandle a = Var::make("a", kInt); + ExprHandle b = Var::make("b", kFloat); + ExprHandle c = Var::make("c", kFloat); + ExprHandle x = ExprHandle(5) * a + b / (sigmoid(c) - 3.0f); + std::cout << "Tensor expression: " << *x.node() << std::endl; + // Prints: Tensor expression: float(5 * a) + b / ((sigmoid(c)) - 3.f) + + // The ultimate purpose of tensor expressions is to optimize tensor + // computations, and in order to represent accesses to tensor data, there + // is a special kind of expression - a load. + // To construct a load we need two pieces: the base and the indices. The + // base of a load is a Buf expression, which could be thought of as a + // placeholder similar to Var, but with dimensions info. + // + // Let's construct a simple load: + BufHandle A("A", {ExprHandle(64), ExprHandle(32)}, kInt); + ExprHandle i = Var::make("i", kInt), j = Var::make("j", kInt); + ExprHandle load = Load::make(A.dtype(), A, {i, j}, /* mask= */ 1); + std::cout << "Tensor expression: " << *load.node() << std::endl; + // Prints: Tensor expression: A[i, j] + } + + std::cout << "*** Tensors, Functions, and Placeholders ***" << std::endl; + { + // A tensor computation is represented by objects of the Tensor class and + // consists of the following pieces: + // - domain, which is specified by a Buf expression + // - an expression (or several expressions if we want to perform several + // independent computations over the same domain) for its elements, as a + // function of indices + // + // We use Function objects to represent this. Let's build one. + // + // First, we need to specify the domain, or dimensions in which the + // computation would be performed. Let's create a 64x32 domain: + std::vector<const Expr*> dims = { + new IntImm(64), new IntImm(32)}; // IntImm stands for Integer Immediate + // and represents an integer constant + + // Next we need to create Function arguments. The arguments of a Function + // are Vars, and they play the role of placeholders. The computation that the + // function describes will use these arguments. + const Var* i = new Var("i", kInt); + const Var* j = new Var("j", kInt); + std::vector<const Var*> args = {i, j}; + + // Now we can define the function computations using these arguments. Let's + // create two computations: the first would multiply the arguments of the + // function, the second would add them.
+ Expr* func_body1 = new Mul(i, j); + Expr* func_body2 = new Add(i, j); + + // Finally, we pass all these pieces together to the Function constructor: + Function* func = + new Function({"X", "Y"}, dims, args, {func_body1, func_body2}); + // Under the hood the Function constructor creates separate `Buf` + // expressions for each computation (which can be accessed via + // `func->func_var(idx)`) with the names specified by the first parameter of + // the constructor call. In our example two `Buf` variables will be created + // with names 'X' and 'Y', each of them would signify a domain of 64x32. + + // We can now print out our function: + std::cout << "Tensor function: " << *func << std::endl; + // Prints: + // Tensor function: Function F(i[64], j[32]) { + // X = i * j + // Y = i + j + // } + + // A Tensor refers to an individual computation defined by a Function. For + // instance, we could create the following tensor given the function above: + int output_idx = 0; // Used to index the computation + Tensor* X = new Tensor(func, output_idx); + std::cout << "Tensor computation: " << *X << std::endl; + // Prints: Tensor computation: Tensor X(i[64], j[32]) = i * j + + // Similarly to how we provide a more convenient way of using handles for + // constructing Exprs, Tensors also have a more convenient API for + // construction. It is based on Compute functions, which take a name, + // dimensions, and a lambda specifying the computation body: + Tensor* Z = Compute( + "Z", + {{64, "i"}, {32, "j"}}, + [](const VarHandle& i, const VarHandle& j) { return i / j; }); + std::cout << "Tensor computation: " << *Z << std::endl; + // Prints: Tensor computation: Tensor Z(i[64], j[32]) = i / j + + // Tensors might access other tensors and external placeholders in their + // expressions. It can be done like so: + Placeholder P("P", kFloat, {64, 32}); + Tensor* R = Compute( + "R", + {{64, "i"}, {32, "j"}}, + [&](const VarHandle& i, const VarHandle& j) { + return Z->call(i, j) * P.load(i, j); + }); + std::cout << "Tensor computation: " << *R << std::endl; + // Prints: Tensor computation: Tensor R(i[64], j[32]) = Z(i, j) * P[i, j] + + // Placeholders could be thought of as external tensors, i.e. tensors for + // which we don't have the element expression. In other words, for `Tensor` + // we know an expression specifying how its elements can be computed (a + // mathematical formula). For external tensors, or placeholders, we don't + // have such an expression. They need to be considered as coming to us as + // inputs from outside - we can only load data from them. + // + // Also note that we use 'call' to construct an access to an element of a + // Tensor and we use 'load' for accessing elements of an external tensor + // through its Placeholder. This is an implementation detail and could be + // changed in the future. + // + // Why do we have Functions and Tensors and what is the relationship between + // them? Functions are used to represent several computations performed over + // the same domain. Tensors refer to individual computations of a Function. + // + // Also note that currently a lot of code only supports single-output + // Functions, in which case they become almost identical to Tensors. This + // will probably be changed in the future. + + // TODO: Show how reductions are represented and constructed + } + + std::cout << "*** Loopnests and Statements ***" << std::endl; + { + // Creating a tensor expression is the first step to generate executable + // code for it.
The next step is to represent it as a loop nest and apply + // various loop transformations in order to get an optimal implementation. + // In Halide's or TVM's terms the first step was to define the algorithm of + // computation (what to compute?) and now we are getting to the schedule of + // the computation (how to compute?). + // + // Let's create a simple tensor expression and construct a loop nest for it. + Placeholder A("A", kFloat, {64, 32}); + Placeholder B("B", kFloat, {64, 32}); + Tensor* X = Compute( + "X", + {{64, "i"}, {32, "j"}}, + [&](const VarHandle& i, const VarHandle& j) { + return A.load(i, j) + B.load(i, j); + }); + Tensor* Y = Compute( + "Y", + {{64, "i"}, {32, "j"}}, + [&](const VarHandle& i, const VarHandle& j) { + return sigmoid(X->call(i, j)); + }); + std::cout << "Tensor computation X: " << *X + << "Tensor computation Y: " << *Y << std::endl; + // Prints: + // Tensor computation X: Tensor X(i[64], j[32]) = (A[i, j]) + (B[i, j]) + // Tensor computation Y: Tensor Y(i[64], j[32]) = sigmoid(X(i, j)) + + // Creating a loop nest is quite simple: we just need to specify + // the output tensors of our computation, and the LoopNest object will + // automatically pull in all tensor dependencies: + LoopNest loopnest({Y}); + + // The IR used in LoopNest is based on tensor statements, represented by + // the `Stmt` class. Statements are used to specify the loop nest structure, and + // to take a sneak peek at them, let's print out what we got right after + // creating our LoopNest object: + std::cout << *loopnest.root_stmt() << std::endl; + // Prints: + // { + // for (int i = 0; i < 64; i++) { + // for (int j = 0; j < 32; j++) { + // X[i, j] = (A[i, j]) + (B[i, j]); + // } + // } + // for (int i_1 = 0; i_1 < 64; i_1++) { + // for (int j_1 = 0; j_1 < 32; j_1++) { + // Y[i_1, j_1] = sigmoid(X(i_1, j_1)); + // } + // } + // } + + // To introduce statements let's first look at their three main types (in + // fact, there are more than 3 types, but the other types would be easy to + // understand once the overall structure is clear): + // 1) Block + // 2) For + // 3) Store + // + // A `Block` statement is simply a list of other statements. + // A `For` is a statement representing one axis of computation. It contains + // an index variable (Var), boundaries of the axis (start and end - both are + // `Expr`s), and a `Block` statement body. + // A `Store` represents an assignment to a tensor element. It contains a Buf + // representing the target tensor, a list of expressions for indices of the + // element, and the value to be stored, which is an arbitrary expression. + + // Once we've constructed the loop nest, we can apply various transformations + // to it. To begin with, let's inline the computation of X into the computation of Y + // and see what happens to our statements. + loopnest.computeInline(loopnest.getLoopBodyFor(X)); + std::cout << *loopnest.root_stmt() << std::endl; + // Prints: + // { + // for (int i = 0; i < 64; i++) { + // for (int j = 0; j < 32; j++) { + // Y[i, j] = sigmoid((A[i, j]) + (B[i, j])); + // } + // } + // } + // + // As you can see, the first two loops have disappeared and the expression + // for X[i,j] has been inserted into the Y[i,j] computation. + + // Loop transformations can be composed, so we can do something else with + // our loop nest now. Let's split the inner loop with a factor of 9, for + // instance.
+ std::vector<For*> loops = loopnest.getLoopStmtsFor(Y); + For* j_outer; + For* j_inner; + For* j_tail; + int split_factor = 9; + loopnest.splitWithTail( + loops[1], // loops[0] is the outer loop, loops[1] is inner + split_factor, + &j_outer, // These are handles that we would be using for + &j_inner, // further transformations + &j_tail); + std::cout << *loopnest.root_stmt() << std::endl; + // Prints: + // { + // for (int i = 0; i < 64; i++) { + // for (int j_outer = 0; j_outer < (32 - 0) / 9; j_outer++) { + // for (int j_inner = 0; j_inner < 9; j_inner++) { + // Y[i, j_outer * 9 + j_inner] = sigmoid((A[i, j_outer * 9 + ... + // } + // } + // for (int j_tail = 0; j_tail < (32 - 0) % 9; j_tail++) { + // Y[i, j_tail + ((32 - 0) / 9) * 9] = sigmoid((A[i, j_tail + ... + // } + // } + // } + + // TODO: List all available transformations + // TODO: Show how statements can be constructed manually + } + + std::cout << "*** Codegen ***" << std::endl; + { + // The ultimate goal of tensor expressions is to provide a mechanism to + // execute a given computation in the fastest possible way. So far we've + // looked at how we could describe what computation we're interested in, but + // we haven't looked at how to actually execute it. So far all we've been + // dealing with are just symbols with no actual data associated; in this + // section we will look at how to bridge that gap. + + // Let's start by constructing a simple computation for us to work with: + Placeholder A("A", kInt, {64, 32}); + Placeholder B("B", kInt, {64, 32}); + Tensor* X = Compute( + "X", + {{64, "i"}, {32, "j"}}, + [&](const VarHandle& i, const VarHandle& j) { + return A.load(i, j) + B.load(i, j); + }); + + // And let's lower it to a loop nest, as we did in the previous section: + LoopNest loopnest({X}); + std::cout << *loopnest.root_stmt() << std::endl; + // Prints: + // { + // for (int i = 0; i < 64; i++) { + // for (int j = 0; j < 32; j++) { + // X[i, j] = (A[i, j]) + (B[i, j]); + // } + // } + + // Now imagine that we have two actual 64x32 tensors that we want to sum + // together: how do we pass those tensors to the computation, and how do we + // carry it out? + // + // A Codegen object is aimed at providing exactly that functionality. Codegen + // is an abstract class and concrete codegens are derived from it. + // Currently, we have three codegens: + // 1) Simple Evaluator, + // 2) LLVM Codegen for CPU, + // 3) CUDA Codegen. + // In this example we will be using Simple Evaluator, since it's available + // everywhere. + + // To create a codegen, we need to provide the statement - it specifies the + // computation we want to perform - and a list of placeholders and tensors + // used in the computation. The latter part is crucial, since that's the only + // way the codegen can correlate symbols in the statement with the actual + // data arrays that we will pass in when we actually perform + // the computation. + // + // Let's create a Simple IR Evaluator codegen for our computation: + SimpleIREvaluator ir_eval(loopnest.root_stmt(), {A, B, X}); + + // We are using the simplest codegen, and with it almost no work is done at the + // construction step. Real codegens such as CUDA and LLVM perform + // compilation during that stage so that when we're about to run the + // computation everything is ready.
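+  // As a rough illustration (and assuming an LLVM-enabled build, the
+  // llvm_codegen.h header, and that LLVMCodeGen takes the same
+  // statement-plus-buffer-args constructor arguments as SimpleIREvaluator
+  // above), a "real" codegen would be created the same way; for LLVM and
+  // CUDA the compilation would happen right here, at construction time:
+#ifdef TORCH_ENABLE_LLVM
+  LLVMCodeGen llvm_codegen(loopnest.root_stmt(), {A, B, X});
+#endif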
+ + // Let's now create some inputs and run our computation with them: + std::vector<int> data_A(64 * 32, 3); // This will be the input A + std::vector<int> data_B(64 * 32, 5); // This will be the input B + std::vector<int> data_X(64 * 32, 0); // This will be used for the result + + // Now let's invoke our codegen to perform the computation on our data. We + // need to provide as many arguments as the number of placeholders and tensors we + // passed at codegen construction time. The position in these lists + // defines how the real data arrays from the latter call (these arguments are + // referred to as 'CallArg's in our codebase) correspond to the symbols + // (placeholders and tensors) used in the tensor expressions we constructed + // (these are referred to as 'BufferArg's). + // Thus, we will provide three arguments: data_A, data_B, and data_X. data_A + // contains data for the placeholder A, data_B - for the placeholder B, and + // data_X would be used for the contents of tensor X. + ir_eval(data_A, data_B, data_X); + + // Let's print one of the elements from each array to verify that the + // computation did happen: + std::cout << "A[10] = " << data_A[10] << std::endl + << "B[10] = " << data_B[10] << std::endl + << "X[10] = A[10] + B[10] = " << data_X[10] << std::endl; + // Prints: + // A[10] = 3 + // B[10] = 5 + // X[10] = A[10] + B[10] = 8 + } + + // TODO: Show how TorchScript IR is translated to TE + return 0; +} diff --git a/test/cpp_api_parity/parity-tracker.md b/test/cpp_api_parity/parity-tracker.md index b7ec61a5a958..66931b6f9316 100644 --- a/test/cpp_api_parity/parity-tracker.md +++ b/test/cpp_api_parity/parity-tracker.md @@ -88,11 +88,11 @@ torch::nn::GRU|Yes|No torch::nn::RNNCell|Yes|No torch::nn::LSTMCell|Yes|No torch::nn::GRUCell|Yes|No -torch::nn::Transformer|No|No +torch::nn::Transformer|Yes|No torch::nn::TransformerEncoder|No|No torch::nn::TransformerDecoder|No|No -torch::nn::TransformerEncoderLayer|No|No -torch::nn::TransformerDecoderLayer|No|No +torch::nn::TransformerEncoderLayer|Yes|No +torch::nn::TransformerDecoderLayer|Yes|No torch::nn::Identity|Yes|No torch::nn::Linear|Yes|No torch::nn::Bilinear|Yes|No diff --git a/test/cpp_extensions/cpp_c10d_extension.cpp b/test/cpp_extensions/cpp_c10d_extension.cpp index 188484cf9248..b4901cdbcf4d 100644 --- a/test/cpp_extensions/cpp_c10d_extension.cpp +++ b/test/cpp_extensions/cpp_c10d_extension.cpp @@ -63,7 +63,7 @@ std::shared_ptr ProcessGroupTest::allgather_base( std::shared_ptr ProcessGroupTest::barrier( const BarrierOptions& opts) { - throw std::runtime_error("ProcessGroupTest does not support barrier"); + return std::make_shared(); } std::shared_ptr ProcessGroupTest::gather( diff --git a/test/custom_backend/backend.py b/test/custom_backend/backend.py index 17e399d320a7..8b48ed0a4108 100644 --- a/test/custom_backend/backend.py +++ b/test/custom_backend/backend.py @@ -33,7 +33,7 @@ def to_custom_backend(module): Returns: The module, lowered so that it can run on TestBackend.
""" - lowered_module = torch._C._jit_to_backend("custom_backend", module._c, {"forward": {"": ""}}) + lowered_module = torch._C._jit_to_backend("custom_backend", module, {"forward": {"": ""}}) return lowered_module diff --git a/test/distributed/algorithms/ddp_comm_hooks/test_ddp_hooks.py b/test/distributed/algorithms/ddp_comm_hooks/test_ddp_hooks.py index 2b3d43814c0f..37c8f14af853 100644 --- a/test/distributed/algorithms/ddp_comm_hooks/test_ddp_hooks.py +++ b/test/distributed/algorithms/ddp_comm_hooks/test_ddp_hooks.py @@ -14,6 +14,7 @@ MultiProcessTestCase, requires_nccl, skip_if_lt_x_gpu, + skip_if_rocm, ) from torch.testing._internal.common_utils import run_tests @@ -97,6 +98,7 @@ def _run_and_get_grads(self, model): @requires_nccl() @skip_if_lt_x_gpu(2) + @skip_if_rocm def test_ddp_comm_hook_allreduce_hook(self): """ This unit test verifies the ``allreduce`` hook registered case gives same result @@ -114,6 +116,7 @@ def test_ddp_comm_hook_allreduce_hook(self): @requires_nccl() @skip_if_lt_x_gpu(2) + @skip_if_rocm def test_ddp_comm_hook_fp16compress_hook(self): """ This unit test verifies the ``fp16 compress`` hook registered case @@ -131,6 +134,7 @@ def test_ddp_comm_hook_fp16compress_hook(self): @requires_nccl() @skip_if_lt_x_gpu(2) + @skip_if_rocm def test_ddp_comm_hook_quantize_per_tensor_hook(self): """ This unit test verifies the ``quantize per tensor`` hook registered case @@ -148,6 +152,7 @@ def test_ddp_comm_hook_quantize_per_tensor_hook(self): @requires_nccl() @skip_if_lt_x_gpu(2) + @skip_if_rocm def test_ddp_comm_hook_quantize_per_channel_hook(self): """ This unit test verifies the ``quantize per channel`` hook registered case diff --git a/test/distributed/test_c10d.py b/test/distributed/test_c10d.py index cfd0930284a5..9d0c19bef7b3 100644 --- a/test/distributed/test_c10d.py +++ b/test/distributed/test_c10d.py @@ -29,7 +29,8 @@ from torch.testing._internal.common_distributed import MultiProcessTestCase, \ requires_gloo, requires_nccl, requires_nccl_version, \ skip_if_not_multigpu, skip_if_lt_x_gpu, get_timeout, skip_if_rocm, \ - simple_sparse_reduce_tests + skip_if_rocm_single_process, simple_sparse_reduce_tests, skip_if_win32, \ + create_device from torch.testing._internal.common_utils import TestCase, load_tests, run_tests, \ retry_on_connect_failures, ADDRESS_IN_USE, CONNECT_TIMEOUT, TEST_WITH_TSAN @@ -255,6 +256,7 @@ def create_tcp_store(addr): raise RuntimeError("Unable to find free port (tried %s)" % ", ".join(ports)) +@skip_if_win32() class TCPStoreTest(TestCase, StoreTestBase): def _create_store(self): store = create_tcp_store('localhost') @@ -272,7 +274,32 @@ def test_address_already_in_use(self): store1 = c10d.TCPStore(addr, port, 1, True) # noqa: F841 store2 = c10d.TCPStore(addr, port, 1, True) # noqa: F841 + def _test_numkeys_delkeys(self, fs): + # We start off with one init key in the store to coordinate workers + self.assertEqual(fs.num_keys(), 1) + fs.add("key", 1) + fs.add("key", 2) + fs.add("key", 3) + fs.set("key0", "value0") + fs.add("key3", 1) + fs.set("key1", "value1") + self.assertEqual(fs.num_keys(), 5) + fs.delete_key("key") + self.assertEqual(fs.num_keys(), 4) + with self.assertRaises(RuntimeError): + fs.get("key") + fs.delete_key("key0") + fs.delete_key("key3") + self.assertEqual(fs.num_keys(), 2) + fs.set("key4", "value2") + self.assertEqual(fs.num_keys(), 3) + self.assertEqual(b"value1", fs.get("key1")) + self.assertEqual(b"value2", fs.get("key4")) + + def test_numkeys_delkeys(self): + self._test_numkeys_delkeys(self._create_store()) 
+@skip_if_win32() class PrefixTCPStoreTest(TestCase, StoreTestBase): def setUp(self): super(PrefixTCPStoreTest, self).setUp() @@ -329,13 +356,14 @@ def test_unknown_handler(self): c10d.rendezvous('invalid://') +@skip_if_win32() class RendezvousEnvTest(TestCase): @retry_on_connect_failures + @requires_nccl() def test_common_errors(self): - # TODO remove this hack - if not hasattr(c10d, "ProcessGroupNCCL"): - raise unittest.SkipTest("C10D is not built with NCCL process group," - " skipping test") + if torch.cuda.device_count() == 0: + raise unittest.SkipTest("No GPUs available, skipping test") + vars = { "WORLD_SIZE": "1", "RANK": "0", @@ -455,7 +483,7 @@ def test_common_errors(self): def test_nominal(self): with tempfile.NamedTemporaryFile(delete=False) as file: - url = 'file://%s?world_size=%d' % (file.name, 2) + url = f'file:///{file.name.replace(os.path.sep, "/")}?world_size=2' gen0 = c10d.rendezvous(url + "&rank=0") store0, rank0, size0 = next(gen0) self.assertEqual(0, rank0) @@ -474,6 +502,7 @@ def test_nominal(self): self.assertEqual(b"value1", store0.get("key1")) +@skip_if_win32() class RendezvousTCPTest(TestCase): def create_tcp_url(self): @@ -544,9 +573,13 @@ def _test_store_timeout(self, backend, init_method, c2p): def _init_methods(self): f = tempfile.NamedTemporaryFile(delete=False) - yield "file://%s" % f.name - f.close() - yield "tcp://127.0.0.1:%d" % common.find_free_port() + if sys.platform == 'win32': + yield "file:///%s" % f.name.replace("\\", "/") + f.close() + else: + yield "file://%s" % f.name + f.close() + yield "tcp://127.0.0.1:%d" % common.find_free_port() def _test_default_store_timeout(self, backend): for init_method in self._init_methods(): @@ -571,6 +604,8 @@ def _test_default_store_timeout(self, backend): @requires_nccl() @retry_on_connect_failures def test_default_store_timeout_nccl(self): + if torch.cuda.device_count() == 0: + raise unittest.SkipTest("No GPUs available, skipping test") self._test_default_store_timeout('nccl') @requires_gloo() @@ -584,11 +619,16 @@ def test_default_store_timeout_gloo(self): class ProcessGroupGlooTest(MultiProcessTestCase): def setUp(self): super(ProcessGroupGlooTest, self).setUp() - self._fork_processes() + + # For Windows platform, Python does not support fork, change it to spawn here. 
+ if sys.platform == 'win32': + self._spawn_processes() + else: + self._fork_processes() def opts(self, threads=2): opts = c10d.ProcessGroupGloo.Options() - opts.devices = [c10d.ProcessGroupGloo.create_device(interface=LOOPBACK)] + opts.devices = [create_device(interface=LOOPBACK)] opts.timeout = 5.0 opts.threads = threads return opts @@ -598,8 +638,8 @@ def test_multi_device_constructor(self): opts = c10d.ProcessGroupGloo.Options() opts.timeout = 5.0 opts.devices = [ - c10d.ProcessGroupGloo.create_device(interface=LOOPBACK), - c10d.ProcessGroupGloo.create_device(interface=LOOPBACK), + create_device(interface=LOOPBACK), + create_device(interface=LOOPBACK), ] pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, opts) @@ -1514,6 +1554,7 @@ def test_barrier_implies_wait(self): for i, tensor in enumerate(tensors): self.assertEqual(torch.full(size, float(i * self.world_size)), tensor) + @skip_if_win32() def test_round_robin(self): num_process_groups = 2 store = c10d.FileStore(self.file_name, self.world_size) @@ -1531,6 +1572,7 @@ def test_round_robin(self): pg.broadcast(tensor, root=0).wait() self.assertEqual(torch.full([100, 100], 0.), tensor) + @skip_if_win32() def test_round_robin_create_destroy(self): store = c10d.FileStore(self.file_name, self.world_size) @@ -1553,12 +1595,30 @@ def create(num, prefix): self.assertEqual(torch.full([10, 10], float(self.world_size)), tensor) del pg +class ProcessGroupNCCLNoGPUTest(TestCase): + MAIN_PROCESS_RANK = 0 + + def setUp(self): + self.rank = self.MAIN_PROCESS_RANK + self.world_size = 1 + self.file = tempfile.NamedTemporaryFile(delete=False) + self.num_gpus = torch.cuda.device_count() + if self.num_gpus > 0: + raise unittest.SkipTest("GPUs are available, skipping test") + + def tearDown(self): + pass + + @requires_nccl() + @skip_if_rocm_single_process + def test_init_no_gpus(self): + store = c10d.FileStore(self.file.name, self.world_size) + with self.assertRaisesRegex( + RuntimeError, + "ProcessGroupNCCL is only supported with GPUs, no GPUs found!"): + c10d.ProcessGroupNCCL(store, self.rank, self.world_size) + -@requires_nccl() -@unittest.skipIf( - TEST_WITH_TSAN, - "TSAN is not fork-safe since we're forking in a multi-threaded environment", -) class ProcessGroupNCCLTest(TestCase): MAIN_PROCESS_RANK = 0 @@ -1573,6 +1633,8 @@ def setUp(self): def tearDown(self): pass + @requires_nccl() + @skip_if_rocm_single_process def test_empty_tensors(self): store = c10d.FileStore(self.file.name, self.world_size) pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) @@ -1597,6 +1659,8 @@ def test_empty_tensors(self): pg.reduce_scatter(ys, xs).wait() self.assertEqual(0, ys[0].numel()) + @requires_nccl() + @skip_if_rocm_single_process def test_broadcast_ops(self): store = c10d.FileStore(self.file.name, self.world_size) pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) @@ -1619,6 +1683,8 @@ def broadcast(xs, rootRank, rootTensor): for i in range(self.num_gpus): self.assertEqual(tensors[i], tensors[rt]) + @requires_nccl() + @skip_if_rocm_single_process def test_allreduce_ops(self): store = c10d.FileStore(self.file.name, self.world_size) pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) @@ -1680,6 +1746,8 @@ def allreduce(tensors, op): with self.assertRaisesRegex(RuntimeError, "Cannot use " + str(op) + " with NCCL"): allreduce(tensors, op) + @requires_nccl() + @skip_if_rocm_single_process def test_reduce_ops(self): store = c10d.FileStore(self.file.name, self.world_size) pg = c10d.ProcessGroupNCCL(store, self.rank, 
self.world_size) @@ -1710,6 +1778,8 @@ def reduce(xs, rootRank, rootTensor, op=None): with self.assertRaisesRegex(RuntimeError, "Cannot use " + str(op) + " with NCCL"): reduce(tensors, self.rank, rt, op) + @requires_nccl() + @skip_if_rocm_single_process def test_allgather_ops(self): store = c10d.FileStore(self.file.name, self.world_size) pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) @@ -1735,6 +1805,8 @@ def allgather(output_ts, input_ts): for s_idx, t in enumerate(device_ts): self.assertEqual(torch.tensor([s_idx]), t) + @requires_nccl() + @skip_if_rocm_single_process def test_reduce_scatter_ops(self): store = c10d.FileStore(self.file.name, self.world_size) pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) @@ -1812,6 +1884,8 @@ def reduce_scatter(outputs, input_lists, op): # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095 self.assertEqualIgnoreType(expected, output[i]) + @requires_nccl() + @skip_if_rocm_single_process def test_barrier(self): store = c10d.FileStore(self.file.name, self.world_size) pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) @@ -1936,7 +2010,7 @@ def forward(self, x): return self.p + x -class TestDdpCommHook(nn.Module): +class ModuleForDdpCommHook(nn.Module): def __init__(self): super().__init__() self.t0 = Task() @@ -1958,7 +2032,10 @@ def forward(self, x): class DistributedDataParallelTest(MultiProcessTestCase): def setUp(self): super(DistributedDataParallelTest, self).setUp() - self._fork_processes() + if sys.platform == 'win32': + self._spawn_processes() + else: + self._fork_processes() def tearDown(self): # DistributedDataParallel test doesn't seem to call FileStore destructor @@ -1973,13 +2050,15 @@ def tearDown(self): def world_size(self): return 2 - def _prepare_single_device_module(self, process_group, devices, device_ids, global_batch_size): + def _prepare_single_device_module( + self, process_group, devices, device_ids, global_batch_size, gradient_as_bucket_view=False): model = Net() ddp_model = DistributedDataParallel( copy.deepcopy(model).to(devices[0]), device_ids=device_ids, process_group=process_group, - bucket_cap_mb=0.001) + bucket_cap_mb=0.001, + gradient_as_bucket_view=gradient_as_bucket_view) model.to(devices[0]) @@ -1988,7 +2067,7 @@ def _prepare_single_device_module(self, process_group, devices, device_ids, glob return model, ddp_model, input, target - def _prepare_multi_device_module(self, process_group, devices, device_ids, global_batch_size): + def _prepare_multi_device_module(self, process_group, devices, device_ids, global_batch_size, gradient_as_bucket_view=False): self.assertTrue( len(devices) == 2 or len(devices) == 4, "unexpected devices for ddp tests {}".format(devices)) @@ -2001,14 +2080,15 @@ def _prepare_multi_device_module(self, process_group, devices, device_ids, globa copy.deepcopy(model), device_ids=device_ids, process_group=process_group, - bucket_cap_mb=0.001) + bucket_cap_mb=0.001, + gradient_as_bucket_view=gradient_as_bucket_view) input = torch.randn(global_batch_size, 2).cuda(devices[0]) target = torch.randn(global_batch_size, 4) return model, ddp_model, input, target - def _test_ddp_with_process_group(self, process_group, devices, device_ids, multi_device=False): + def _test_ddp_with_process_group(self, process_group, devices, device_ids, multi_device=False, gradient_as_bucket_view=False): """ Note: we pass down `device_ids` all the way to DistributedDataParallel as part of the test. 
Below you find tests that either use a list of @@ -2022,11 +2102,11 @@ def _test_ddp_with_process_group(self, process_group, devices, device_ids, multi if multi_device: model, ddp_model, input, target = \ self._prepare_multi_device_module( - process_group, devices, device_ids, global_batch_size) + process_group, devices, device_ids, global_batch_size, gradient_as_bucket_view) else: model, ddp_model, input, target = \ self._prepare_single_device_module( - process_group, devices, device_ids, global_batch_size) + process_group, devices, device_ids, global_batch_size, gradient_as_bucket_view) def step_model(model, input, target): model.train() @@ -2061,17 +2141,21 @@ def update_parameters(model): torch.manual_seed(1337 + iteration) input = input[torch.randperm(global_batch_size)] - def _test_gloo_backend(self, devices, device_ids, multi_device=False): + def _test_gloo_backend(self, devices, device_ids, multi_device=False, gradient_as_bucket_view=False): store = c10d.FileStore(self.file_name, self.world_size) options = c10d.ProcessGroupGloo.Options() - options.devices = [c10d.ProcessGroupGloo.create_device(interface=LOOPBACK)] + options.devices = [create_device(interface=LOOPBACK)] process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size, options) - self._test_ddp_with_process_group(process_group, devices, device_ids, multi_device) + self._test_ddp_with_process_group(process_group, devices, device_ids, multi_device, gradient_as_bucket_view) @requires_gloo() def test_gloo_backend_cpu_module(self): self._test_gloo_backend([torch.device("cpu")], []) + @requires_gloo() + def test_gloo_backend_cpu_module_grad_is_view(self): + self._test_gloo_backend([torch.device("cpu")], [], gradient_as_bucket_view=True) + @requires_gloo() @skip_if_not_multigpu def test_gloo_backend_1gpu_module_device_ids_integer_list(self): @@ -2088,6 +2172,7 @@ def test_gloo_backend_1gpu_module_device_ids_torch_device_list(self): @requires_gloo() @skip_if_lt_x_gpu(4) + @skip_if_rocm def test_gloo_backend_2gpu_module(self): int_devices = gpus_for_rank(self.world_size)[self.rank][:2] devices = [torch.device("cuda:" + str(i)) for i in int_devices] @@ -2100,10 +2185,10 @@ def test_gloo_backend_4gpu_module(self): devices = [torch.device("cuda:" + str(i)) for i in int_devices] self._test_gloo_backend(devices, [], multi_device=True) - def _test_nccl_backend(self, devices, device_ids, multi_device=False): + def _test_nccl_backend(self, devices, device_ids, multi_device=False, gradient_as_bucket_view=False): store = c10d.FileStore(self.file_name, self.world_size) process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) - self._test_ddp_with_process_group(process_group, devices, device_ids, multi_device) + self._test_ddp_with_process_group(process_group, devices, device_ids, multi_device, gradient_as_bucket_view) @requires_nccl() @skip_if_not_multigpu @@ -2123,6 +2208,7 @@ def test_nccl_backend_1gpu_module_device_ids_torch_device_list(self): @requires_nccl() @skip_if_lt_x_gpu(4) + @skip_if_rocm def test_nccl_backend_2gpu_module(self): int_devices = gpus_for_rank(self.world_size)[self.rank][:2] devices = [torch.device("cuda:" + str(i)) for i in int_devices] @@ -2130,6 +2216,7 @@ def test_nccl_backend_2gpu_module(self): @requires_nccl() @skip_if_lt_x_gpu(8) + @skip_if_rocm def test_nccl_backend_4gpu_module(self): int_devices = gpus_for_rank(self.world_size)[self.rank][:4] devices = [torch.device("cuda:" + str(i)) for i in int_devices] @@ -2137,6 +2224,7 @@ def test_nccl_backend_4gpu_module(self): 
@requires_nccl() @skip_if_lt_x_gpu(4) + @skip_if_rocm def test_ddp_multi_device_module_config(self): gpus = gpus_for_rank(self.world_size)[self.rank] @@ -2165,9 +2253,7 @@ def test_ddp_multi_device_module_config(self): ddp_model = DistributedDataParallel( model, device_ids=gpus, process_group=process_group) - @requires_nccl() - @skip_if_not_multigpu - def test_fp16(self): + def _test_fp16(self, gradient_as_bucket_view=False): store = c10d.FileStore(self.file_name, self.world_size) process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) @@ -2179,6 +2265,7 @@ def test_fp16(self): device_ids=[gpus[0]], process_group=process_group, bucket_cap_mb=0.001, + gradient_as_bucket_view=gradient_as_bucket_view ) # Input 2**15, so that the gradients will overflow with a @@ -2198,7 +2285,17 @@ def test_fp16(self): @requires_nccl() @skip_if_not_multigpu - def test_arbitrary_forward_return_value(self): + @skip_if_rocm + def test_fp16(self): + self._test_fp16() + + @requires_nccl() + @skip_if_not_multigpu + @skip_if_rocm + def test_fp16_grad_is_view(self): + self._test_fp16(gradient_as_bucket_view=True) + + def _test_arbitrary_forward_return_value(self, gradient_as_bucket_view=False): """ Note: this test can be sped up by only running it on a CPU module once DistributedDataParallel supports them. @@ -2234,6 +2331,7 @@ def forward(self, x, fn): ForwardReturnValueModule().float().to(device_id), device_ids=[device_id], process_group=process_group, + gradient_as_bucket_view=gradient_as_bucket_view, ) batch_size = 4 @@ -2289,7 +2387,16 @@ def test(box, unbox): @requires_nccl() @skip_if_not_multigpu @skip_if_rocm - def test_find_unused_parameters_kwarg(self): + def test_arbitrary_forward_return_value(self): + self._test_arbitrary_forward_return_value() + + @requires_nccl() + @skip_if_not_multigpu + @skip_if_rocm + def test_arbitrary_forward_return_value_grad_is_view(self): + self._test_arbitrary_forward_return_value(gradient_as_bucket_view=True) + + def _test_find_unused_parameters_kwarg(self, gradient_as_bucket_view=False): """ Note: this test can be sped up by only running it on a CPU module once DistributedDataParallel supports them. @@ -2319,12 +2426,13 @@ def forward(self, x): input = torch.rand([batch_size, 2], dtype=torch.float) target = torch.LongTensor([random.randrange(4) for _ in range(batch_size)]).to(device_id) - def test_find_unused_parameters(find_unused_parameters, test_default=False): + def test_find_unused_parameters(find_unused_parameters, test_default=False, gradient_as_bucket_view=False): if test_default: model = DistributedDataParallel( FindUnusedParametersModule().float().to(device_id), device_ids=[device_id], process_group=process_group, + gradient_as_bucket_view=gradient_as_bucket_view, ) else: model = DistributedDataParallel( @@ -2332,6 +2440,7 @@ def test_find_unused_parameters(find_unused_parameters, test_default=False): device_ids=[device_id], process_group=process_group, find_unused_parameters=find_unused_parameters, + gradient_as_bucket_view=gradient_as_bucket_view, ) output, fc3 = model(input) @@ -2343,7 +2452,7 @@ def test_find_unused_parameters(find_unused_parameters, test_default=False): # trigger an error when `backward` is called (because fc3 is an unused # parameter and will therefore be marked ready twice). 
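# Illustrative sketch, not from this patch: the reducer error asserted just below comes
# from invoking a submodule that DDP has already classified as unused. With
# find_unused_parameters=True, the reducer walks the autograd graph from the tensors
# returned by forward() and immediately marks unreachable parameters (fc3 here) as
# ready; calling fc3 afterwards and backpropagating through it marks them ready a
# second time, hence "Expected to mark a variable ready only once.". The shape of
# that anti-pattern, reusing the surrounding test's objects:
#
#     output, fc3 = model(input)            # fc3 escapes forward(); DDP sees it as unused
#     output = fc3(output)                  # ...yet it still ends up in the loss graph
#     criterion(output, target).backward()  # fc3's params are marked ready twice -> error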
try: - test_find_unused_parameters(True) + test_find_unused_parameters(True, gradient_as_bucket_view=gradient_as_bucket_view) except Exception as ex: self.assertTrue( str(ex).startswith("Expected to mark a variable ready only once.")) @@ -2353,19 +2462,29 @@ def test_find_unused_parameters(find_unused_parameters, test_default=False): # Then test that the default behavior can be overridden by setting # `find_unused_parameters=False`. try: - test_find_unused_parameters(False) + test_find_unused_parameters(False, gradient_as_bucket_view=gradient_as_bucket_view) except Exception as ex: self.fail("Unexpected exception: %s" % ex) # Test find_unused_parameters defaults to False try: - test_find_unused_parameters(True, test_default=True) + test_find_unused_parameters(True, test_default=True, gradient_as_bucket_view=gradient_as_bucket_view) except Exception as ex: self.fail("Unexpected exception: %s" % ex) - @requires_gloo() - @skip_if_lt_x_gpu(2) - def test_global_local_unused_params_grad(self): + @requires_nccl() + @skip_if_not_multigpu + @skip_if_rocm + def test_find_unused_parameters_kwarg(self): + self._test_find_unused_parameters_kwarg() + + @requires_nccl() + @skip_if_not_multigpu + @skip_if_rocm + def test_find_unused_parameters_kwarg_grad_is_view(self): + self._test_find_unused_parameters_kwarg(gradient_as_bucket_view=True) + + def _test_global_local_unused_params_grad(self, gradient_as_bucket_view=False): """ By simulating a multi-task training, this test is to make sure: 1) DDP does not touch the grad of globally unused parameters. @@ -2411,6 +2530,7 @@ def run_and_verify_grad(model): GlobalLocalUnusedParamModule().cpu(), process_group=process_group, find_unused_parameters=True, + gradient_as_bucket_view=gradient_as_bucket_view, ) run_and_verify_grad(cpu_model) @@ -2421,9 +2541,20 @@ def run_and_verify_grad(model): device_ids=[device_id], process_group=process_group, find_unused_parameters=True, + gradient_as_bucket_view=gradient_as_bucket_view, ) run_and_verify_grad(gpu_model) + @requires_gloo() + @skip_if_lt_x_gpu(2) + def test_global_local_unused_params_grad(self): + self._test_global_local_unused_params_grad() + + @requires_gloo() + @skip_if_lt_x_gpu(2) + def test_global_local_unused_params_grad_with_grad_is_view(self): + self._test_global_local_unused_params_grad(gradient_as_bucket_view=True) + @requires_gloo() @skip_if_lt_x_gpu(2) def test_find_unused_parameters_when_unused_parameters_empty(self): @@ -2480,9 +2611,7 @@ def run_and_verify_grad(model): ) run_and_verify_grad(gpu_model) - @requires_nccl() - @skip_if_not_multigpu - def test_multiple_outputs_multiple_backward(self): + def _test_multiple_outputs_multiple_backward(self, gradient_as_bucket_view=False): """ Note: this test can be sped up by only running it on a CPU module once DistributedDataParallel supports them. 
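# Illustrative sketch, not from this patch: the hunks in this file follow a single
# refactoring pattern -- each existing test body becomes a private _test_* helper
# that accepts gradient_as_bucket_view (the new DDP flag that makes .grad tensors
# views into the reducer's communication buckets), and two thin public tests run it
# with the flag off and on. Schematically (the test names here are placeholders):
class ExampleDDPTest(MultiProcessTestCase):
    def _test_some_behavior(self, gradient_as_bucket_view=False):
        ...  # original test body, forwarding the flag to DistributedDataParallel

    @requires_nccl()
    @skip_if_not_multigpu
    @skip_if_rocm
    def test_some_behavior(self):
        self._test_some_behavior()

    @requires_nccl()
    @skip_if_not_multigpu
    @skip_if_rocm
    def test_some_behavior_grad_is_view(self):
        self._test_some_behavior(gradient_as_bucket_view=True)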
@@ -2516,6 +2645,7 @@ def forward(self, x): MultipleOutputModule().float().to(device_id), device_ids=[device_id], process_group=process_group, + gradient_as_bucket_view=gradient_as_bucket_view, ) batch_size = 4 @@ -2532,6 +2662,19 @@ def forward(self, x): @requires_nccl() @skip_if_not_multigpu + @skip_if_rocm + def test_multiple_outputs_multiple_backward(self): + self._test_multiple_outputs_multiple_backward() + + @requires_nccl() + @skip_if_not_multigpu + @skip_if_rocm + def test_multiple_outputs_multiple_backward_grad_is_view(self): + self._test_multiple_outputs_multiple_backward(gradient_as_bucket_view=True) + + @requires_nccl() + @skip_if_not_multigpu + @skip_if_rocm def test_no_grad(self): """ Note: this test can be sped up by only running it on a CPU module @@ -2578,7 +2721,7 @@ def check_no_grads(): # No parameter should have their gradient set. check_no_grads() - def _test_accumulate_gradients_no_sync(self, num_iters=2, ddp_comm_hook=None): + def _test_accumulate_gradients_no_sync(self, num_iters=2, ddp_comm_hook=None, gradient_as_bucket_view=False): """ This is the recommended way to implement accumulate grads. If ``ddp_comm_hook`` input was specified, it will also register that hook @@ -2593,7 +2736,7 @@ def _test_accumulate_gradients_no_sync(self, num_iters=2, ddp_comm_hook=None): local_batch_size = len(devices) model, ddp_model, input, target = self._prepare_single_device_module( - process_group, devices, devices, global_batch_size + process_group, devices, devices, global_batch_size, gradient_as_bucket_view ) if ddp_comm_hook is not None: @@ -2643,6 +2786,7 @@ def step_model(model, input, target): @requires_nccl() @skip_if_not_multigpu + @skip_if_rocm def test_accumulate_gradients_no_sync(self): """ Runs _test_accumulate_gradients_no_sync using default inputs @@ -2651,6 +2795,16 @@ def test_accumulate_gradients_no_sync(self): @requires_nccl() @skip_if_not_multigpu + @skip_if_rocm + def test_accumulate_gradients_no_sync_grad_is_view(self): + """ + Runs _test_accumulate_gradients_no_sync using default inputs + """ + self._test_accumulate_gradients_no_sync(gradient_as_bucket_view=True) + + @requires_nccl() + @skip_if_not_multigpu + @skip_if_rocm def test_accumulate_gradients_no_sync_allreduce_hook(self): """ Runs multiple iterations on _test_accumulate_gradients_no_sync @@ -2670,6 +2824,7 @@ def allreduce_hook( @requires_nccl() @skip_if_not_multigpu + @skip_if_rocm def test_accumulate_gradients_no_sync_allreduce_with_then_hook(self): """ Runs multiple iterations on _test_accumulate_gradients_no_sync using allreduce @@ -2697,9 +2852,7 @@ def div(fut): num_iters=4, ddp_comm_hook=allreduce_with_then_hook ) - @requires_nccl() - @skip_if_not_multigpu - def test_accumulate_gradients_module(self): + def _test_accumulate_gradients_module(self, gradient_as_bucket_view=False): # This is NOT the recommended way to implement accumulating grads, but # we would like to make sure DDP does not mess up with the underlying # module. 
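# Illustrative sketch, not from this patch: the "recommended way to implement
# accumulate grads" mentioned in the docstring above is DistributedDataParallel's
# no_sync() context manager, which suppresses the gradient all-reduce for the
# iterations run inside it. A generic form of the pattern; ddp_model, batches,
# loss_fn, optimizer and accumulation_steps are assumed to exist:
for step, (inp, target) in enumerate(batches):
    if (step + 1) % accumulation_steps != 0:
        with ddp_model.no_sync():                   # accumulate locally, no communication
            loss_fn(ddp_model(inp), target).backward()
    else:
        loss_fn(ddp_model(inp), target).backward()  # gradients are all-reduced here
        optimizer.step()
        optimizer.zero_grad()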
@@ -2711,7 +2864,7 @@ def test_accumulate_gradients_module(self): model, ddp_model, input, target = \ self._prepare_single_device_module( - process_group, devices, devices, global_batch_size) + process_group, devices, devices, global_batch_size, gradient_as_bucket_view) def step_model(model, input, target): model.train() @@ -2751,6 +2904,18 @@ def step_model(model, input, target): torch.manual_seed(1337 + iteration) input = input[torch.randperm(global_batch_size)] + @requires_nccl() + @skip_if_not_multigpu + @skip_if_rocm + def test_accumulate_gradients_module(self): + self._test_accumulate_gradients_module() + + @requires_nccl() + @skip_if_not_multigpu + @skip_if_rocm + def test_accumulate_gradients_module_with_grad_is_view(self): + self._test_accumulate_gradients_module(gradient_as_bucket_view=True) + @requires_gloo() def test_ignored_output(self): """ @@ -2840,6 +3005,7 @@ def forward(self, x): @requires_nccl() @skip_if_not_multigpu + @skip_if_rocm def test_failure_recovery(self): store = c10d.FileStore(self.file_name, self.world_size) process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) @@ -2903,6 +3069,92 @@ def forward(self, x): loss = criterion(output, target) loss.backward() + @requires_nccl() + @skip_if_not_multigpu + def test_save_load_checkpoint(self): + dist.init_process_group( + "gloo", + init_method=f"file://{self.file_name}", + world_size=self.world_size, + rank=self.rank + ) + + class TestModel(nn.Module): + def __init__(self): + super(TestModel, self).__init__() + self.fc1 = nn.Linear(2, 10, bias=False) + self.fc2 = nn.Linear(10, 4, bias=False) + self.relu = nn.ReLU() + + def forward(self, x): + x = self.relu(self.fc1(x)) + x = self.relu(self.fc2(x)) + return F.softmax(x, dim=1) + + def train_loop(model, optimizer, iterations): + for _ in range(iterations): + optimizer.zero_grad() + output = model(input) + loss = criterion(output, target) + loss.backward() + optimizer.step() + + device_id = gpus_for_rank(self.world_size)[self.rank][0] + + model_withload = TestModel().float().to(device_id) + model_withoutload = TestModel().float().to(device_id) + + ddp_withload = DistributedDataParallel( + model_withload, + device_ids=[device_id], + ) + ddp_withoutload = DistributedDataParallel( + model_withoutload, + device_ids=[device_id], + ) + + # ensure that both models start with the same set of parameters. 
By default they are randomized on construction + for p in ddp_withload.parameters(): + with torch.no_grad(): + p.zero_() + for p in ddp_withoutload.parameters(): + with torch.no_grad(): + p.zero_() + + batch_size = 4 + criterion = nn.CrossEntropyLoss() + + optimizer_withload = torch.optim.SGD(ddp_withload.parameters(), lr=0.001) + optimizer_withoutload = torch.optim.SGD(ddp_withoutload.parameters(), lr=0.001) + + input = torch.rand([batch_size, 2], dtype=torch.float) + target = torch.LongTensor([random.randrange(4) for _ in range(batch_size)]).to(device_id) + + # run the model for 6 iterations, with a checkpoint in the middle + train_loop(ddp_withload, optimizer_withload, 3) + + # zero out parameters and reload them from the state dict + checkpoint_path = tempfile.gettempdir() + "/model.checkpoint" + if self.rank == 0: + torch.save(ddp_withload.state_dict(), checkpoint_path) + + dist.barrier() + for p in ddp_withload.parameters(): + with torch.no_grad(): + p.zero_() + map_location = {'cuda:%d' % 0: 'cuda:%d' % self.rank} + ddp_withload.load_state_dict( + torch.load(checkpoint_path, map_location=map_location)) + + train_loop(ddp_withload, optimizer_withload, 3) + + # re-run the model with the same inputs for 6 iterations with no checkpoint + train_loop(ddp_withoutload, optimizer_withoutload, 6) + + for p_withload, p_withoutload in zip(ddp_withload.parameters(), ddp_withoutload.parameters()): + self.assertEqual(p_withload, p_withoutload) + + def _run_and_verify_sparse_gradients(self, vanilla_model, ddp_model): mult = 2 batch_size = mult * self.world_size @@ -2923,8 +3175,7 @@ def _run_and_verify_sparse_gradients(self, vanilla_model, ddp_model): ddp_parameter = next(ddp_model.parameters()) self.assertEqual(vanilla_parameter.grad, ddp_parameter.grad) - @requires_gloo() - def test_sparse_gradients(self): + def _test_sparse_gradients(self, gradient_as_bucket_view=False): store = c10d.FileStore(self.file_name, self.world_size) process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size) @@ -2935,10 +3186,19 @@ def test_sparse_gradients(self): ddp_model = DistributedDataParallel( copy.deepcopy(vanilla_model), process_group=process_group, + gradient_as_bucket_view=gradient_as_bucket_view, ) self._run_and_verify_sparse_gradients(vanilla_model, ddp_model) + @requires_gloo() + def test_sparse_gradients(self): + self._test_sparse_gradients() + + @requires_gloo() + def test_sparse_gradients_grad_is_view(self): + self._test_sparse_gradients(gradient_as_bucket_view=True) + def _test_grad_layout(self, replica_devices, layer_devs, local_batch_size): store = c10d.FileStore(self.file_name, self.world_size) process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) @@ -3097,7 +3357,7 @@ def test_ddp_comm_hook_future_passing_cpu(self): # Test on CPU cpu_model = DistributedDataParallel( - TestDdpCommHook().cpu(), process_group=process_group + ModuleForDdpCommHook().cpu(), process_group=process_group ) # Register DDP Communication Hook @@ -3107,12 +3367,13 @@ def test_ddp_comm_hook_future_passing_cpu(self): # without the comm_hook, result would be 0.25 * torch.ones(2, 2). 
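# Illustrative sketch, not from this patch: the save/load checkpoint test above uses
# the standard rank-0-saves / every-rank-loads pattern. Reduced to its essentials,
# with ddp_model, rank and checkpoint_path standing in for the test's own objects:
import torch
import torch.distributed as dist

if rank == 0:
    torch.save(ddp_model.state_dict(), checkpoint_path)
dist.barrier()                               # wait until the file exists on every rank
map_location = {'cuda:0': 'cuda:%d' % rank}  # remap rank 0's GPU tensors to this rank
ddp_model.load_state_dict(torch.load(checkpoint_path, map_location=map_location))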
self._run_and_verify_hook(cpu_model, 8, 2 * torch.ones(2, 2)) - def _gpu_model_with_ddp_comm_hook(self, process_group, hook=None): + def _gpu_model_with_ddp_comm_hook(self, process_group, hook=None, gradient_as_bucket_view=False): device_id = gpus_for_rank(self.world_size)[self.rank][0] gpu_model = DistributedDataParallel( - TestDdpCommHook().to(device_id), + ModuleForDdpCommHook().to(device_id), device_ids=[device_id], process_group=process_group, + gradient_as_bucket_view=gradient_as_bucket_view, ) # Register DDP Communication Hook if defined @@ -3161,6 +3422,7 @@ def test_ddp_comm_hook_future_passing_gpu_gloo(self): @requires_nccl() @skip_if_lt_x_gpu(2) + @skip_if_rocm def test_ddp_comm_hook_future_passing_gpu_nccl(self): """ This unit test verifies whether the Future object is passed properly using nccl backend. @@ -3176,9 +3438,7 @@ def test_ddp_comm_hook_future_passing_gpu_nccl(self): # without the comm_hook, result would be 0.25 * torch.ones(2, 2). self._run_and_verify_hook(gpu_model, 8, 2 * torch.ones(2, 2)) - @requires_nccl() - @skip_if_lt_x_gpu(2) - def test_ddp_comm_hook_allreduce_hook_nccl(self): + def _test_ddp_comm_hook_allreduce_hook_nccl(self, gradient_as_bucket_view=False): """ This unit test verifies whether a DDP communication hook that just calls allreduce gives the same result result with the case of no hook registered. @@ -3193,13 +3453,26 @@ def allreduce_hook(state: object, bucket: dist._GradBucket) -> torch._C.Future: return process_group.allreduce(tensors).get_future() # Get GPU model with allreduce_hook registered. - gpu_model = self._gpu_model_with_ddp_comm_hook(process_group, allreduce_hook) + gpu_model = self._gpu_model_with_ddp_comm_hook(process_group, allreduce_hook, gradient_as_bucket_view) # check whether the grads are equal to what DDP without hook would return. 
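# Illustrative sketch, not from this patch: a DDP communication hook has the
# signature hook(state, bucket) -> Future and must return a future that resolves to
# the reduced gradients for that bucket. A minimal allreduce hook in the style of
# these tests; process_group, world_size and gpu_model stand in for the test's
# objects, and bucket.get_tensors() is assumed to be the bucket accessor used here.
import torch
import torch.distributed as dist

def allreduce_hook(state: object, bucket: dist._GradBucket) -> torch._C.Future:
    # Divide first so the all-reduced sum matches DDP's default averaging behaviour.
    tensors = [t / world_size for t in bucket.get_tensors()]
    return process_group.allreduce(tensors).get_future()

# Registered once, before training, on the DDP-wrapped model:
gpu_model._register_comm_hook(state=None, hook=allreduce_hook)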
self._run_and_verify_hook(gpu_model, 8, 0.25 * torch.ones(2, 2)) @requires_nccl() @skip_if_lt_x_gpu(2) + @skip_if_rocm + def test_ddp_comm_hook_allreduce_hook_nccl(self): + self._test_ddp_comm_hook_allreduce_hook_nccl() + + @requires_nccl() + @skip_if_lt_x_gpu(2) + @skip_if_rocm + def test_ddp_comm_hook_allreduce_hook_nccl_grad_is_view(self): + self._test_ddp_comm_hook_allreduce_hook_nccl(gradient_as_bucket_view=True) + + @requires_nccl() + @skip_if_lt_x_gpu(2) + @skip_if_rocm def test_ddp_comm_hook_allreduce_with_then_hook_nccl(self): """ This unit test verifies whether a DDP communication hook that calls allreduce and then @@ -3243,7 +3516,7 @@ def test_ddp_invalid_comm_hook_init(self): store = c10d.FileStore(self.file_name, self.world_size) process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size) - model = DistributedDataParallel(TestDdpCommHook(), process_group=process_group) + model = DistributedDataParallel(ModuleForDdpCommHook(), process_group=process_group) with self.assertRaisesRegex(TypeError, "Communication hook must be callable."): model._register_comm_hook(state=None, hook=1) @@ -3267,7 +3540,7 @@ def test_ddp_invalid_comm_hook_return_type(self): store = c10d.FileStore(self.file_name, self.world_size) process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size) - model = DistributedDataParallel(TestDdpCommHook(), process_group=process_group) + model = DistributedDataParallel(ModuleForDdpCommHook(), process_group=process_group) with self.assertRaisesRegex( ValueError, @@ -3304,7 +3577,7 @@ def test_ddp_comm_hook_register_just_once(self): store = c10d.FileStore(self.file_name, self.world_size) process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size) - model = DistributedDataParallel(TestDdpCommHook(), process_group=process_group) + model = DistributedDataParallel(ModuleForDdpCommHook(), process_group=process_group) def dummy_hook(state, bucket): fut = torch.futures.Future() @@ -3591,6 +3864,7 @@ def _run_all_reduce(self, pg): @requires_nccl() @requires_nccl_version(2400, "Need NCCL 2.4+ for error checking") @skip_if_lt_x_gpu(3) + @skip_if_rocm def test_nccl_errors_nonblocking(self): store = c10d.FileStore(self.file_name, self.world_size) process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) @@ -3642,36 +3916,42 @@ def _test_nccl_errors_blocking(self, func): @requires_nccl() @requires_nccl_version(2400, "Need NCCL 2.4+ for error checking") @skip_if_lt_x_gpu(3) + @skip_if_rocm def test_nccl_errors_blocking_clean_exit(self): self._test_nccl_errors_blocking(lambda: sys.exit(0)) @requires_nccl() @requires_nccl_version(2400, "Need NCCL 2.4+ for error checking") @skip_if_lt_x_gpu(3) + @skip_if_rocm def test_nccl_errors_blocking_nonzero_exit(self): self._test_nccl_errors_blocking(lambda: sys.exit(1)) @requires_nccl() @requires_nccl_version(2400, "Need NCCL 2.4+ for error checking") @skip_if_lt_x_gpu(3) + @skip_if_rocm def test_nccl_errors_blocking_abort(self): self._test_nccl_errors_blocking(lambda: os.abort()) @requires_nccl() @requires_nccl_version(2400, "Need NCCL 2.4+ for error checking") @skip_if_lt_x_gpu(3) + @skip_if_rocm def test_nccl_errors_blocking_sigkill(self): self._test_nccl_errors_blocking(lambda: os.kill(os.getpid(), signal.SIGKILL)) @requires_nccl() @requires_nccl_version(2400, "Need NCCL 2.4+ for error checking") @skip_if_lt_x_gpu(3) + @skip_if_rocm def test_nccl_errors_blocking_sigterm(self): self._test_nccl_errors_blocking(lambda: os.kill(os.getpid(), signal.SIGTERM)) @requires_nccl() 
@requires_nccl_version(2400, "Need NCCL 2.4+ for error checking") @skip_if_lt_x_gpu(3) + @skip_if_rocm def test_nccl_blocking_wait_with_barrier(self): os.environ["NCCL_BLOCKING_WAIT"] = "1" store = c10d.FileStore(self.file_name, self.world_size) @@ -3694,6 +3974,7 @@ def _run_invalid_nccl_blocking_wait_env(self, val): @requires_nccl() @skip_if_lt_x_gpu(3) + @skip_if_rocm def test_invalid_nccl_blocking_wait_env(self): self._run_invalid_nccl_blocking_wait_env('abc') self._run_invalid_nccl_blocking_wait_env('-1') @@ -3743,7 +4024,10 @@ def test_nccl_timeout(self): class CommTest(MultiProcessTestCase): def setUp(self): super(CommTest, self).setUp() - self._fork_processes() + if sys.platform == 'win32': + self._spawn_processes() + else: + self._fork_processes() def tearDown(self): super(CommTest, self).tearDown() @@ -3809,7 +4093,7 @@ def test_broadcast_coalesced_nccl(self): def test_broadcast_coalesced_gloo_cuda(self): store = c10d.FileStore(self.file_name, self.world_size) options = c10d.ProcessGroupGloo.Options() - options.devices = [c10d.ProcessGroupGloo.create_device(interface=LOOPBACK)] + options.devices = [create_device(interface=LOOPBACK)] process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size, options) device = torch.device("cuda:%d" % self.rank) ranks = list(range(self.world_size)) @@ -3820,7 +4104,7 @@ def test_broadcast_coalesced_gloo_cuda(self): def test_broadcast_coalesced_gloo_cpu(self): store = c10d.FileStore(self.file_name, self.world_size) options = c10d.ProcessGroupGloo.Options() - options.devices = [c10d.ProcessGroupGloo.create_device(interface=LOOPBACK)] + options.devices = [create_device(interface=LOOPBACK)] process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size, options) device = torch.device("cpu") ranks = list(range(self.world_size)) diff --git a/test/distributed/test_c10d_spawn.py b/test/distributed/test_c10d_spawn.py index d0bf00b8a08a..c84608e8f178 100644 --- a/test/distributed/test_c10d_spawn.py +++ b/test/distributed/test_c10d_spawn.py @@ -10,8 +10,10 @@ import torch.nn as nn from torch.testing._internal.common_cuda import TEST_CUDA, TEST_MULTIGPU -from torch.testing._internal.common_distributed import requires_gloo -from torch.testing._internal.common_utils import TestCase, load_tests, run_tests, skipIfRocm +from torch.testing._internal.common_distributed import requires_gloo, \ + create_device +from torch.testing._internal.common_utils import TestCase, load_tests, \ + run_tests, skipIfRocm from torch.testing._internal.common_utils import NO_MULTIPROCESSING_SPAWN, TEST_WITH_TSAN @@ -39,7 +41,7 @@ class ProcessGroupShareTensorTest(TestCase): @classmethod def opts(cls, threads=2): opts = c10d.ProcessGroupGloo.Options() - opts.devices = [c10d.ProcessGroupGloo.create_device(interface="lo")] + opts.devices = [create_device(interface='lo')] opts.timeout = 5.0 opts.threads = threads return opts diff --git a/test/distributed/test_data_parallel.py b/test/distributed/test_data_parallel.py index dee5fd702b16..99a10906462a 100644 --- a/test/distributed/test_data_parallel.py +++ b/test/distributed/test_data_parallel.py @@ -775,6 +775,36 @@ def forward(self, x): print("Caught exception during iterations at " + named_msg, flush=True) raise + @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") + def test_parameter_list_dict_replica(self): + class MyMod(torch.nn.Module): + def __init__(self, data): + super(MyMod, self).__init__() + self.data = data + + def forward(self, inp): + return inp + + p1 = torch.nn.Parameter(torch.rand(10)) + p2 
= torch.nn.Parameter(torch.rand(10)) + module = MyMod(torch.nn.ParameterList([p1, p2])).cuda() + model = dp.DataParallel(module) + input = torch.randn((8, 8), device="cuda") + + with self.assertWarnsRegex( + UserWarning, + r"nn\.ParameterList is being used with DataParallel but this"): + model(input) + + module = MyMod(torch.nn.ParameterDict({"0": p1, "1": p2})).cuda() + model = dp.DataParallel(module) + input = torch.randn((8, 8), device="cuda") + + with self.assertWarnsRegex( + UserWarning, + r"nn\.ParameterDict is being used with DataParallel but this"): + model(input) + if __name__ == '__main__': run_tests() diff --git a/test/fx/quantization.py b/test/fx/quantization.py index 968c797c9163..a2de582937aa 100644 --- a/test/fx/quantization.py +++ b/test/fx/quantization.py @@ -219,6 +219,7 @@ def observe(self, args): def load_arg(a): return map_arg(a, lambda node: env[node.name]) + output_node : Optional[Node] = None for node in self.graph.nodes: if node.op == 'placeholder': result = next(args_iter) @@ -232,6 +233,8 @@ def load_arg(a): result = getattr(self_obj, node.target)(*args, **kwargs) elif node.op == 'call_module': result = self.modules[node.target](*load_arg(node.args), **load_arg(node.kwargs)) + elif node.op == 'output': + return load_arg(node.args[0]) env[node.name] = result root_node, obj = self.matches.get(node.name, (None, None)) @@ -240,7 +243,7 @@ def load_arg(a): if node.name in self.quants: self.quants[node.name].observe(node, env) - return load_arg(self.graph.result) + raise RuntimeError('Graph had no output node!') def quantize(self): self.quantized_graph = Graph() @@ -281,7 +284,6 @@ def load_or_emit(n): else: quant_env[node.name] = r - self.quantized_graph.output(load_arg(self.graph.result, quantized=False)) return GraphModule(self.root, self.quantized_graph) def _find_matches(self, patterns): diff --git a/test/jit/test_backends.py b/test/jit/test_backends.py index e2eaa0b2a1e5..89330ddbd2d9 100644 --- a/test/jit/test_backends.py +++ b/test/jit/test_backends.py @@ -6,8 +6,14 @@ import torch import torch._C from pathlib import Path -from torch.testing._internal.common_utils import TEST_WITH_ROCM, skipIfRocm, IS_SANDCASTLE, IS_WINDOWS, IS_MACOS - +from torch.testing._internal.common_utils import ( + IS_FBCODE, + IS_MACOS, + IS_SANDCASTLE, + IS_WINDOWS, + TEST_WITH_ROCM, + skipIfRocm, +) # Make the helper files in test/ importable pytorch_test_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) sys.path.append(pytorch_test_dir) @@ -54,7 +60,7 @@ class JitBackendTestCase(JitTestCase): def setUp(self): super().setUp() - if TEST_WITH_ROCM or IS_SANDCASTLE or IS_WINDOWS or IS_MACOS: + if TEST_WITH_ROCM or IS_SANDCASTLE or IS_WINDOWS or IS_MACOS or IS_FBCODE: raise unittest.SkipTest("non-portable load_library call used in test") torch_root = Path(__file__).resolve().parent.parent.parent p = torch_root / 'build' / 'lib' / 'libjitbackend_test.so' @@ -101,7 +107,7 @@ def setUp(self): self.module = BasicModule() self.scripted_module = torch.jit.script(BasicModule()) self.lowered_module = to_test_backend_multi( - self.scripted_module._c, + self.scripted_module, {"accum": {"": ""}, "sub_accum": {"": ""}, "forward": {"": ""}}, ) @@ -161,7 +167,7 @@ def setUp(self): # Both modules in self.scripted_module are ScriptModules. 
self.scripted_module = torch.jit.script(NestedModuleTest.NestedModule(BasicModule())) lowered_module = to_test_backend_multi( - self.scripted_module._c, {"forward": {"": ""}} + self.scripted_module, {"forward": {"": ""}} ) # self.lowered_module is a ScriptModule, but its submodule is a lowered module. self.lowered_module = torch.jit.script(NestedModuleTest.NestedModule(lowered_module)) diff --git a/test/jit/test_class_type.py b/test/jit/test_class_type.py index c71be6ac1d9f..7c9e323163e6 100644 --- a/test/jit/test_class_type.py +++ b/test/jit/test_class_type.py @@ -13,7 +13,7 @@ from torch.testing._internal.jit_utils import JitTestCase import torch.testing._internal.jit_utils from torch.testing._internal.common_utils import IS_SANDCASTLE -from typing import List, Tuple, Iterable +from typing import List, Tuple, Iterable, Optional, Dict if __name__ == '__main__': raise RuntimeError("This test file is not meant to be run directly, use:\n\n" @@ -1020,6 +1020,106 @@ def foo(): y.my_list = new_list return y + def test_default_args(self): + """ + Test that methods on class types can have default arguments. + """ + @torch.jit.script + class ClassWithDefaultArgs: + def __init__( + self, + a: int = 1, + b: Optional[List[int]] = None, + c: Tuple[int, int, int] = (1, 2, 3), + d: Optional[Dict[int, int]] = None, + e: Optional[str] = None, + ): + self.int = a + self.tup = c + self.str = e + + self.list = [1, 2, 3] + if b is not None: + self.list = b + + self.dict = {1: 2, 3: 4} + if d is not None: + self.dict = d + + def add(self, b: int, scale: float = 1.0) -> float: + return self.int * scale + b + + def all_defaults() -> int: + obj: ClassWithDefaultArgs = ClassWithDefaultArgs() + return obj.int + obj.list[2] + obj.tup[1] + + def some_defaults() -> int: + obj: ClassWithDefaultArgs = ClassWithDefaultArgs(b=[5, 6, 7]) + return obj.int + obj.list[2] + obj.dict[1] + + def override_defaults() -> int: + obj: ClassWithDefaultArgs = ClassWithDefaultArgs(3, [9, 10, 11], (12, 13, 14), {3: 4}, "str") + s: int = obj.int + + for x in obj.list: + s += x + + for y in obj.tup: + s += y + + s += obj.dict[3] + + st = obj.str + if st is not None: + s += len(st) + + return s + + def method_defaults() -> float: + obj: ClassWithDefaultArgs = ClassWithDefaultArgs() + return obj.add(3) + obj.add(3, 0.25) + + self.checkScript(all_defaults, ()) + self.checkScript(some_defaults, ()) + self.checkScript(override_defaults, ()) + self.checkScript(method_defaults, ()) + + # The constructor of this class below has some arguments without default values. + class ClassWithSomeDefaultArgs: # noqa: B903 + def __init__( + self, + a: int, + b: int = 1, + ): + self.a = a + self.b = b + + def default_b() -> int: + obj: ClassWithSomeDefaultArgs = ClassWithSomeDefaultArgs(1) + return obj.a + obj.b + + def set_b() -> int: + obj: ClassWithSomeDefaultArgs = ClassWithSomeDefaultArgs(1, 4) + return obj.a + obj.b + + self.checkScript(default_b, ()) + self.checkScript(set_b, ()) + + # The constructor of this class below has mutable arguments. This should throw + # an error. + class ClassWithMutableArgs: # noqa: B903 + def __init__( + self, + a: List[int] = [1, 2, 3], # noqa: B006 + ): + self.a = a + + def should_fail(): + obj: ClassWithMutableArgs = ClassWithMutableArgs() + + with self.assertRaisesRegex(RuntimeError, "Mutable default parameters are not supported"): + torch.jit.script(should_fail) + def test_staticmethod(self): """ Test static methods on class types. 
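# Illustrative sketch, not from this patch: the test_default_args additions above
# exercise new support for default arguments on TorchScript class constructors and
# methods. A condensed, self-contained example of what now compiles (the class name
# and values here are illustrative, mirroring the tested patterns):
import torch
from typing import List, Optional

@torch.jit.script
class Accumulator(object):
    def __init__(self, start: int = 0, history: Optional[List[int]] = None):
        self.total = start
        self.history = [1, 2, 3]
        if history is not None:
            self.history = history

    def add(self, value: int, scale: float = 1.0) -> float:
        return self.total * scale + value

def use_defaults() -> float:
    acc = Accumulator()                  # every argument takes its default
    return acc.add(3) + acc.add(3, 0.25)

print(torch.jit.script(use_defaults)())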
@@ -1067,6 +1167,8 @@ def free_function(x: int) -> int: @torch.jit.script class Properties(object): + __jit_unused_properties__ = ["unsupported"] + def __init__(self, a: int): self.a = a @@ -1074,6 +1176,19 @@ def __init__(self, a: int): def attr(self) -> int: return self.a - 1 + @property + def unsupported(self) -> int: + return sum([self.a]) + + @torch.jit.unused + @property + def unsupported_2(self) -> int: + return sum([self.a]) + + @unsupported_2.setter + def unsupported_2(self, value): + self.a = sum([self.a]) + @attr.setter def attr(self, value: int): self.a = value + 3 diff --git a/test/jit/test_enum.py b/test/jit/test_enum.py index a242217a94c1..aa34c22413ad 100644 --- a/test/jit/test_enum.py +++ b/test/jit/test_enum.py @@ -267,6 +267,26 @@ def forward(self): self.assertEqual(scripted(), Color.RED.value) + def test_string_enum_as_module_attribute(self): + global Color + + class Color(Enum): + RED = "red" + GREEN = "green" + + class TestModule(torch.nn.Module): + def __init__(self, e: Color): + super(TestModule, self).__init__() + self.e = e + + def forward(self): + return (self.e.name, self.e.value) + + m = TestModule(Color.RED) + scripted = torch.jit.script(m) + + self.assertEqual(scripted(), (Color.RED.name, Color.RED.value)) + def test_enum_return(self): global Color diff --git a/test/jit/test_freezing.py b/test/jit/test_freezing.py index 2d2c404051f6..696b97059d19 100644 --- a/test/jit/test_freezing.py +++ b/test/jit/test_freezing.py @@ -237,8 +237,8 @@ def forward(self, x): def test_freeze_module_with_fork2(self): @torch.jit.script - def foo(x, y): - return x * y + def foo(x): + return x * 2 class TestModule(nn.Module): def __init__(self): @@ -247,8 +247,8 @@ def __init__(self): self.b = torch.ones(20, 20) def forward(self, x): - fut = torch.jit._fork(foo, self.a, self.b) - y_hat = foo(self.a, self.b) + fut = torch.jit._fork(foo, self.a) + y_hat = foo(self.b) y = torch.jit._wait(fut) return y_hat + y @@ -272,6 +272,50 @@ def forward(self, x): # conservatively assumes there is a mutation because attributes are # passed to fork subgraph. both 'a' and 'b' are preserved. self.assertTrue(mf.hasattr('a')) + self.assertFalse(mf.hasattr('b')) + output_f = mf.forward(input) + self.assertEqual(output_s, output_f) + + def test_freeze_module_with_fork_calling_module_method(self): + @torch.jit.script + def foo(x, y): + return x * y + + class TestModule(nn.Module): + def __init__(self): + super(TestModule, self).__init__() + self.a = torch.ones(20, 20) + self.b = torch.ones(20, 20) + + @torch.jit.export + def foo(self, x): + return x * self.a + + @torch.jit.export + def bar(self, x): + return x * self.b + + def forward(self, x): + fut = torch.jit._fork(self.foo, self.b) + y_hat = self.bar(self.a) + y = torch.jit._wait(fut) + return y_hat + y + + m = torch.jit.script(TestModule()) + m.eval() + input = torch.randn(2, 2) + output_s = m.forward(input) + mf = torch._C._freeze_module(m._c) + # Check if frozen module looks as below: + # module m { + # attributes { + # self.b = .. + # } + # ... + # TODO: Although there are no mutation, the alias analysis + # conservatively assumes there is a mutation because attributes are + # passed to fork subgraph. 'b' is preserved. 
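# Illustrative sketch, not from this patch: the freezing tests above combine
# torch._C._freeze_module with TorchScript's fork/wait API. The fork/wait pattern
# itself, outside of freezing, looks like this (these are the internal underscore
# APIs used by the tests):
import torch

@torch.jit.script
def double(x):
    return x * 2

@torch.jit.script
def forked_add(x):
    fut = torch.jit._fork(double, x)    # runs double(x) asynchronously
    y = double(x)                       # overlapping work on the calling thread
    return y + torch.jit._wait(fut)     # join the future

print(forked_add(torch.ones(2, 2)))     # tensor of 4s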
+ self.assertFalse(mf.hasattr('a')) self.assertTrue(mf.hasattr('b')) output_f = mf.forward(input) self.assertEqual(output_s, output_f) @@ -480,6 +524,77 @@ def forward(self, x): self.assertEqual(output_s, output_f) + def test_freeze_module_with_preserve_sub_module(self): + class SubModule(nn.Module): + def __init__(self): + super(SubModule, self).__init__() + self.a = torch.tensor([1.1]) + self.b = 2.2 + + def forward(self, x): + return self.a + + class TestModule(nn.Module): + def __init__(self): + super(TestModule, self).__init__() + self.sub1 = SubModule() # aliasing + self.sub2 = SubModule() + + def forward(self, x): + return self.sub2(x) + self.sub1(x) + m = TestModule() + ms = torch.jit.script(m) + ms.eval() + mf = torch._C._freeze_module(ms._c, ["sub1"]) + + # Test that 'sub1' is preserved entirely and 'sub2' is completely folded + self.assertTrue(mf.hasattr('sub1')) + self.assertTrue(mf.sub1.hasattr('a')) + self.assertTrue(mf.sub1.hasattr('b')) + self.assertFalse(mf.hasattr('sub2')) + input = torch.randn(2, 2) + output_s = ms.forward(input) + output_f = mf.forward(input) + self.assertEqual(output_s, output_f) + + def test_freeze_module_with_preserve_sub_module_and_mutation(self): + class SubModule(nn.Module): + def __init__(self): + super(SubModule, self).__init__() + self.a = torch.tensor([1.1]) + self.b = 2.2 + + def forward(self, x): + self.a[0] = 3.3 + return self.a + + class TestModule(nn.Module): + def __init__(self): + super(TestModule, self).__init__() + self.sub1 = SubModule() # aliasing + self.sub2 = SubModule() + + def forward(self, x): + return self.sub2(x) + self.sub1(x) + m = TestModule() + ms = torch.jit.script(m) + ms.eval() + mf = torch._C._freeze_module(ms._c, ["sub1"]) + + # Test that be both sub1 and sub1 are preserved and 'b' is preserved + # even if it is not used. 
To fulfill user request to preserve 'sub1' + self.assertTrue(mf.hasattr('sub1')) + self.assertTrue(mf.sub1.hasattr('a')) + self.assertTrue(mf.sub1.hasattr('b')) + self.assertTrue(mf.hasattr('sub2')) + self.assertTrue(mf.sub2.hasattr('a')) + self.assertTrue(mf.sub2.hasattr('b')) + input = torch.randn(2, 2) + output_s = ms.forward(input) + output_f = mf.forward(input) + self.assertEqual(output_s, output_f) + + def test_freeze_module_with_helperfunction(self): class SubModule(nn.Module): def __init__(self): diff --git a/test/jit/test_list_dict.py b/test/jit/test_list_dict.py index a1c378963918..19e4952cad57 100644 --- a/test/jit/test_list_dict.py +++ b/test/jit/test_list_dict.py @@ -408,6 +408,43 @@ def test_over_slice(): return a[3:10] == [3, 4] self.checkScript(test_backward_slice, ()) + def test_slice_index(self): + a = torch.tensor( + [ + [[1, 11], [2, 22]], + [[3, 33], [4, 44]], + [[5, 55], [6, 66]], + ] + ) + + def test_index_slice1(x): + x = x[:, :, [0, 1]] + return x + self.checkScript(test_index_slice1, (a,)) + + def test_index_slice2(x): + x = x[[2, 1, 0], :, :] + return x + self.checkScript(test_index_slice2, (a,)) + + def test_index_slice3(x): + x = x[[0, 1], :, [1]] + return x + self.checkScript(test_index_slice3, (a,)) + + def test_index_slice_empty_list(x): + empty_list: List[int] = [] + x = x[empty_list, :, :] + return x + self.checkScript(test_index_slice_empty_list, (a,)) + + def test_index_slice_out_of_bounds_index(x): + x = x[[4], :, :] + return x + with self.assertRaisesRegex(RuntimeError, "index 4 is out of bounds for dimension 0 with size 3"): + self.checkScript(test_index_slice_out_of_bounds_index, (a,)) + + def test_mutable_list_append(self): def test_append(): a = [0, 1] @@ -1155,6 +1192,11 @@ def annotated_fn(x: torch.Tensor) -> List: with self.assertRaisesRegex(RuntimeError, r"Attempted to use List without a contained type"): torch.jit.script(annotated_fn) + def test_list_none(self): + with self.assertRaisesRegex(RuntimeError, "Can not create ListType with None type"): + x = torch._C.ListType(None) + + class TestDict(JitTestCase): def dict(self): diff --git a/test/jit/test_module_interface.py b/test/jit/test_module_interface.py index 963c1ede8323..f06dafbc1ba2 100644 --- a/test/jit/test_module_interface.py +++ b/test/jit/test_module_interface.py @@ -595,6 +595,58 @@ def forward(self, x): with self.assertRaisesRegex(RuntimeError, "failed to freeze interface attribute 'proxy_mod'"): mf = torch._C._freeze_module(m._c, freezeInterfaces = True) + def test_freeze_module_with_interface_and_fork(self): + class SubModule(torch.nn.Module): + def __init__(self): + super(SubModule, self).__init__() + self.b = torch.tensor([1.5]) + + def forward(self, x): + self.b[0] += 3.2 + return self.b + + class OrigMod(torch.nn.Module): + def __init__(self): + super(OrigMod, self).__init__() + self.a = torch.tensor([0.5]) + + def forward(self, x): + return self.a + + @torch.jit.interface + class ModInterface(torch.nn.Module): + def forward(self, x): + # type: (Tensor) -> Tensor + pass + + class TestModule(torch.nn.Module): + proxy_mod : ModInterface + + def __init__(self): + super(TestModule, self).__init__() + self.proxy_mod = OrigMod() + self.sub = SubModule() + + def forward(self, x): + y = self.proxy_mod(x); + z= self.sub(x) + return y + z + + class MainModule(torch.nn.Module): + def __init__(self): + super(MainModule, self).__init__() + self.test= TestModule(); + + def forward(self, x): + fut = torch.jit._fork(self.test.forward, x) + y = self.test(x) + z = torch.jit._wait(fut) + 
return y + z + + m = torch.jit.script(MainModule()) + m.eval() + mf = torch._C._freeze_module(m._c, freezeInterfaces = True) + def test_module_apis_interface(self): @torch.jit.interface class ModuleInterface(nn.Module): diff --git a/test/jit/test_profiler.py b/test/jit/test_profiler.py index 50d4351a4870..55604f5ff6bf 100644 --- a/test/jit/test_profiler.py +++ b/test/jit/test_profiler.py @@ -83,6 +83,7 @@ def test_fuse(a, b): # that guards a tensorexpr group optimized_block = next(g.findNode("prim::If").blocks()) if_nodes = list(optimized_block.findAllNodes("prim::If")) + self.assertEqual(len(if_nodes), 1) FileCheck().check("Group[Subgraph").run(str(if_nodes[0])) # no broadcasts occurred, sum_to_size have been specialized out @@ -191,3 +192,24 @@ def foo(a, b): g = torch.jit.last_executed_optimized_graph() FileCheck().check("fallback_function").check_next("CallFunction").run(g) + + def test_iterative_fusion(self): + @torch.jit.script + def foo(a, b, c, d): + a = a + b + b.add_(3) + c = c + b + d + a = a + 1 + return a, c + + x = torch.ones(1, requires_grad=False) + foo(x, x, x, x) + foo(x, x, x, x) + + # when we iterate through the block, we will start + # by fusing a = a + b with a = a + 1 + # if we were to continue iteration from that fusion point, + # would miss the fusion opportunity of c = c + d + b + + g = torch.jit.last_executed_optimized_graph() + self.assertEqual(len(list(g.findAllNodes("prim::TensorExprGroup"))), 2) diff --git a/test/jit/test_remove_mutation.py b/test/jit/test_remove_mutation.py index ef408e775c33..b747fc06bcde 100644 --- a/test/jit/test_remove_mutation.py +++ b/test/jit/test_remove_mutation.py @@ -200,3 +200,44 @@ def intermediary_use(): # it is possible to remove the append here but don't currently have the logic for it FileCheck().check_not("append").run(graph) self.assertEqual(intermediary_use(), fn()) + + def test_common_pytorch_list_ops(self): + for op in ["cat", "stack", "vstack", "hstack", "dstack"]: + class OpMod(torch.nn.Module): + def __init__(self, op): + super(OpMod, self).__init__() + self.op = torch_op + + def forward(self): + x = torch.tensor([1, 2, 3, 4]) + x.add_(3) + y = [x, x] + return self.op(y) + 3 + + torch_op = getattr(torch, op) + mod = OpMod(torch_op) + mod_script = torch.jit.script(mod) + self.run_pass('remove_mutation', mod_script.forward.graph) + FileCheck().check_not("aten::add_").run(mod_script.forward.graph) + self.assertEqual(mod(), mod_script()) + + # test that the output doesnt alias the input + for inputs in [torch.rand(2, 2)], [torch.rand(2, 2) for _ in range(2)]: + result = torch_op(inputs) + sums = [ten.sum() for ten in result] + + for inp in inputs: + inp.fill_(10) + + self.assertEqual(sums, [ten.sum() for ten in result]) + + + @torch.jit.script + def test_multiple_uses(): + x = torch.tensor([1, 2, 3, 4]) + x.add_(3) + y = [x, x] + return torch.cat(y), y + + self.run_pass('remove_mutation', mod_script.forward.graph) + FileCheck().check("aten::add_").run(test_multiple_uses.graph) diff --git a/test/jit/test_torchbind.py b/test/jit/test_torchbind.py index a8bea73c984d..ee288b65551f 100644 --- a/test/jit/test_torchbind.py +++ b/test/jit/test_torchbind.py @@ -12,7 +12,7 @@ pytorch_test_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) sys.path.append(pytorch_test_dir) from torch.testing._internal.jit_utils import JitTestCase -from torch.testing._internal.common_utils import TEST_WITH_ROCM, IS_WINDOWS, IS_SANDCASTLE, IS_MACOS +from torch.testing._internal.common_utils import TEST_WITH_ROCM, IS_WINDOWS, 
IS_SANDCASTLE, IS_MACOS, IS_FBCODE from torch.testing import FileCheck if __name__ == "__main__": @@ -24,10 +24,14 @@ class TestTorchbind(JitTestCase): def setUp(self): - if TEST_WITH_ROCM or IS_SANDCASTLE or IS_WINDOWS or IS_MACOS: + if IS_SANDCASTLE or IS_WINDOWS or IS_MACOS or IS_FBCODE: raise unittest.SkipTest("non-portable load_library call used in test") - torch_root = Path(__file__).resolve().parent.parent.parent - p = torch_root / 'build' / 'lib' / 'libtorchbind_test.so' + if TEST_WITH_ROCM: + torch_root = Path(torch.__file__).resolve().parent + p = torch_root / 'lib' / 'libtorchbind_test.so' + else: + torch_root = Path(__file__).resolve().parent.parent.parent + p = torch_root / 'build' / 'lib' / 'libtorchbind_test.so' torch.ops.load_library(str(p)) def test_torchbind(self): diff --git a/test/jit/test_tracer.py b/test/jit/test_tracer.py index 22921f7d684a..24db4cfe857e 100644 --- a/test/jit/test_tracer.py +++ b/test/jit/test_tracer.py @@ -18,6 +18,7 @@ IS_SANDCASTLE, IS_WINDOWS from torch.testing._internal.jit_utils import JitTestCase, enable_cpu_fuser, \ _tmp_donotuse_dont_inline_everything, _trace, RUN_CUDA, RUN_CUDA_MULTI_GPU +from torch.testing._internal.common_cuda import with_tf32_off from typing import List, Tuple from torch import Tensor @@ -900,6 +901,9 @@ def foo(a): self.assertEqual(foo(x), x + x + x) @unittest.skipIf(not RUN_CUDA, "calls .cuda()") + # By default, on Ampere or later GPUs, nn.Linear computes float tensors at TF32 precision. + # We want float tensors to be computed at full precision in order to use the default precision + @with_tf32_off def test_traced_module_cuda(self): class Model(nn.Module): def __init__(self, num_features, num_layers): @@ -1310,6 +1314,39 @@ def check(mod): imported = self.getExportImportCopy(traced) check(imported.foo) + # Note that Bar's forward can only be traced, but not scripted + class Bar(nn.Module): + def __init__(self): + super().__init__() + + @torch.jit.export + def addTwo(self, x): + return x + 2 + + def forward(self, input): + return (lambda a: a + 1)(input) + + # When tracing Bar as a submodule, we only want to script the + # exported methods, and we want to keep the forwards still + # being traced. 
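# Illustrative sketch, not from this patch: a condensed form of the export-while-tracing
# pattern exercised by the tracer test here -- forward() is only traceable (it uses a
# lambda), while the @torch.jit.export method is compiled during tracing and should stay
# callable on the traced module. Class and method names below are illustrative.
import torch

class Inner(torch.nn.Module):
    def forward(self, x):
        return (lambda a: a + 1)(x)     # traceable, but not scriptable

class Wrapper(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.inner = Inner()

    @torch.jit.export
    def add_one(self, x):
        return x + 1                    # exported -> compiled even under tracing

    def forward(self, x):
        return self.inner(x)

traced = torch.jit.trace(Wrapper(), (torch.rand(3, 4),))
out = traced.add_one(torch.zeros(1))    # exported method survives on the traced module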
+ class WrapperExports(torch.nn.Module): + def __init__(self): + super(WrapperExports, self).__init__() + self.bar = Bar() + + @torch.jit.export + def addOne(self, x): + return x + 1 + + def forward(self, x): + return self.bar(x) + + f = WrapperExports() + + traced = torch.jit.trace(f, (torch.rand(3, 4),)) + expected_names = ['addOne'] + check(traced) + def test_trace_autograd_function(self): class TestFunc(torch.autograd.Function): @staticmethod diff --git a/test/jit/test_warn.py b/test/jit/test_warn.py new file mode 100644 index 000000000000..6a89ba4dc385 --- /dev/null +++ b/test/jit/test_warn.py @@ -0,0 +1,165 @@ +import os +import sys +import io + +import torch +import warnings +from contextlib import redirect_stderr +from torch.testing import FileCheck + +# Make the helper files in test/ importable +pytorch_test_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) +sys.path.append(pytorch_test_dir) +from torch.testing._internal.jit_utils import JitTestCase + +if __name__ == '__main__': + raise RuntimeError("This test file is not meant to be run directly, use:\n\n" + "\tpython test/test_jit.py TESTNAME\n\n" + "instead.") + + +class TestWarn(JitTestCase): + def test_warn(self): + @torch.jit.script + def fn(): + warnings.warn("I am warning you") + + f = io.StringIO() + with redirect_stderr(f): + fn() + + FileCheck() \ + .check_count( + str="UserWarning: I am warning you", + count=1, + exactly=True) \ + .run(f.getvalue()) + + def test_warn_only_once(self): + @torch.jit.script + def fn(): + for _ in range(10): + warnings.warn("I am warning you") + + f = io.StringIO() + with redirect_stderr(f): + fn() + + FileCheck() \ + .check_count( + str="UserWarning: I am warning you", + count=1, + exactly=True) \ + .run(f.getvalue()) + + def test_warn_only_once_in_loop_func(self): + def w(): + warnings.warn("I am warning you") + + @torch.jit.script + def fn(): + for _ in range(10): + w() + + f = io.StringIO() + with redirect_stderr(f): + fn() + + FileCheck() \ + .check_count( + str="UserWarning: I am warning you", + count=1, + exactly=True) \ + .run(f.getvalue()) + + def test_warn_once_per_func(self): + def w1(): + warnings.warn("I am warning you") + + def w2(): + warnings.warn("I am warning you") + + @torch.jit.script + def fn(): + w1() + w2() + + f = io.StringIO() + with redirect_stderr(f): + fn() + + FileCheck() \ + .check_count( + str="UserWarning: I am warning you", + count=2, + exactly=True) \ + .run(f.getvalue()) + + def test_warn_once_per_func_in_loop(self): + def w1(): + warnings.warn("I am warning you") + + def w2(): + warnings.warn("I am warning you") + + @torch.jit.script + def fn(): + for _ in range(10): + w1() + w2() + + f = io.StringIO() + with redirect_stderr(f): + fn() + + FileCheck() \ + .check_count( + str="UserWarning: I am warning you", + count=2, + exactly=True) \ + .run(f.getvalue()) + + def test_warn_multiple_calls_multiple_warnings(self): + @torch.jit.script + def fn(): + warnings.warn("I am warning you") + + f = io.StringIO() + with redirect_stderr(f): + fn() + fn() + + FileCheck() \ + .check_count( + str="UserWarning: I am warning you", + count=2, + exactly=True) \ + .run(f.getvalue()) + + def test_warn_multiple_calls_same_func_diff_stack(self): + def warn(caller: str): + warnings.warn("I am warning you from " + caller) + + @torch.jit.script + def foo(): + warn("foo") + + @torch.jit.script + def bar(): + warn("bar") + + f = io.StringIO() + with redirect_stderr(f): + foo() + bar() + + FileCheck() \ + .check_count( + str="UserWarning: I am warning you from foo", + 
count=1, + exactly=True) \ + .check_count( + str="UserWarning: I am warning you from bar", + count=1, + exactly=True) \ + .run(f.getvalue()) diff --git a/test/jit/test_with.py b/test/jit/test_with.py index 15e1362ea722..ffd0631639f6 100644 --- a/test/jit/test_with.py +++ b/test/jit/test_with.py @@ -359,6 +359,7 @@ def test_with_exceptions(self): Check that exceptions thrown in the bodies of with-statements are handled correctly. """ + global Context @torch.jit.script class Context(object): @@ -379,10 +380,12 @@ def __enter__(self): def __exit__(self, type: Any, value: Any, tb: Any): self.count.sub_(0.3) + @torch.jit.script def method_that_raises(): # type: () -> Tensor - raise Exception() + raise Exception("raised exception") + @torch.jit.script def test_exception(x, c): # type: (Tensor, Context) -> Tensor """ @@ -393,6 +396,7 @@ def test_exception(x, c): return x + @torch.jit.script def test_exception_nested(x, c): # type: (Tensor, Context) -> Tensor """ @@ -404,6 +408,7 @@ def test_exception_nested(x, c): return x + @torch.jit.script def with_that_raises(c): # type: (Context) -> Tensor a = torch.tensor([1]) @@ -413,6 +418,7 @@ def with_that_raises(c): return a + @torch.jit.script def test_exception_fn_call(x, c): # type: (Tensor, Context) -> Tensor """ @@ -426,15 +432,18 @@ def test_exception_fn_call(x, c): c = Context(1) - with self.assertRaises(Exception): + # checkScript and checkScriptRaisesRegex cannot be used because the string frontend will + # not compile class types (of which Context, the context manager being used for this test + # is one). + with self.assertRaisesRegex(Exception, r"raised exception"): test_exception(torch.randn(2), c) self.assertEqual(c.count, 1) - with self.assertRaises(Exception): + with self.assertRaisesRegex(Exception, r"raised exception"): test_exception_nested(torch.randn(2), c) self.assertEqual(c.count, 1) - with self.assertRaises(Exception): + with self.assertRaisesRegex(Exception, r"raised exception"): test_exception_fn_call(torch.randn(2), c) self.assertEqual(c.count, 1) diff --git a/test/module_a.py b/test/module_a.py new file mode 100644 index 000000000000..685af9bc1569 --- /dev/null +++ b/test/module_a.py @@ -0,0 +1 @@ +result = 'module_a' diff --git a/torch/csrc/jit/tensorexpr/buffer.cpp b/test/namespace_b/subpackage.py similarity index 100% rename from torch/csrc/jit/tensorexpr/buffer.cpp rename to test/namespace_b/subpackage.py diff --git a/test/onnx/expect/TestOperators.test_softmaxcrossentropy.expect b/test/onnx/expect/TestOperators.test_softmaxcrossentropy.expect index cde473fcdb4d..1479846789d4 100644 --- a/test/onnx/expect/TestOperators.test_softmaxcrossentropy.expect +++ b/test/onnx/expect/TestOperators.test_softmaxcrossentropy.expect @@ -8,6 +8,11 @@ graph { output: "2" name: "SoftmaxCrossEntropyLoss_0" op_type: "SoftmaxCrossEntropyLoss" + attribute { + name: "ignore_index" + i: -100 + type: INT + } attribute { name: "reduction" s: "mean" diff --git a/test/onnx/expect/TestOperators.test_softmaxcrossentropy_3d.expect b/test/onnx/expect/TestOperators.test_softmaxcrossentropy_3d.expect index 58d8c805163d..f5cfba35b032 100644 --- a/test/onnx/expect/TestOperators.test_softmaxcrossentropy_3d.expect +++ b/test/onnx/expect/TestOperators.test_softmaxcrossentropy_3d.expect @@ -8,6 +8,11 @@ graph { output: "2" name: "SoftmaxCrossEntropyLoss_0" op_type: "SoftmaxCrossEntropyLoss" + attribute { + name: "ignore_index" + i: -100 + type: INT + } attribute { name: "reduction" s: "mean" diff --git 
a/test/onnx/expect/TestOperators.test_softmaxcrossentropy_3d_none.expect b/test/onnx/expect/TestOperators.test_softmaxcrossentropy_3d_none.expect index 10d47a6ed84d..8b0ec04b24c8 100644 --- a/test/onnx/expect/TestOperators.test_softmaxcrossentropy_3d_none.expect +++ b/test/onnx/expect/TestOperators.test_softmaxcrossentropy_3d_none.expect @@ -8,6 +8,11 @@ graph { output: "2" name: "SoftmaxCrossEntropyLoss_0" op_type: "SoftmaxCrossEntropyLoss" + attribute { + name: "ignore_index" + i: -100 + type: INT + } attribute { name: "reduction" s: "none" diff --git a/test/onnx/expect/TestOperators.test_softmaxcrossentropy_4d.expect b/test/onnx/expect/TestOperators.test_softmaxcrossentropy_4d.expect index 6ccab9f7b50f..8d3539ca1c64 100644 --- a/test/onnx/expect/TestOperators.test_softmaxcrossentropy_4d.expect +++ b/test/onnx/expect/TestOperators.test_softmaxcrossentropy_4d.expect @@ -8,6 +8,11 @@ graph { output: "2" name: "SoftmaxCrossEntropyLoss_0" op_type: "SoftmaxCrossEntropyLoss" + attribute { + name: "ignore_index" + i: -100 + type: INT + } attribute { name: "reduction" s: "mean" diff --git a/test/onnx/expect/TestOperators.test_softmaxcrossentropy_weights.expect b/test/onnx/expect/TestOperators.test_softmaxcrossentropy_weights.expect index 1ea4adac8cab..bf1667b58812 100644 --- a/test/onnx/expect/TestOperators.test_softmaxcrossentropy_weights.expect +++ b/test/onnx/expect/TestOperators.test_softmaxcrossentropy_weights.expect @@ -9,6 +9,11 @@ graph { output: "3" name: "SoftmaxCrossEntropyLoss_0" op_type: "SoftmaxCrossEntropyLoss" + attribute { + name: "ignore_index" + i: -100 + type: INT + } attribute { name: "reduction" s: "mean" diff --git a/test/onnx/expect/TestOperators.test_view.expect b/test/onnx/expect/TestOperators.test_view.expect index 75202b5d0da2..abd2276e7716 100644 --- a/test/onnx/expect/TestOperators.test_view.expect +++ b/test/onnx/expect/TestOperators.test_view.expect @@ -3,16 +3,26 @@ producer_name: "pytorch" producer_version: "CURRENT_VERSION" graph { node { - input: "0" output: "1" - name: "Flatten_0" - op_type: "Flatten" + name: "Constant_0" + op_type: "Constant" attribute { - name: "axis" - i: 1 - type: INT + name: "value" + t { + dims: 2 + data_type: 7 + raw_data: "\001\000\000\000\000\000\000\000\001\000\000\000\000\000\000\000" + } + type: TENSOR } } + node { + input: "0" + input: "1" + output: "2" + name: "Reshape_1" + op_type: "Reshape" + } name: "torch-jit-export" input { name: "0" @@ -28,7 +38,7 @@ graph { } } output { - name: "1" + name: "2" type { tensor_type { elem_type: 1 diff --git a/test/onnx/expect/TestOperators.test_view_flatten.expect b/test/onnx/expect/TestOperators.test_view_flatten.expect index 07667797e2cf..5ae9c0576c7a 100644 --- a/test/onnx/expect/TestOperators.test_view_flatten.expect +++ b/test/onnx/expect/TestOperators.test_view_flatten.expect @@ -65,60 +65,40 @@ graph { } } node { - input: "6" output: "7" - name: "Cast_6" - op_type: "Cast" - attribute { - name: "to" - i: 11 - type: INT - } - } - node { - output: "8" - name: "Constant_7" + name: "Constant_6" op_type: "Constant" attribute { name: "value" t { - data_type: 11 - raw_data: "\000\000\000\000\000\000\360?" 
+ data_type: 7 + raw_data: "\030\000\000\000\000\000\000\000" } type: TENSOR } } node { - input: "8" input: "7" - output: "9" - name: "Div_8" + input: "6" + output: "8" + name: "Div_7" op_type: "Div" } node { - output: "10" - name: "Constant_9" - op_type: "Constant" + input: "8" + output: "9" + name: "Cast_8" + op_type: "Cast" attribute { - name: "value" - t { - data_type: 11 - raw_data: "\000\000\000\000\000\0008@" - } - type: TENSOR + name: "to" + i: 7 + type: INT } } node { input: "9" - input: "10" - output: "11" - name: "Mul_10" - op_type: "Mul" - } - node { - input: "11" - output: "12" - name: "Cast_11" + output: "10" + name: "Cast_9" op_type: "Cast" attribute { name: "to" @@ -128,8 +108,8 @@ graph { } node { input: "3" - output: "13" - name: "Unsqueeze_12" + output: "11" + name: "Unsqueeze_10" op_type: "Unsqueeze" attribute { name: "axes" @@ -138,9 +118,9 @@ graph { } } node { - input: "12" - output: "14" - name: "Unsqueeze_13" + input: "10" + output: "12" + name: "Unsqueeze_11" op_type: "Unsqueeze" attribute { name: "axes" @@ -149,10 +129,10 @@ graph { } } node { - input: "13" - input: "14" - output: "15" - name: "Concat_14" + input: "11" + input: "12" + output: "13" + name: "Concat_12" op_type: "Concat" attribute { name: "axis" @@ -162,9 +142,9 @@ graph { } node { input: "0" - input: "15" - output: "16" - name: "Reshape_15" + input: "13" + output: "14" + name: "Reshape_13" op_type: "Reshape" } name: "torch-jit-export" @@ -191,7 +171,7 @@ graph { } } output { - name: "16" + name: "14" type { tensor_type { elem_type: 1 diff --git a/test/onnx/test_models.py b/test/onnx/test_models.py index 6f37fa6d7e72..f91f6bea165b 100644 --- a/test/onnx/test_models.py +++ b/test/onnx/test_models.py @@ -49,7 +49,6 @@ class TestModels(TestCase): opset_version = _export_onnx_opset_version def exportTest(self, model, inputs, rtol=1e-2, atol=1e-7): - self.is_script_test_enabled = True with torch.onnx.select_model_mode_for_export(model, None): graph = torch.onnx.utils._trace(model, inputs, OperatorExportTypes.ONNX) torch._C._jit_pass_lint(graph) @@ -94,14 +93,12 @@ def test_srresnet(self): self.exportTest(toC(SRResNet(rescale_factor=4, n_filters=64, n_blocks=8)), toC(x)) @skipIfNoLapack - @disableScriptTest() def test_super_resolution(self): x = Variable( torch.randn(BATCH_SIZE, 1, 224, 224).fill_(1.0) ) self.exportTest(toC(SuperResolutionNet(upscale_factor=3)), toC(x), atol=1e-6) - @disableScriptTest() def test_alexnet(self): x = Variable( torch.randn(BATCH_SIZE, 3, 224, 224).fill_(1.0) @@ -137,13 +134,12 @@ def test_vgg19_bn(self): x = Variable(torch.randn(BATCH_SIZE, 3, 224, 224).fill_(1.0)) self.exportTest(toC(vgg19_bn()), toC(x)) - @disableScriptTest() def test_resnet(self): # ResNet50 model x = Variable(torch.randn(BATCH_SIZE, 3, 224, 224).fill_(1.0)) self.exportTest(toC(resnet50()), toC(x), atol=1e-6) - @disableScriptTest() + @disableScriptTest() # None type in outputs def test_inception(self): x = Variable( torch.randn(BATCH_SIZE, 3, 299, 299) + 1.) 
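# Illustrative sketch, not from this patch: the test_models changes above re-enable
# scripting for several torchvision models before export. Outside the test harness,
# the bare mechanics of exporting a ScriptModule to ONNX look roughly like this
# (example_outputs is how this exporter version learns output shapes for script
# modules; the model here is a toy stand-in):
import io
import torch

class TinyModel(torch.nn.Module):
    def forward(self, x):
        return torch.relu(x) + 1

scripted = torch.jit.script(TinyModel())
x = torch.randn(1, 3)
buf = io.BytesIO()
torch.onnx.export(scripted, (x,), buf,
                  example_outputs=scripted(x), opset_version=12)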
@@ -208,22 +204,20 @@ def test_qat_resnet(self): self.exportTest(toC(qat_resnet50), toC(x)) - @disableScriptTest() + @disableScriptTest() # None type in outputs def test_googlenet(self): x = Variable(torch.randn(BATCH_SIZE, 3, 224, 224).fill_(1.0)) self.exportTest(toC(googlenet()), toC(x), rtol=1e-3, atol=1e-5) - @disableScriptTest() def test_mnasnet(self): x = Variable(torch.randn(BATCH_SIZE, 3, 224, 224).fill_(1.0)) self.exportTest(toC(mnasnet1_0()), toC(x), rtol=1e-3, atol=1e-5) - @disableScriptTest() def test_mobilenet(self): x = Variable(torch.randn(BATCH_SIZE, 3, 224, 224).fill_(1.0)) self.exportTest(toC(mobilenet_v2()), toC(x), rtol=1e-3, atol=1e-5) - @disableScriptTest() + @disableScriptTest() # prim_data def test_shufflenet(self): x = Variable(torch.randn(BATCH_SIZE, 3, 224, 224).fill_(1.0)) self.exportTest(toC(shufflenet_v2_x1_0()), toC(x), rtol=1e-3, atol=1e-5) @@ -238,20 +232,18 @@ def test_deeplab(self): x = Variable(torch.randn(BATCH_SIZE, 3, 224, 224).fill_(1.0)) self.exportTest(toC(deeplabv3_resnet101()), toC(x), rtol=1e-3, atol=1e-5) - @disableScriptTest() def test_r3d_18_video(self): x = Variable(torch.randn(1, 3, 4, 112, 112).fill_(1.0)) self.exportTest(toC(r3d_18()), toC(x), rtol=1e-3, atol=1e-5) - @disableScriptTest() def test_mc3_18_video(self): x = Variable(torch.randn(1, 3, 4, 112, 112).fill_(1.0)) self.exportTest(toC(mc3_18()), toC(x), rtol=1e-3, atol=1e-5) - @disableScriptTest() def test_r2plus1d_18_video(self): x = Variable(torch.randn(1, 3, 4, 112, 112).fill_(1.0)) self.exportTest(toC(r2plus1d_18()), toC(x), rtol=1e-3, atol=1e-5) + if __name__ == '__main__': run_tests() diff --git a/test/onnx/test_models_onnxruntime.py b/test/onnx/test_models_onnxruntime.py index 657a1479723d..c916b60844d1 100644 --- a/test/onnx/test_models_onnxruntime.py +++ b/test/onnx/test_models_onnxruntime.py @@ -15,13 +15,31 @@ def exportTest(self, model, inputs, rtol=1e-2, atol=1e-7, opset_versions=None): input=inputs, rtol=rtol, atol=atol) if self.is_script_test_enabled and opset_version > 11: + TestModels.use_new_jit_passes = True + TestModels.onnx_shape_inference = True + outputs = model(inputs) script_model = torch.jit.script(model) run_model_test(self, script_model, False, example_outputs=outputs, - input=inputs, rtol=rtol, atol=atol, use_new_jit_passes=True) + input=inputs, rtol=rtol, atol=atol) + + +TestModels = type(str("TestModels"), + (unittest.TestCase,), + dict(TestModels.__dict__, + is_script_test_enabled=False, + exportTest=exportTest)) + + +# model tests for scripting with new JIT APIs and shape inference +TestModels_new_jit_API = type(str("TestModels_new_jit_API"), + (unittest.TestCase,), + dict(TestModels.__dict__, + exportTest=exportTest, + is_script_test_enabled=True, + use_new_jit_passes=True, + onnx_shape_inference=True)) if __name__ == '__main__': - TestModels.is_script_test_enabled = True - TestModels.exportTest = exportTest unittest.main() diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py index 816951dfc79e..23d4879a8a4c 100644 --- a/test/onnx/test_pytorch_onnx_onnxruntime.py +++ b/test/onnx/test_pytorch_onnx_onnxruntime.py @@ -15,6 +15,7 @@ skipIfUnsupportedMaxOpsetVersion, skipIfONNXShapeInference) from test_pytorch_common import BATCH_SIZE from test_pytorch_common import RNN_BATCH_SIZE, RNN_SEQUENCE_LENGTH, RNN_INPUT_SIZE, RNN_HIDDEN_SIZE +from typing import List import model_defs.word_language_model as word_language_model import torchvision import onnx @@ -189,6 +190,7 @@ def 
run_model_test_with_external_data(self, model, input, rtol=0.001, atol=1e-7, ort_outs = run_ort(ort_sess, input_copy) ort_compare_with_pytorch(ort_outs, output, rtol, atol) + @skipIfUnsupportedMinOpsetVersion(9) # Because external data format was released with Opset 9. def test_embedding_model_with_external_data(self): class LargeModel(torch.nn.Module): @@ -315,7 +317,7 @@ def run_word_language_model(self, model_name): self.run_test(model, (x, model.hidden)) @skipIfUnsupportedMinOpsetVersion(11) - @disableScriptTest() + @disableScriptTest() # Faster RCNN model is not scriptable def test_faster_rcnn(self): model = torchvision.models.detection.faster_rcnn.fasterrcnn_resnet50_fpn(pretrained=True, min_size=200, max_size=300) @@ -380,27 +382,53 @@ def test_word_language_model_LSTM(self): def test_word_language_model_GRU(self): self.run_word_language_model("GRU") - @disableScriptTest() def test_index_1d(self): - self._test_index_generic(lambda input: input[0]) + class MyModel(torch.nn.Module): + def forward(self, input): + return input[0] + + m1 = torch.randn(3, 4, 5, 6, 7) + self.run_test(MyModel(), m1) - @disableScriptTest() def test_index_2d_1dimslice(self): - self._test_index_generic(lambda input: input[0:1, :]) + class MyModel(torch.nn.Module): + def forward(self, input): + return input[0:1, :] + + m1 = torch.randn(3, 4, 5, 6, 7) + self.run_test(MyModel(), m1) - @disableScriptTest() def test_index_2d_sliceint(self): - self._test_index_generic(lambda input: input[1, :]) + class MyModel(torch.nn.Module): + def forward(self, input): + return input[1, :] + + m1 = torch.randn(3, 4, 5, 6, 7) + self.run_test(MyModel(), m1) - @disableScriptTest() def test_index_2d_neg_slice(self): - self._test_index_generic(lambda input: input[0:-1, :]) + class MyModel(torch.nn.Module): + def forward(self, input): + return input[0:-1, :] + + m1 = torch.randn(3, 4, 5, 6, 7) + self.run_test(MyModel(), m1) @skipIfUnsupportedMinOpsetVersion(9) - @disableScriptTest() def test_index_mask(self): - self._test_index_generic(lambda input: input[torch.tensor([0, 1, 0], dtype=torch.uint8)]) - self._test_index_generic(lambda input: input[torch.tensor([0, 1, 0], dtype=torch.bool)]) + class MyModel(torch.nn.Module): + def forward(self, input): + return input[torch.tensor([0, 1, 0], dtype=torch.uint8)] + + m1 = torch.randn(3, 4, 5, 6, 7) + self.run_test(MyModel(), m1) + + class MyModel(torch.nn.Module): + def forward(self, input): + return input[torch.tensor([0, 1, 0], dtype=torch.bool)] + + m1 = torch.randn(3, 4, 5, 6, 7) + self.run_test(MyModel(), m1) @disableScriptTest() def test_dict(self): @@ -612,6 +640,20 @@ def forward(self, input1, input2, input3): self.run_test(TraceModel(), (x1, x2, x3), atol=10e-5) self.run_test(ScriptModel(), (x1, x2, x3), atol=10e-5) + def test_conv_shape_inference(self): + class Model(torch.nn.Module): + def __init__(self): + super(Model, self).__init__() + self.conv2 = torch.nn.Conv2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2), dilation=(3, 1)) + + def forward(self, input): + return self.conv2(input) + 2 + + x = torch.randn(20, 16, 50, 100) + self.run_test(Model(), x, atol=10e-5, + input_names=['x'], + dynamic_axes={'x': [0]}) + def test_conv_transpose(self): class TraceModel(torch.nn.Module): def __init__(self): @@ -660,14 +702,18 @@ def forward(self, x): def squeeze_model_tests(self, d, x1, x2): class Squeeze(torch.nn.Module): + def __init__(self, d): + super(Squeeze, self).__init__() + self.d = d + def forward(self, x): - if d is not None: - return torch.squeeze(x, dim=d) + if self.d is not 
None: + return torch.squeeze(x, dim=self.d) else: return torch.squeeze(x) x2 = [] if x2 is None else [x2] - self.run_test(Squeeze(), x1, input_names=['input'], dynamic_axes={'input': {0: '0', 1: '1', 2: '2'}}, test_with_inputs=x2) + self.run_test(Squeeze(d), x1, input_names=['input'], dynamic_axes={'input': {0: '0', 1: '1', 2: '2'}}, test_with_inputs=x2) def test_squeeze_without_no_op(self): x = torch.randn(2, 1, 4) @@ -761,7 +807,7 @@ def test_maxpool_3d_ceil(self): self.run_test(model, x) @skipIfUnsupportedMinOpsetVersion(8) - @disableScriptTest() + @disableScriptTest() # Functional module not scriptable def test_maxpool_with_indices(self): model = torch.nn.MaxPool1d(2, stride=1, return_indices=True) x = torch.randn(20, 16, 50) @@ -814,7 +860,6 @@ def forward(self, x): x = torch.randn(2, 3, 4) self.run_test(ArithmeticModule(), x) - @disableScriptTest() # In scripting the first transpose node do not carry shape and dtype info. # The following test only works when onnx shape inference is enabled. @skipIfONNXShapeInference(False) @@ -868,7 +913,7 @@ def forward(self, x): def test_div(self): class DivModule(torch.nn.Module): def forward(self, x, y): - return x / y + return x / y, torch.true_divide(x, y) x = torch.randn(2, 3, 4).to(torch.int) y = torch.arange(1, 2 * 3 * 4 + 1).reshape(2, 3, 4).to(torch.int) @@ -882,7 +927,7 @@ def forward(self, x, y): def test_div_promotion_trace(self): class DivModule(torch.nn.Module): def forward(self, x, y): - return x / y + return x / y, torch.true_divide(x, y) x = torch.randn(2, 3, 4).to(torch.int) y = torch.arange(1, 2 * 3 * 4 + 1).reshape(2, 3, 4).to(torch.int) @@ -900,14 +945,14 @@ def forward(self, x, y): # In scripting x, y do not carry shape and dtype info. # The following test only works when onnx shape inference is enabled. @skipIfONNXShapeInference(False) - def test_true_div_script(self): - class TrueDivModule(torch.nn.Module): + def test_div_promotion_script(self): + class DivModule(torch.nn.Module): def forward(self, x, y): # Add transpose to hide shape/type information # Otherwise shape and type are still avaiable from input. x = x.transpose(1, 2) y = y.transpose(1, 2) - return torch.true_divide(x, y) + return x / y, torch.true_divide(x, y) x = torch.randn(2, 3, 4).to(torch.int) y = torch.arange(1, 2 * 3 * 4 + 1).reshape(2, 3, 4).to(torch.int) @@ -918,20 +963,20 @@ def forward(self, x, y): # This can be handled by the default case, where both are cast to float. # It works even if type of x, y are unknown. torch.set_default_dtype(torch.float) - self.run_test(torch.jit.script(TrueDivModule()), (x, y)) + self.run_test(torch.jit.script(DivModule()), (x, y)) # 2. x,y are int, and output is double. # This can be handled by the default case, where both are cast to double. # It works even if type of x, y are unknown. torch.set_default_dtype(torch.double) - self.run_test(torch.jit.script(TrueDivModule()), (x, y)) + self.run_test(torch.jit.script(DivModule()), (x, y)) # 3. x is int, y is double, and output is double. # This can only be handled when both type of x and y are known. 
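
The `test_div*` changes above pair `x / y` with `torch.true_divide(x, y)` on integer inputs. A quick standalone check of the promotion behaviour those tests rely on, assuming a PyTorch build where `/` on integer tensors performs true division (as the updated tests expect):

```python
import torch

# Integer / integer performs true division, matches torch.true_divide,
# and promotes to the current default floating dtype.
x = torch.arange(1, 7, dtype=torch.int32).reshape(2, 3)
y = torch.full((2, 3), 2, dtype=torch.int32)
assert torch.equal(x / y, torch.true_divide(x, y))
assert (x / y).dtype == torch.get_default_dtype()
```
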
torch.set_default_dtype(prev_default) x = torch.randn(2, 3, 4).to(torch.int) y = torch.arange(1, 2 * 3 * 4 + 1).reshape(2, 3, 4).to(torch.double) - self.run_test(torch.jit.script(TrueDivModule()), (x, y)) + self.run_test(torch.jit.script(DivModule()), (x, y)) def test_slice_trace(self): class MyModule(torch.nn.Module): @@ -977,7 +1022,7 @@ def forward(self, x, y): self.run_test(InputIndexSlice(), (x, y)) @skipIfUnsupportedMinOpsetVersion(10) - @disableScriptTest() + @disableScriptTest() # scripting tuple/list append def test_slice_dynamic(self): class DynamicSliceExportMod(torch.nn.Module): def forward(self, x): @@ -1014,7 +1059,7 @@ def forward(self, x): self.run_test(DynamicSliceModel(), x) @skipIfUnsupportedMinOpsetVersion(10) - @disableScriptTest() + @disableScriptTest() # scripting tuple/list append def test_slice_dynamic_to_end(self): class DynamicSliceExportMod(torch.nn.Module): def forward(self, x): @@ -1121,7 +1166,7 @@ def forward(self, input): self.run_test(SizeModel(), x) @skipIfUnsupportedMinOpsetVersion(9) - @disableScriptTest() + @disableScriptTest() # x.stride() not scriptable def test_as_strided(self): class Model(torch.nn.Module): def forward(self, x): @@ -1134,28 +1179,42 @@ def forward(self, x): x = torch.randn(5, 8, 7) self.run_test(Model(), x) - def _test_index_generic(self, fn): + @disableScriptTest() # Ellipses followed by tensor indexing not scriptable + def test_tensor_index_advanced_indexing_ellipsis(self): class MyModel(torch.nn.Module): - def __init__(self): - super(MyModel, self).__init__() - def forward(self, input): - return fn(input) + return input[..., torch.tensor([2, 1]), torch.tensor([0, 3])] m1 = torch.randn(3, 4, 5, 6, 7) - self.run_test(MyModel(), m1) + self.run_test(MyModel(), (m1,)) - @disableScriptTest() def test_tensor_index_advanced_indexing(self): - self._test_index_generic( - lambda input: input[:, torch.tensor([[0, 2], [1, 1]]), :, torch.tensor([2, 1]), torch.tensor([0, 3])]) - self._test_index_generic(lambda input: input[..., torch.tensor([2, 1]), torch.tensor([0, 3])]) - self._test_index_generic(lambda input: input[:, torch.tensor([0, 2]), None, 2:4, torch.tensor([[1, 3], [4, 0]])]) - self._test_index_generic(lambda input: input[:, torch.tensor([0, 2]), torch.tensor([1]), 2:4, torch.tensor([[1], [4]])]) + class MyModel(torch.nn.Module): + def forward(self, input): + return input[:, torch.tensor([[0, 2], [1, 1]]), :, torch.tensor([2, 1]), torch.tensor([0, 3])] + + m1 = torch.randn(3, 4, 5, 6, 7) + self.run_test(MyModel(), (m1,)) + + class MyModel(torch.nn.Module): + def forward(self, input): + return input[:, torch.tensor([0, 2]), None, 2:4, torch.tensor([[1, 3], [4, 0]])] + + self.run_test(MyModel(), (m1,)) + + class MyModel(torch.nn.Module): + def forward(self, input): + return input[:, torch.tensor([0, 2]), torch.tensor([1]), 2:4, torch.tensor([[1], [4]])] + + self.run_test(MyModel(), (m1,)) - @disableScriptTest() def test_tensor_index_advanced_indexing_consecutive(self): - self._test_index_generic(lambda input: input[:, torch.tensor([0, 2]), torch.tensor([[1, 3], [4, 0]]), None]) + class MyModel(torch.nn.Module): + def forward(self, input): + return input[:, torch.tensor([0, 2]), torch.tensor([[1, 3], [4, 0]]), None] + + m1 = torch.randn(3, 4, 5, 6, 7) + self.run_test(MyModel(), (m1,)) @skipIfUnsupportedMinOpsetVersion(11) def test_index_put(self): @@ -1181,7 +1240,6 @@ def forward(self, x, ind, update): self.run_test(IndexPutModel(), (x, ind, update)) @skipIfUnsupportedMinOpsetVersion(11) - @disableScriptTest() def 
test_index_put_slice_index(self): class IndexPutModel(torch.nn.Module): def forward(self, x, update): @@ -1256,7 +1314,7 @@ def forward(self, x, update): self.run_test(IndexPutModel8(), (x, update)) @skipIfUnsupportedMinOpsetVersion(11) - @disableScriptTest() + @disableScriptTest() # Ellipses followed by tensor indexing not scriptable def test_index_put_ellipsis(self): class IndexPutModel(torch.nn.Module): def forward(self, x, update): @@ -1277,7 +1335,6 @@ def forward(self, x, update): self.run_test(IndexPutModel2(), (x, update)) @skipIfUnsupportedMinOpsetVersion(11) - @disableScriptTest() def test_copy_(self): class CopyModel(torch.nn.Module): def forward(self, x, data): @@ -1319,9 +1376,6 @@ def forward(self, x, data): update = torch.randn(2) self.run_test(CopyModel3(), (x, update)) - update = torch.randn(1, 2) - self.run_test(CopyModel3(), (x, update)) - class CopyModel4(torch.nn.Module): def forward(self, x, ind, data): x[ind] = data @@ -1333,7 +1387,18 @@ def forward(self, x, ind, data): self.run_test(CopyModel4(), (x, ind, data)) @skipIfUnsupportedMinOpsetVersion(11) - @disableScriptTest() + @disableScriptTest() # Model not scriptable (output with shape doesn't match the broadcast shape) + def test_copy_tracing(self): + class CopyModel(torch.nn.Module): + def forward(self, x, data): + x[1, 1:3] = data + return x + + x = torch.randn(3, 4) + update = torch.randn(1, 2) + self.run_test(CopyModel(), (x, update)) + + @skipIfUnsupportedMinOpsetVersion(11) def test_copy_ellipsis(self): class CopyModel(torch.nn.Module): def forward(self, x, update): @@ -1348,14 +1413,18 @@ def forward(self, x, update): update = torch.ones(1) self.run_test(CopyModel(), (x, update)) - class CopyModel2(torch.nn.Module): + @skipIfUnsupportedMinOpsetVersion(11) + @disableScriptTest() # Missing input size (with ellipsis indexing) + def test_copy_ellipsis_tracing(self): + class CopyModel(torch.nn.Module): def forward(self, x, update): x[2, ..., 1:3] = update return x x = torch.randn(3, 4, 5, 6) + update = torch.ones(1) - self.run_test(CopyModel2(), (x, update)) + self.run_test(CopyModel(), (x, update)) @skipIfUnsupportedMinOpsetVersion(10) def test_flip(self): @@ -1381,8 +1450,8 @@ def forward(self, x): x = torch.randn(2, 3, 4) self.run_test(Rand(), x) - @disableScriptTest() @skipIfUnsupportedMinOpsetVersion(9) + @disableScriptTest() # symbolic update for randn def test_random_dynamic_size(self): class RandN(torch.nn.Module): def forward(self, x): @@ -1415,7 +1484,6 @@ def forward(self, x): self.run_test(RandLike(), x) self.run_test(torch.jit.script(RandLike()), x) - @disableScriptTest() def test_random_like_dtype(self): class RandNLike(torch.nn.Module): def forward(self, x): @@ -1711,6 +1779,15 @@ def forward(self, input): x = torch.randn(3, 3, requires_grad=True) self.run_test(NarrowModel(), x) + @skipIfUnsupportedMinOpsetVersion(11) + def test_narrow_dynamic(self): + class NarrowModel(torch.nn.Module): + def forward(self, input): + return torch.narrow(input, 0, 0, input.shape[0] - 1) + + x = torch.randn(3, 3, requires_grad=True) + self.run_test(NarrowModel(), x) + @skipIfUnsupportedMinOpsetVersion(9) def test_index_fill(self): class IndexFillModel(torch.nn.Module): @@ -1758,7 +1835,6 @@ def forward(self, x): x = torch.randn(3, 4) self.run_test(IndexSelectScalerIndexModel(), x) - @disableScriptTest() def test_index_select_scaler_index(self): class IndexSelectScalerIndexModel(torch.nn.Module): def __init__(self, index_base): @@ -1817,7 +1893,6 @@ def forward(self, x, k): self.run_test(MyModuleDynamic(), [x, k]) 
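
The indexing tests above were rewritten from the lambda-based `_test_index_generic` helper into small `nn.Module` subclasses so that the same test body can be traced and scripted. A condensed sketch of that pattern (my own minimal example, not a test from this file):

```python
import torch

# Indexing expressed inside a Module's forward can be scripted directly,
# unlike a closure captured by a lambda passed into a generic helper.
class IndexFirstRow(torch.nn.Module):
    def forward(self, x):
        return x[0]

x = torch.randn(3, 4, 5)
scripted = torch.jit.script(IndexFirstRow())
assert torch.equal(scripted(x), x[0])
```
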
@skipIfUnsupportedOpsetVersion([7]) - @disableScriptTest() def test_normalize(self): class Model(torch.nn.Module): def forward(self, x): @@ -1954,7 +2029,6 @@ def forward(self, input, indices): self.run_test(GatherModel(), input=(input, indices)) @skipIfUnsupportedMinOpsetVersion(9) - @disableScriptTest() def test_expand(self): class ExpandModel(torch.nn.Module): def forward(self, input): @@ -1975,7 +2049,7 @@ def forward(self, input, size): return input.expand(size) input = torch.randn(3,) - size = torch.tensor([-1]) + size = torch.tensor(-1) self.run_test(ExpandTensorSizeModel(), input=(input, size)) def test_multinomial(self): @@ -2105,6 +2179,7 @@ def test_logsoftmax_dim(self): self.run_test(model, input) @skipIfUnsupportedMinOpsetVersion(9) + @disableScriptTest() # scripting prim_dtype def test_lstm_no_hidden(self): class LSTMModel(torch.nn.Module): def __init__(self): @@ -2134,7 +2209,7 @@ def test_lstm_default_init_state(self): self.run_test(model, input) @skipIfUnsupportedMinOpsetVersion(9) - @disableScriptTest() + @disableScriptTest() # LSTMModel model not scriptable def test_lstm_fixed_batch_size(self): class LSTMModel(torch.nn.Module): def __init__(self): @@ -2588,6 +2663,17 @@ def forward(self, input, other): shape = torch.randn(6, 4) self.run_test(ViewModel(), (x, shape)) + def test_view_dynamic_zero_dim(self): + class ViewModel(torch.nn.Module): + def forward(self, input): + input = input.view(-1, 2) + return input.view(1, -1) + + x = torch.ones(2) + another_x = torch.empty((0,)) + self.run_test(ViewModel(), x, test_with_inputs=[another_x], + input_names=['input_1'], dynamic_axes={'input_1': [0, ]}) + def test_view_as(self): class ViewModel(torch.nn.Module): def forward(self, input, other): @@ -2597,7 +2683,7 @@ def forward(self, input, other): y = torch.randn(6, 4) self.run_test(ViewModel(), (x, y)) - @disableScriptTest() + @disableScriptTest() # ONNX Shape inference failure in if/else block for Gemm def test_weight_norm(self): model = torch.nn.utils.weight_norm(torch.nn.Linear(5, 10), dim=1) x = torch.randn(3, 4, 5, requires_grad=True) @@ -2615,7 +2701,7 @@ def test_weight_norm(self): x = torch.randn(3, 3, 5, requires_grad=True) self.run_test(model, x) - @disableScriptTest() + @disableScriptTest() # ONNX Shape inference failure in if/else block for Gemm def test_weight_norm_nodim(self): model = torch.nn.utils.weight_norm(torch.nn.Linear(5, 10), dim=None) x = torch.randn(3, 4, 5, requires_grad=True) @@ -2645,7 +2731,6 @@ def forward(self, x): x = torch.randint(10, (1, 2, 3, 4)) self.run_test(FlattenModel(), x) - @disableScriptTest() @skipIfUnsupportedMinOpsetVersion(9) def test_flatten_dynamic_axes(self): class MyModule(torch.nn.Module): @@ -2680,11 +2765,11 @@ def forward(self, x, y, z, ind): ind = torch.tensor(-2, dtype=torch.long) self.run_test(GetItemModel(), (x, y, z, ind)) - @disableScriptTest() def test_unbind(self): class UnbindModel(torch.nn.Module): def forward(self, input): - return input.unbind() + _, out, _ = input.unbind() + return out x = torch.randn(3, 4, 5) self.run_test(UnbindModel(), x) @@ -2721,7 +2806,7 @@ def test_len_list(self): class LenListModel(torch.jit.ScriptModule): @torch.jit.script_method def forward(self, input): - return torch.ones(len(input.shape)) + return torch.ones(len(input.shape)) x = torch.randn(4, 5) self.run_test(LenListModel(), x) @@ -2744,18 +2829,19 @@ def forward(self, input): x = torch.randn(3, 4, 5) self.run_test(UnbindModel2(), x) - @disableScriptTest() def test_split(self): class SplitModel(torch.nn.Module): def forward(self, 
input): - return input.split([2, 1, 2]) + out1, out2, out3 = input.split([2, 1, 2]) + return out1, out2, out3 x = torch.randn(5, 4, 3) self.run_test(SplitModel(), x) class SplitModel2(torch.nn.Module): def forward(self, input): - return input.split([2, 1, 1], -2) + out1, out2, out3 = input.split([2, 1, 1], -2) + return out1, out2, out3 x = torch.randn(5, 4, 3) self.run_test(SplitModel2(), x) @@ -2772,18 +2858,20 @@ def forward(self, input): @disableScriptTest() def test_split_size_as_list(self): class SplitModel(torch.nn.Module): - def forward(self, input): + def forward(self, input, split_sizes: List[int]): out = [] - split_sizes = [input.shape[0] - 1, 1] - for ob in input.split(split_sizes): + split_list: List[torch.Tensor] = input.split(split_sizes) + + for ob in split_list: out.append(ob) return torch.cat(out, dim=0) - x = torch.randn(5, 4, 3) - self.run_test(SplitModel(), x) + x = torch.randn(6, 4, 3) + split_sizes = [torch.tensor(2), torch.tensor(4)] + self.run_test(SplitModel(), (x, split_sizes)) @skipIfUnsupportedMinOpsetVersion(11) - def test_split_size_list_to_slice(self): + def test_split_size_with_slice(self): class SplitModule(torch.nn.Module): def forward(self, x, y, t): splits = (x.size(1), y.size(1)) @@ -2991,7 +3079,6 @@ def forward(self, x): self.run_test(Zero_(), x) @skipIfUnsupportedMinOpsetVersion(9) - @disableScriptTest() def test_new_zeros(self): class Zero_(torch.nn.Module): def forward(self, x): @@ -3056,6 +3143,17 @@ def forward(self, x): x = torch.randn(2, 3, 4) self.run_test(Full(), x) + @skipIfUnsupportedMinOpsetVersion(9) + def test_inplace_list(self): + class Arithmetic(torch.jit.ScriptModule): + @torch.jit.script_method + def forward(self, x, y): + return torch.cat([x.add_(3), y.fill_(0)]) + + x = torch.randn(2, 3) + y = torch.randn(2, 3) + self.run_test(Arithmetic(), (x, y)) + @skipIfUnsupportedMinOpsetVersion(9) def test_inplace_fill(self): class Fill_(torch.nn.Module): @@ -3119,6 +3217,28 @@ def forward(self, x): x = torch.arange(16).view(2, 2, 4).to(torch.float32) self.run_test(MaskedFillModel2(), x) + @skipIfUnsupportedMinOpsetVersion(9) + def test_masked_fill_inplace(self): + + class MaskedFillModel(torch.jit.ScriptModule): + @torch.jit.script_method + def forward(self, x): + mask = torch.tensor([[0, 0, 1], [1, 1, 0]], dtype=torch.uint8) + x.masked_fill_(mask, 2) + return x + + x = torch.zeros(4, 2, 3, requires_grad=True) + self.run_test(MaskedFillModel(), x) + + class MaskedFillModel2(torch.jit.ScriptModule): + @torch.jit.script_method + def forward(self, x): + x.masked_fill_(x > 3, -1) + return x + + x = torch.arange(16).view(2, 2, 4).to(torch.float32) + self.run_test(MaskedFillModel2(), x) + @skipIfUnsupportedMinOpsetVersion(11) def test_masked_scatter(self): class MaskedScatterModel(torch.nn.Module): @@ -3147,7 +3267,6 @@ def forward(self, x): self.run_test(PixelShuffle(), x) @skipIfUnsupportedMinOpsetVersion(9) - @disableScriptTest() def test_scalar_type(self): class ArithmeticModel(torch.nn.Module): def forward(self, x): @@ -3194,7 +3313,7 @@ def forward(self, x): self.run_test(FullModel(), x) @skipIfUnsupportedMinOpsetVersion(9) - @disableScriptTest() + @disableScriptTest() # dtype mismatch def test_full_like(self): class FullLikeModel(torch.nn.Module): def forward(self, x): @@ -3204,7 +3323,7 @@ def forward(self, x): self.run_test(FullLikeModel(), x) @skipIfUnsupportedMinOpsetVersion(9) - @disableScriptTest() + @disableScriptTest() # dtype mismatch def test_full_like_value(self): class FullLikeModel(torch.nn.Module): def forward(self, x, y): @@ 
-3378,28 +3497,9 @@ def forward(self, input): x = torch.tensor([False, True, True]) self.run_test(model, x) - @unittest.skip("Enable once jit trace Tensor.numel as constant is fixed.") - def test_embedding_bag_dynamic(self): - class EmbeddingModel(torch.nn.Module): - def __init__(self): - super().__init__() - self.embeddingbag = torch.nn.EmbeddingBag(40, 12, mode='sum') - - def forward(self, input): - return self.embeddingbag(input) - - model = EmbeddingModel() - x = torch.randint(7, (10, 5)) - y = torch.randint(10, (20, 5)) - self.run_test(model, x, test_with_inputs=[y], - input_names=['input'], - output_names=['output'], - dynamic_axes={'input': [0], - 'output': [0] - }) - - @disableScriptTest() + @disableScriptTest() # error in propagate as assign input shape @skipIfUnsupportedMinOpsetVersion(10) + @skipIfUnsupportedOpsetVersion([12]) # Due to ONNX Loop shape inference issue def test_embedding_bag(self): model = torch.nn.EmbeddingBag(10, 5, mode='sum', scale_grad_by_freq=True) input = torch.randint(10, (7,)) @@ -3415,27 +3515,29 @@ def test_embedding_bag(self): input = torch.randint(10, (7, 5)) self.run_test(model, (input)) - @disableScriptTest() + @disableScriptTest() # scripting prim::Uninitialized, prim::dtype, prim::unchecked_cast @skipIfUnsupportedMinOpsetVersion(10) + @skipIfUnsupportedOpsetVersion([12]) # Due to ONNX Loop shape inference issue def test_embedding_bag_1d_per_sample_weights(self): class EmbeddingModel(torch.nn.Module): def forward(self, embedding_matrix, input, offset, weights): - return torch.nn.functional.embedding_bag(embedding_matrix, input, offsets=offset, + return torch.nn.functional.embedding_bag(input, embedding_matrix, offsets=offset, mode='sum', per_sample_weights=weights) model = EmbeddingModel() x = torch.randint(7, (6,)) - w = torch.randn(6,) + w = torch.randn(6, ) offset = torch.tensor([0, 2, 5]) embedding_matrix = torch.rand(10, 15) self.run_test(model, (embedding_matrix, x, offset, w)) - @disableScriptTest() + @disableScriptTest() # scripting prim::Uninitialized, prim::dtype, prim::unchecked_cast @skipIfUnsupportedMinOpsetVersion(10) + @skipIfUnsupportedOpsetVersion([12]) # Due to ONNX Loop shape inference issue def test_embedding_bag_2d_per_sample_weights(self): class EmbeddingModel(torch.nn.Module): def forward(self, embedding_matrix, input, weights): - return torch.nn.functional.embedding_bag(embedding_matrix, input, + return torch.nn.functional.embedding_bag(input, embedding_matrix, mode='sum', per_sample_weights=weights) embedding_matrix = torch.rand(10, 15) @@ -3444,12 +3546,52 @@ def forward(self, embedding_matrix, input, weights): w = torch.randn(2, 3) self.run_test(model, (embedding_matrix, x, w)) + @disableScriptTest() # scripting prim::Uninitialized, prim::dtype, prim::unchecked_cast + @skipIfUnsupportedMinOpsetVersion(11) + @unittest.skip("Due to ONNX Loop shape inference issue.") + def test_embedding_bag_dynamic_input(self): + class EmbeddingModel1D(torch.nn.Module): + def forward(self, embedding_matrix, input, weights, offsets): + return torch.nn.functional.embedding_bag(input, embedding_matrix, offsets=offsets, + mode='sum', per_sample_weights=weights) + + model = EmbeddingModel1D() + x = torch.randint(7, (6,)) + w = torch.randn(6, ) + offsets = torch.tensor([0, 2, 5], dtype=torch.long) + embedding_matrix = torch.rand(10, 15) + x2 = torch.randint(7, (2,)) + w2 = torch.randn(2, ) + embedding_matrix2 = torch.rand(12, 25) + offsets2 = torch.tensor([0, ], dtype=torch.long) + self.run_test(model, (embedding_matrix, x, w, offsets), + 
test_with_inputs=[(embedding_matrix2, x2, w2, offsets2)], + input_names=['embedding_matrix', 'x', 'offsets', 'w'], + dynamic_axes={'embedding_matrix': [0, 1], 'x': [0], 'offsets': [0], 'w': [0]}) + + class EmbeddingModel2D(torch.nn.Module): + def forward(self, embedding_matrix, input, weights): + return torch.nn.functional.embedding_bag(input, embedding_matrix, + mode='sum', per_sample_weights=weights) + + model = EmbeddingModel2D() + x = torch.randint(7, (2, 3)) + w = torch.randn(2, 3) + embedding_matrix = torch.rand(10, 15) + x2 = torch.randint(7, (3, 5)) + w2 = torch.randn(3, 5) + embedding_matrix2 = torch.rand(12, 25) + self.run_test(model, (embedding_matrix, x, w), + test_with_inputs=[(embedding_matrix2, x2, w2)], + input_names=['embedding_matrix', 'x', 'w'], + dynamic_axes={'embedding_matrix': [0, 1], 'x': [0, 1], 'w': [0, 1]}) + @skipIfUnsupportedMinOpsetVersion(8) - @disableScriptTest() def test_meshgrid(self): class Meshgrid(torch.nn.Module): def forward(self, x, y, z): - return torch.meshgrid(x, y, z) + output1, output2, output3 = torch.meshgrid(x, y, z) + return output1, output2, output3 x = torch.randn(3, requires_grad=True) y = torch.zeros(4, requires_grad=True) @@ -3457,11 +3599,11 @@ def forward(self, x, y, z): self.run_test(Meshgrid(), (x, y, z)) @skipIfUnsupportedMinOpsetVersion(8) - @disableScriptTest() def test_meshgrid_scalar(self): class Meshgrid(torch.nn.Module): def forward(self, x, y, z): - return torch.meshgrid(x, y, z) + output1, output2, output3 = torch.meshgrid(x, y, z) + return output1, output2, output3 x = torch.ones(3, requires_grad=True) y = torch.zeros(4, requires_grad=True) @@ -3532,7 +3674,6 @@ def forward(self, input, other): self.run_test(model, (x, y)) @skipIfUnsupportedMinOpsetVersion(9) - @disableScriptTest() def test_ones_bool(self): class MyModule(torch.nn.Module): def forward(self, input): @@ -3579,7 +3720,7 @@ def test_constant_pad(self): # Dynamic padding is added in opset 11 @skipIfUnsupportedMinOpsetVersion(11) - @disableScriptTest() + @disableScriptTest() # Functional module not scriptable def test_pad_types(self): # Test for different pad integer types class Pad(torch.nn.Module): @@ -3613,7 +3754,7 @@ def run(): self.assertEqual('Unsupported: ONNX export of Pad in opset 9. The sizes of the padding must be constant. 
' + 'Please try opset version 11.', the_exception.args[0]) - @disableScriptTest() + @disableScriptTest() # export prim::Uninitialized def test_reflection_pad(self): model = torch.nn.ReflectionPad1d(2) x = torch.randn(2, 4, 4) @@ -3623,7 +3764,7 @@ def test_reflection_pad(self): x = torch.randn(2, 2, 4, 4) self.run_test(model, x) - @disableScriptTest() + @disableScriptTest() # export prim::Uninitialized def test_replication_pad(self): model = torch.nn.ReplicationPad1d(2) x = torch.randn(2, 4, 4) @@ -3634,7 +3775,7 @@ def test_replication_pad(self): self.run_test(model, x) @skipIfUnsupportedMinOpsetVersion(11) - @disableScriptTest() + @disableScriptTest() # export prim::Uninitialized def test_im2col(self): class Unfold(torch.nn.Module): def forward(self, input): @@ -3658,7 +3799,6 @@ def forward(self, x): # This test checks output scalar type in the ONNX graph should not be null # https://github.com/pytorch/pytorch/issues/28607 @skipIfUnsupportedMinOpsetVersion(10) - @disableScriptTest() def test_trace_script(self): @torch.jit.script def center_slice_helper(input, h_offset): @@ -3688,13 +3828,14 @@ def forward(self, input): out = input * 2 out *= out.dim() return out + empty_input = torch.randn(0, requires_grad=True) multi_dim_input = torch.randn(1, 2, 3, requires_grad=True) self.run_test(DimModel(), empty_input) self.run_test(DimModel(), multi_dim_input) @skipIfUnsupportedMinOpsetVersion(12) - @disableScriptTest() + @disableScriptTest() # variable number of inputs not scriptable def test_einsum(self): class EinsumModelBatchDiagonal(torch.nn.Module): def forward(self, *tensor_list): @@ -3731,142 +3872,107 @@ def forward(self, *tensor_list): self.run_test(EinsumModelTranspose(), input=(x,)) @skipIfUnsupportedMinOpsetVersion(12) - @disableScriptTest() + @disableScriptTest() # shape/type inference def test_crossentropyloss(self): - x = torch.randn(3, 5) - y = torch.empty(3, dtype=torch.long).random_(5) - self._crossentropyloss(x, y) + for ignore_index in [-100, 1]: + x = torch.randn(3, 5) + y = torch.empty(3, dtype=torch.long).random_(5) + y[y == 1] = ignore_index - x = torch.randn(3, 5, 2) - y = torch.empty(3, 2, dtype=torch.long).random_(5) - self._crossentropyloss(x, y) + self._crossentropyloss(x, y, ignore_index) - x = torch.randn(3, 5, 2, 7) - y = torch.empty(3, 2, 7, dtype=torch.long).random_(5) - self._crossentropyloss(x, y) + x = torch.randn(3, 5, 2) + y = torch.empty(3, 2, dtype=torch.long).random_(5) + y[y == 1] = ignore_index + self._crossentropyloss(x, y, ignore_index) - def _crossentropyloss(self, x, y): + x = torch.randn(3, 5, 2, 7) + y = torch.empty(3, 2, 7, dtype=torch.long).random_(5) + y[y == 1] = ignore_index + self._crossentropyloss(x, y, ignore_index) + + def _crossentropyloss(self, x, y, ignore_index): class CrossEntropyLossNone(torch.nn.Module): - def __init__(self): + def __init__(self, ignore_index): super(CrossEntropyLossNone, self).__init__() - self.loss = torch.nn.CrossEntropyLoss(reduction='none') + if ignore_index == -100: + self.loss = torch.nn.CrossEntropyLoss(reduction='none') + else: + self.loss = torch.nn.CrossEntropyLoss(reduction='none', ignore_index=ignore_index) def forward(self, input, target): return self.loss(input, target) - self.run_test(CrossEntropyLossNone(), input=(x, y)) + self.run_test(CrossEntropyLossNone(ignore_index), input=(x, y)) class CrossEntropyLossNoneWeight(torch.nn.Module): - def __init__(self): + def __init__(self, ignore_index): super(CrossEntropyLossNoneWeight, self).__init__() - self.loss = 
torch.nn.CrossEntropyLoss(reduction='none', weight=torch.randn(5)) + if ignore_index == -100: + self.loss = torch.nn.CrossEntropyLoss(reduction='none', weight=torch.randn(5)) + else: + self.loss = torch.nn.CrossEntropyLoss(reduction='none', weight=torch.randn(5), ignore_index=ignore_index) def forward(self, input, target): return self.loss(input, target) - self.run_test(CrossEntropyLossNoneWeight(), input=(x, y)) + self.run_test(CrossEntropyLossNoneWeight(ignore_index), input=(x, y)) class CrossEntropyLossSum(torch.nn.Module): - def __init__(self): + def __init__(self, ignore_index): super(CrossEntropyLossSum, self).__init__() - self.loss = torch.nn.CrossEntropyLoss(reduction='sum') + if ignore_index == -100: + self.loss = torch.nn.CrossEntropyLoss(reduction='sum') + else: + self.loss = torch.nn.CrossEntropyLoss(reduction='sum', ignore_index=ignore_index) def forward(self, input, target): return self.loss(input, target) - self.run_test(CrossEntropyLossSum(), input=(x, y)) + self.run_test(CrossEntropyLossSum(ignore_index), input=(x, y)) class CrossEntropyLossSumWeight(torch.nn.Module): - def __init__(self): + def __init__(self, ignore_index): super(CrossEntropyLossSumWeight, self).__init__() - self.loss = torch.nn.CrossEntropyLoss(reduction='sum', weight=torch.randn(5)) + if ignore_index == -100: + self.loss = torch.nn.CrossEntropyLoss(reduction='sum', weight=torch.randn(5)) + else: + self.loss = torch.nn.CrossEntropyLoss(reduction='sum', weight=torch.randn(5), ignore_index=ignore_index) def forward(self, input, target): return self.loss(input, target) - self.run_test(CrossEntropyLossSumWeight(), input=(x, y)) + self.run_test(CrossEntropyLossSumWeight(ignore_index), input=(x, y)) class CrossEntropyLossMean(torch.nn.Module): - def __init__(self): + def __init__(self, ignore_index): super(CrossEntropyLossMean, self).__init__() - self.loss = torch.nn.CrossEntropyLoss() + if ignore_index == -100: + self.loss = torch.nn.CrossEntropyLoss() + else: + self.loss = torch.nn.CrossEntropyLoss(ignore_index=ignore_index) def forward(self, input, target): return self.loss(input, target) - self.run_test(CrossEntropyLossMean(), input=(x, y)) + self.run_test(CrossEntropyLossMean(ignore_index), input=(x, y)) class CrossEntropyLossMeanWeight(torch.nn.Module): - def __init__(self): + def __init__(self, ignore_index): super(CrossEntropyLossMeanWeight, self).__init__() - self.loss = torch.nn.CrossEntropyLoss(weight=torch.randn(5)) - - def forward(self, input, target): - return self.loss(input, target) - - self.run_test(CrossEntropyLossMeanWeight(), input=(x, y)) - - class CrossEntropyLossNoneIgnoreIndex(torch.nn.Module): - def __init__(self): - super(CrossEntropyLossNoneIgnoreIndex, self).__init__() - self.loss = torch.nn.CrossEntropyLoss(reduction='none', ignore_index=1) - - def forward(self, input, target): - return self.loss(input, target) - - self.run_test(CrossEntropyLossNoneIgnoreIndex(), input=(x, y)) - - class CrossEntropyLossNoneWeightIgnoreIndex(torch.nn.Module): - def __init__(self): - super(CrossEntropyLossNoneWeightIgnoreIndex, self).__init__() - self.loss = torch.nn.CrossEntropyLoss(reduction='none', weight=torch.randn(5), ignore_index=1) - - def forward(self, input, target): - return self.loss(input, target) - - self.run_test(CrossEntropyLossNoneWeightIgnoreIndex(), input=(x, y)) - - class CrossEntropyLossSumIgnoreIndex(torch.nn.Module): - def __init__(self): - super(CrossEntropyLossSumIgnoreIndex, self).__init__() - self.loss = torch.nn.CrossEntropyLoss(reduction='sum', ignore_index=1) - - def 
forward(self, input, target): - return self.loss(input, target) - - self.run_test(CrossEntropyLossSumIgnoreIndex(), input=(x, y)) - - class CrossEntropyLossSumWeightIgnoreIndex(torch.nn.Module): - def __init__(self): - super(CrossEntropyLossSumWeightIgnoreIndex, self).__init__() - self.loss = torch.nn.CrossEntropyLoss(reduction='sum', weight=torch.randn(5), ignore_index=1) - - def forward(self, input, target): - return self.loss(input, target) - - self.run_test(CrossEntropyLossSumWeightIgnoreIndex(), input=(x, y)) - - class CrossEntropyLossMeanIgnoreIndex(torch.nn.Module): - def __init__(self): - super(CrossEntropyLossMeanIgnoreIndex, self).__init__() - self.loss = torch.nn.CrossEntropyLoss(ignore_index=1) + if ignore_index == -100: + self.loss = torch.nn.CrossEntropyLoss(weight=torch.randn(5)) + else: + self.loss = torch.nn.CrossEntropyLoss(weight=torch.randn(5), ignore_index=ignore_index) def forward(self, input, target): return self.loss(input, target) - self.run_test(CrossEntropyLossMeanIgnoreIndex(), input=(x, y)) - - class CrossEntropyLossMeanWeightIgnoreIndex(torch.nn.Module): - def __init__(self): - super(CrossEntropyLossMeanWeightIgnoreIndex, self).__init__() - self.loss = torch.nn.CrossEntropyLoss(weight=torch.randn(5), ignore_index=1) - - def forward(self, input, target): - return self.loss(input, target) + self.run_test(CrossEntropyLossMeanWeight(ignore_index), input=(x, y)) - self.run_test(CrossEntropyLossMeanWeightIgnoreIndex(), input=(x, y)) @skipIfUnsupportedMinOpsetVersion(9) + @disableScriptTest() # Output dtype mismatch def test_kldiv_loss(self): x = torch.randn(5) @@ -3933,7 +4039,7 @@ def forward(self, input, target): self.run_test(KLDivLossMiniBatchMean(), input=(x, y)) @skipIfUnsupportedMinOpsetVersion(12) - @disableScriptTest() + @disableScriptTest() # shape/type inference def test_nllloss(self): class NLLModel(torch.nn.Module): def __init__(self): @@ -3948,10 +4054,13 @@ def forward(self, input, target): N, C = 5, 4 input = torch.randn(N, 16) target = torch.empty(N, dtype=torch.long).random_(0, C) + + # using test data containing default ignore_index=-100 + target[target == 1] = -100 self.run_test(NLLModel(), (input, target)) @skipIfUnsupportedMinOpsetVersion(12) - @disableScriptTest() + @disableScriptTest() # shape/type inference def test_nllloss_2d_none(self): class NLLModel(torch.nn.Module): def __init__(self): @@ -3967,10 +4076,13 @@ def forward(self, input, target): N, C = 5, 4 input = torch.randn(N, 16, 10, 10) target = torch.empty(N, 8, 8, dtype=torch.long).random_(0, C) + + # using test data containing default ignore_index=-100 + target[target == 1] = -100 self.run_test(NLLModel(), (input, target)) @skipIfUnsupportedMinOpsetVersion(12) - @disableScriptTest() + @disableScriptTest() # shape/type inference def test_nllloss_2d_mean(self): class NLLModel(torch.nn.Module): def __init__(self): @@ -3986,10 +4098,13 @@ def forward(self, input, target): N, C = 5, 4 input = torch.randn(N, 16, 10, 10) target = torch.empty(N, 8, 8, dtype=torch.long).random_(0, C) + + # using test data containing default ignore_index=-100 + target[target == 1] = -100 self.run_test(NLLModel(), (input, target)) @skipIfUnsupportedMinOpsetVersion(12) - @disableScriptTest() + @disableScriptTest() # shape/type inference def test_nllloss_2d_sum(self): class NLLModel(torch.nn.Module): def __init__(self): @@ -4005,10 +4120,13 @@ def forward(self, input, target): N, C = 5, 4 input = torch.randn(N, 16, 10, 10) target = torch.empty(N, 8, 8, dtype=torch.long).random_(0, C) + + # using test data 
containing default ignore_index=-100 + target[target == 1] = -100 self.run_test(NLLModel(), (input, target)) @skipIfUnsupportedMinOpsetVersion(12) - @disableScriptTest() + @disableScriptTest() # shape/type inference def test_nllloss_2d_mean_weights(self): class NLLModel(torch.nn.Module): def __init__(self): @@ -4024,10 +4142,13 @@ def forward(self, input, target): N, C = 5, 4 input = torch.randn(N, 16, 10, 10) target = torch.empty(N, 8, 8, dtype=torch.long).random_(0, C) + + # using test data containing default ignore_index=-100 + target[target == 1] = -100 self.run_test(NLLModel(), (input, target)) @skipIfUnsupportedMinOpsetVersion(12) - @disableScriptTest() + @disableScriptTest() # shape/type inference def test_nllloss_2d_mean_ignore_index(self): class NLLModel(torch.nn.Module): def __init__(self): @@ -4046,7 +4167,7 @@ def forward(self, input, target): self.run_test(NLLModel(), (input, target)) @skipIfUnsupportedMinOpsetVersion(12) - @disableScriptTest() + @disableScriptTest() # shape/type inference def test_nllloss_2d_mean_ignore_index_weights(self): class NLLModel(torch.nn.Module): def __init__(self): @@ -4188,6 +4309,7 @@ def forward(self, cond, input, other): self.run_test(Model(), (x, y, z)) @skipIfUnsupportedMinOpsetVersion(9) + @disableScriptTest() # symbolic update needed for unbind: ONNX export of unbind with dynamic number of outputs def test_where_condition(self): class Model1(torch.nn.Module): def forward(self, input): @@ -4217,6 +4339,7 @@ def forward(self, input): else: pass return out + x = torch.randn(1, 2, 3, requires_grad=True) self.run_test(EmptyBranchModel(), x) @@ -4243,6 +4366,7 @@ def __init__(self): def forward(self, x): return 2 * x + x = torch.randn(1, 2, 3, requires_grad=True) f = io.BytesIO() torch.onnx._export(Model(), x, f) @@ -4251,13 +4375,15 @@ def forward(self, x): def check_proto(): torch._C._check_onnx_proto(model.SerializeToString()) + self.assertRaises(RuntimeError, check_proto) - @disableScriptTest() + @disableScriptTest() # dtype mismatch def test_split_tensor_scalar(self): class SplitModel(torch.nn.Module): def forward(self, x): return torch.split(x, x.size(1)) + x = torch.randn(1, 2, 3, requires_grad=True) self.run_test(SplitModel(), x) @@ -4265,10 +4391,12 @@ def test_split_tensor_multi(self): class SplitModel(torch.nn.Module): def forward(self, x): return torch.split(x, torch.ones(3)) + x = torch.randn(1, 2, 3, requires_grad=True) def run_model(): SplitModel(x) + self.assertRaises(TypeError, run_model) def _dispatch_rnn_test(self, name, *args, **kwargs): @@ -4422,7 +4550,8 @@ def forward(self, x): model.train() - ort_sess = convert_to_onnx(model, input=(x,), opset_version=self.opset_version, training=torch.onnx.TrainingMode.TRAINING) + ort_sess = convert_to_onnx(model, input=(x,), opset_version=self.opset_version, + training=torch.onnx.TrainingMode.TRAINING) ort_outs = run_ort(ort_sess, input=(x,)) assert not torch.all(torch.eq(x, torch.from_numpy(ort_outs[0]))) @@ -4448,7 +4577,8 @@ def forward(self, x): model.train() - ort_sess = convert_to_onnx(model, input=(x,), opset_version=self.opset_version, training=torch.onnx.TrainingMode.TRAINING) + ort_sess = convert_to_onnx(model, input=(x,), opset_version=self.opset_version, + training=torch.onnx.TrainingMode.TRAINING) ort_outs = run_ort(ort_sess, input=(x,)) y = model(input) @@ -4476,11 +4606,14 @@ def forward(self, x): model = MyModule() x = torch.randn(10, 3, 128, 128) - ort_sess1 = convert_to_onnx(model, input=(x,), opset_version=self.opset_version, training=torch.onnx.TrainingMode.TRAINING) 
+ ort_sess1 = convert_to_onnx(model, input=(x,), opset_version=self.opset_version, + training=torch.onnx.TrainingMode.TRAINING) ort_outs1 = run_ort(ort_sess1, input=(x,)) - ort_sess2 = convert_to_onnx(model, input=(x,), opset_version=self.opset_version, training=torch.onnx.TrainingMode.EVAL) + ort_sess2 = convert_to_onnx(model, input=(x,), opset_version=self.opset_version, + training=torch.onnx.TrainingMode.EVAL) ort_outs2 = run_ort(ort_sess2, input=(x,)) - [np.testing.assert_allclose(ort_out1, ort_out2, atol=1e-7, rtol=0.001) for ort_out1, ort_out2 in zip(ort_outs1, ort_outs2)] + [np.testing.assert_allclose(ort_out1, ort_out2, atol=1e-7, rtol=0.001) for ort_out1, ort_out2 in + zip(ort_outs1, ort_outs2)] def test_multiple_conv_bn(self): class MyModule(torch.nn.Module): @@ -4494,7 +4627,6 @@ def __init__(self): self.relu = torch.nn.ReLU(inplace=True) self.maxpool = torch.nn.MaxPool2d(kernel_size=3, stride=2, padding=1) - def forward(self, x): x = self.conv1(x) x = self.bn(x) @@ -4510,11 +4642,14 @@ def forward(self, x): model = MyModule() x = torch.randn(2, 3, 224, 224) - ort_sess1 = convert_to_onnx(model, input=(x,), opset_version=self.opset_version, training=torch.onnx.TrainingMode.TRAINING) + ort_sess1 = convert_to_onnx(model, input=(x,), opset_version=self.opset_version, + training=torch.onnx.TrainingMode.TRAINING) ort_outs1 = run_ort(ort_sess1, input=(x,)) - ort_sess2 = convert_to_onnx(model, input=(x,), opset_version=self.opset_version, training=torch.onnx.TrainingMode.EVAL) + ort_sess2 = convert_to_onnx(model, input=(x,), opset_version=self.opset_version, + training=torch.onnx.TrainingMode.EVAL) ort_outs2 = run_ort(ort_sess2, input=(x,)) - [np.testing.assert_allclose(ort_out1, ort_out2, atol=1e-7, rtol=0.001) for ort_out1, ort_out2 in zip(ort_outs1, ort_outs2)] + [np.testing.assert_allclose(ort_out1, ort_out2, atol=1e-7, rtol=0.001) for ort_out1, ort_out2 in + zip(ort_outs1, ort_outs2)] def make_test(name, base, layer, bidirectional, initial_state, variable_length, dropout, @@ -4527,7 +4662,7 @@ def make_test(name, base, layer, bidirectional, initial_state, # Cannot export with older opsets because of 'ConstantFill' op # ConstantFill was a temp op removed at opset 8. 
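
The train/eval comparisons above go through the `convert_to_onnx` and `run_ort` test utilities. A helper-free sketch of the exporter call they wrap, assuming the `training` keyword of `torch.onnx.export` (as forwarded by `convert_to_onnx` here):

```python
import io
import torch

# Sketch only: export the same module once per training mode; the tests then
# run both graphs through ONNX Runtime and compare outputs.
model = torch.nn.Sequential(torch.nn.Conv2d(3, 8, 3), torch.nn.BatchNorm2d(8))
x = torch.randn(2, 3, 16, 16)

for mode in (torch.onnx.TrainingMode.TRAINING, torch.onnx.TrainingMode.EVAL):
    f = io.BytesIO()
    # Constant folding is kept off for the training-mode export.
    torch.onnx.export(model, (x,), f, opset_version=12, training=mode,
                      do_constant_folding=(mode == torch.onnx.TrainingMode.EVAL))
```
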
This is no longer supported by onnxruntime - @disableScriptTest() + @disableScriptTest() # Test code not scriptable @skipIfUnsupportedMinOpsetVersion(9) def f(self): self._dispatch_rnn_test( @@ -4542,7 +4677,6 @@ def f(self): f.__name__ = test_name setattr(TestONNXRuntime, f.__name__, f) - def setup_rnn_tests(): layers_opts = [ (1, 'unilayer'), @@ -4567,13 +4701,12 @@ def setup_rnn_tests(): ] test_count = 0 for (layer, bidirectional, initial_state, variable_length, dropout) in \ - itertools.product( - layers_opts, - bidirectional_opts, - initial_state_opts, - variable_length_opts, - dropout_opts, - ): + itertools.product( + layers_opts, + bidirectional_opts, + initial_state_opts, + variable_length_opts, + dropout_opts,): for base, name, extra_kwargs in ( ('elman', 'elman_relu', {'nonlinearity': u'relu'}), @@ -4594,7 +4727,6 @@ def setup_rnn_tests(): if test_count != 192: raise ValueError('Expected 192 tests but found {}'.format(test_count)) - setup_rnn_tests() @@ -4654,17 +4786,25 @@ def setup_rnn_tests(): dict(TestONNXRuntime.__dict__, opset_version=12, keep_initializers_as_inputs=False)) -# opset 9 tests, with use_new_jit_passes=True for using new jit API -TestONNXRuntime_opset9_new_jit_API = type(str("TestONNXRuntime_opset9_new_jit_API"), - (unittest.TestCase,), - dict(TestONNXRuntime.__dict__, - use_new_jit_passes=True)) - -# opset 12 tests, with use_new_jit_passes=True for using new jit API -TestONNXRuntime_opset12_new_jit_API = type(str("TestONNXRuntime_opset12_new_jit_API"), - (unittest.TestCase,), - dict(TestONNXRuntime.__dict__, opset_version=12, - use_new_jit_passes=True)) + +# opset 9 tests, with use_new_jit_passes=True for using new jit API, +# and with keep_initializers_as_inputs=False for IR version 4 style export. +TestONNXRuntime_opset9_IRv4_new_jit_API = type(str("TestONNXRuntime_opset9_IRv4_new_jit_API"), + (unittest.TestCase,), + dict(TestONNXRuntime.__dict__, + keep_initializers_as_inputs=False, + use_new_jit_passes=True, + onnx_shape_inference=True)) + + +# opset 12 tests, with use_new_jit_passes=True for using new jit API, +# and keep_initializers_as_inputs=False for IR version 4 style export. +TestONNXRuntime_opset12_IRv4_new_jit_API = type(str("TestONNXRuntime_opset12_IRv4_new_jit_API"), + (unittest.TestCase,), + dict(TestONNXRuntime.__dict__, opset_version=12, + keep_initializers_as_inputs=False, + use_new_jit_passes=True, + onnx_shape_inference=True)) # opset 12 tests, with _onnx_shape_inference=True. diff --git a/test/onnx/test_pytorch_onnx_shape_inference.py b/test/onnx/test_pytorch_onnx_shape_inference.py new file mode 100644 index 000000000000..b0b56d9296c7 --- /dev/null +++ b/test/onnx/test_pytorch_onnx_shape_inference.py @@ -0,0 +1,78 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import unittest +import torch + +import copy + +import test_pytorch_onnx_onnxruntime +from test_pytorch_onnx_onnxruntime import TestONNXRuntime +from torch.onnx import utils, OperatorExportTypes, TrainingMode +from torch.onnx.utils import _validate_dynamic_axes +from torch.onnx.symbolic_helper import (_set_opset_version, _set_operator_export_type, + _set_onnx_shape_inference, _set_training_mode, + _is_tensor_list, _is_tensor, _is_none) + + +def verify_inferred_shape(graph): + # Check every node in graph has type properly assigned. 
+ for n in graph.nodes(): + for out in n.outputs(): + if not _is_tensor_list(out) and not _is_tensor(out) and not _is_none(out): + raise RuntimeError("Output of node is neither type Tensor nor type list of Tensor: ", out) + if _is_tensor(out) and out.type().scalarType() is None: + raise RuntimeError("Output of node does not have type assigned", out) + if _is_tensor(out) and out.type().dim() is None: + raise RuntimeError("Output of node does not have shape assigned", out) + + +def run_model_test(self, model, batch_size=2, state_dict=None, + input=None, use_gpu=True, rtol=0.001, atol=1e-7, + example_outputs=None, do_constant_folding=True, + dynamic_axes=None, test_with_inputs=None, + input_names=None, output_names=None, + fixed_batch_size=False): + model.eval() + + if input is None: + input = torch.randn(batch_size, 3, 224, 224, requires_grad=True) + + with torch.no_grad(): + if isinstance(input, torch.Tensor): + input = (input,) + # In-place operators will update input tensor data as well. + # Thus inputs are replicated before every forward call. + input_copy = copy.deepcopy(input) + output = model(*input_copy) + if isinstance(output, torch.Tensor): + output = (output,) + + _set_opset_version(self.opset_version) + _set_operator_export_type(OperatorExportTypes.ONNX) + _set_onnx_shape_inference(True) + _set_training_mode(False) + if dynamic_axes is None: + dynamic_axes = {} + _validate_dynamic_axes(dynamic_axes, model, input_names, output_names) + + input_copy = copy.deepcopy(input) + graph, _, _ = utils._model_to_graph(model, input_copy, + input_names=input_names, + output_names=output_names, + operator_export_type=OperatorExportTypes.ONNX, + example_outputs=output, + do_constant_folding=do_constant_folding, + training=TrainingMode.EVAL, + use_new_jit_passes=self.use_new_jit_passes, + dynamic_axes=dynamic_axes) + verify_inferred_shape(graph) + + +if __name__ == '__main__': + TestONNXRuntime.opset_version = 12 + test_pytorch_onnx_onnxruntime.run_model_test = run_model_test + + unittest.main() diff --git a/test/package_a/__init__.py b/test/package_a/__init__.py new file mode 100644 index 000000000000..4761b3db5e41 --- /dev/null +++ b/test/package_a/__init__.py @@ -0,0 +1,7 @@ +result = 'package_a' + +class PackageAObject: + __slots__ = ['obj'] + + def __init__(self, obj): + self.obj = obj diff --git a/test/package_a/subpackage.py b/test/package_a/subpackage.py new file mode 100644 index 000000000000..46f729d51852 --- /dev/null +++ b/test/package_a/subpackage.py @@ -0,0 +1,3 @@ +result = 'package_a.subpackage' +class PackageASubpackageObject: + pass diff --git a/test/print_test_stats.py b/test/print_test_stats.py index 522e6652efe1..339f6800f61b 100755 --- a/test/print_test_stats.py +++ b/test/print_test_stats.py @@ -84,6 +84,7 @@ def build_message(test_case): "build_tag": os.environ.get("CIRCLE_TAG"), "build_sha1": os.environ.get("CIRCLE_SHA1"), "build_branch": os.environ.get("CIRCLE_BRANCH"), + "build_job": os.environ.get("CIRCLE_JOB"), "test_suite_name": test_case.class_name, "test_case_name": test_case.name, }, diff --git a/test/quantization/test_quantize.py b/test/quantization/test_quantize.py index 91594da111c1..e54eb33770c2 100644 --- a/test/quantization/test_quantize.py +++ b/test/quantization/test_quantize.py @@ -14,6 +14,8 @@ fuse_modules, quantize_dynamic, QuantWrapper, + QuantStub, + DeQuantStub, QConfig, default_qconfig, default_qat_qconfig, @@ -21,6 +23,8 @@ per_channel_dynamic_qconfig, float16_dynamic_qconfig, float_qparams_dynamic_qconfig, + 
register_observed_custom_module_mapping, + register_quantized_custom_module_mapping, ) from torch.testing._internal.common_quantization import ( @@ -571,6 +575,115 @@ def forward(self, indices, offsets, per_sample_weights, linear_in): self.checkLinear(model.fc) self.checkDynamicQuantizedModule(quantized_model.emb, torch.nn.quantized.EmbeddingBag, torch.quint8) + @skipIfNoFBGEMM + def test_custom_module_class(self): + class CustomModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = torch.nn.Conv2d(1, 1, 1) + + def forward(self, x): + return self.conv(x) + + class ObservedCustomModule(torch.nn.Module): + def __init__(self, conv): + super().__init__() + self.conv = conv + + def forward(self, x): + return self.conv(x) + + @classmethod + def from_float(cls, float_module): + assert hasattr(float_module, 'qconfig') + observed = cls(float_module.conv) + observed.qconfig = float_module.qconfig + return observed + + class QuantizedCustomModule(torch.nn.Module): + def __init__(self, conv): + super().__init__() + self.conv = conv + + def forward(self, x): + return self.conv(x) + + @classmethod + def from_observed(cls, observed_module): + assert hasattr(observed_module, 'qconfig') + assert hasattr(observed_module, 'activation_post_process') + observed_module.conv.activation_post_process = \ + observed_module.activation_post_process + quantized = cls(nnq.Conv2d.from_float(observed_module.conv)) + return quantized + + register_observed_custom_module_mapping(CustomModule, ObservedCustomModule) + register_quantized_custom_module_mapping(CustomModule, QuantizedCustomModule) + + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.quant = QuantStub() + self.conv = torch.nn.Conv2d(1, 1, 1) + self.custom = CustomModule() + self.dequant = DeQuantStub() + + def forward(self, x): + x = self.quant(x) + x = self.conv(x) + x = self.custom(x) + x = self.dequant(x) + return x + + class RefM(torch.nn.Module): + def __init__(self): + super().__init__() + self.quant = QuantStub() + self.conv1 = torch.nn.Conv2d(1, 1, 1) + self.conv2 = torch.nn.Conv2d(1, 1, 1) + self.dequant = DeQuantStub() + + def forward(self, x): + x = self.quant(x) + x = self.conv1(x) + x = self.conv2(x) + x = self.dequant(x) + return x + + data = torch.randn(1, 1, 1, 1) + # instantiate M and RefM and align the parameters + original_m = M() + original_ref_m = RefM() + original_ref_m.conv1.weight = torch.nn.Parameter(original_m.conv.weight.detach()) + original_ref_m.conv1.bias = torch.nn.Parameter(original_m.conv.bias.detach()) + original_ref_m.conv2.weight = torch.nn.Parameter(original_m.custom.conv.weight.detach()) + original_ref_m.conv2.bias = torch.nn.Parameter(original_m.custom.conv.bias.detach()) + + original_m.qconfig = default_qconfig + m = prepare(original_m) + self.checkObservers(m) + # calibration + m(data) + # all activation observers are inserted in the top level module + + # check converted/quantized model + m = convert(m) + # check if the module is properly quantized + self.assertEqual(type(m.quant), nnq.Quantize) + self.assertEqual(type(m.conv), nnq.Conv2d) + self.assertEqual(type(m.custom.conv), nnq.Conv2d) + self.assertEqual(type(m.dequant), nnq.DeQuantize) + res = m(data) + + # quantize the reference model + original_ref_m.eval() + original_ref_m.qconfig = default_qconfig + ref_m = prepare(original_ref_m) + ref_m(data) + ref_m = convert(ref_m) + ref_res = ref_m(data) + self.assertEqual(res, ref_res) + @skipIfNoFBGEMM class TestPostTrainingDynamic(QuantizationTestCase): diff --git 
a/test/quantization/test_quantize_fx.py b/test/quantization/test_quantize_fx.py index 3170bfbfe8b4..53551efb7c0f 100644 --- a/test/quantization/test_quantize_fx.py +++ b/test/quantization/test_quantize_fx.py @@ -9,17 +9,16 @@ # symbolic trace from torch.fx import symbolic_trace +from torch.fx.symbolic_trace import Tracer + # graph mode quantization based on fx from torch.quantization import ( QuantType, - fuse_fx, prepare_fx, convert_fx, - prepare_static_fx, - convert_static_fx, - quantize_static_fx, - quantize_dynamic_fx, prepare_qat_fx, + register_observed_custom_module_mapping, + register_quantized_custom_module_mapping, ) from torch.quantization import ( @@ -40,6 +39,7 @@ skip_if_no_torchvision, train_one_epoch, run_ddp, + LinearModelWithSubmodule, ) from torch.testing._internal.common_quantized import ( @@ -58,7 +58,9 @@ import itertools import operator import unittest +import io +@skipIfNoFBGEMM class TestQuantizeFx(QuantizationTestCase): def _get_conv_linear_test_cases(self): ''' Returns a list of test cases, with format: @@ -151,11 +153,11 @@ def test_functional_debug(self): quant_type = QuantType.DYNAMIC if is_dynamic else QuantType.STATIC node_occurrence = dict() if weight_prepack_node: - node_occurrence[weight_prepack_node] = 1 + node_occurrence[weight_prepack_node] = 0 + node_occurrence[quantized_node] = 0 self.checkGraphModeFxOp( ModuleClass(*module_constructor_inputs), inputs, quant_type, - expected_node=quantized_node, expected_node_occurrence=node_occurrence, debug=True) @@ -176,7 +178,8 @@ def forward(self, x): original = symbolic_trace(m) qconfig = default_dynamic_qconfig qconfig_dict = {'': qconfig} - quantized = quantize_dynamic_fx(original, qconfig_dict, debug=True) + prepared = prepare_fx(original, qconfig_dict) + quantized = convert_fx(prepared, debug=True) qparams = (quantized._scale_0, quantized._zero_point_0) weight_obs = qconfig.weight() weight_obs(quantized.weight) @@ -219,14 +222,12 @@ def forward(self, x): for debug in [True, False]: node_occurrence = dict() if weight_prepack_node: - if debug: - node_occurrence[weight_prepack_node] = 1 - else: - node_occurrence[weight_prepack_node] = 0 + node_occurrence[weight_prepack_node] = 0 m = ModuleClass(*module_constructor_inputs).eval() m = symbolic_trace(m) qconfig_dict = {"": float16_dynamic_qconfig} - m = quantize_dynamic_fx(m, qconfig_dict, debug=debug) + m = prepare_fx(m, qconfig_dict) + m = convert_fx(m, debug=debug) self.checkGraphModuleNodes(m, expected_node_occurrence=node_occurrence) @@ -262,8 +263,7 @@ def forward(self, x): model = symbolic_trace(model) # QAT prepare - model = fuse_fx(model) - model = prepare_fx(model, qconfig_dict) + model = prepare_qat_fx(model, qconfig_dict) # ensure that running an input on CUDA works without any needed changes input = torch.randn(4, 1, 4, 4, device=device) @@ -286,13 +286,19 @@ def __init__(self): def forward(self, x): return self.conv(x) - model = symbolic_trace(M().eval()) + model = M().eval() + model = symbolic_trace(model) qconfig_dict = {'': default_qconfig} - non_inplace_model = quantize_static_fx( - model, qconfig_dict, test_only_eval_fn, [self.img_data_2d], inplace=False) - inplace_model = model - inplace_model = quantize_static_fx( - inplace_model, qconfig_dict, test_only_eval_fn, [self.img_data_2d], inplace=True) + prepared = prepare_fx( + model, qconfig_dict, inplace=False) + test_only_eval_fn(model, self.img_data_2d) + non_inplace_model = convert_fx(prepared, inplace=True) + + prepared = prepare_fx( + model, qconfig_dict, inplace=True) + 
test_only_eval_fn(model, self.img_data_2d) + inplace_model = convert_fx(prepared, inplace=True) + non_inplace_res = non_inplace_model(self.img_data_2d[0][0]) inplace_res = inplace_model(self.img_data_2d[0][0]) self.assertEqual(non_inplace_res, inplace_res) @@ -312,11 +318,101 @@ def forward(self, x): dict_input = {"input": torch.randn(1, 1, 1, 1)} m = symbolic_trace(M()).eval() qconfig_dict = {"": default_qconfig} - m = prepare_static_fx(m, qconfig_dict) + m = prepare_fx(m, qconfig_dict) m(dict_input) - m = convert_static_fx(m) + m = convert_fx(m) m(dict_input) + def test_standalone_module_class(self): + class StandaloneModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = torch.nn.Conv2d(1, 1, 1) + + def forward(self, x): + return self.conv(x) + + class CustomTracer(Tracer): + def is_leaf_module(self, m, module_qualified_name): + return (m.__module__.startswith('torch.nn') and + not isinstance(m, torch.nn.Sequential)) or \ + isinstance(m, StandaloneModule) + + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = torch.nn.Conv2d(1, 1, 1) + self.standalone = StandaloneModule() + + def forward(self, x): + x = self.conv(x) + x = self.standalone(x) + return x + + class RefM(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv1 = torch.nn.Conv2d(1, 1, 1) + self.conv2 = torch.nn.Conv2d(1, 1, 1) + + def forward(self, x): + x = self.conv1(x) + x = self.conv2(x) + return x + + data = torch.randn(1, 1, 1, 1) + # instantiate M and RefM and align the parameters + original_m = M() + original_ref_m = RefM() + original_ref_m.conv1.weight = torch.nn.Parameter(original_m.conv.weight.detach()) + original_ref_m.conv1.bias = torch.nn.Parameter(original_m.conv.bias.detach()) + original_ref_m.conv2.weight = torch.nn.Parameter(original_m.standalone.conv.weight.detach()) + original_ref_m.conv2.bias = torch.nn.Parameter(original_m.standalone.conv.bias.detach()) + + m = torch.fx.GraphModule(original_m, CustomTracer().trace(original_m)).eval() + qconfig_dict = {'': default_qconfig, 'standalone_module_name': ['standalone']} + # check prepared model + m = prepare_fx(m, qconfig_dict) + # calibration + m(data) + # input and output of first conv, observer for standalone module + # will be inserted in the standalone module itself + count_check = { + ns.call_module(torch.quantization.MinMaxObserver): 2 + } + self.checkGraphModuleNodes(m, expected_node_occurrence=count_check) + # for output of conv in the standalone module + count_check = { + ns.call_module(torch.quantization.MinMaxObserver): 1 + } + self.checkGraphModuleNodes(m.standalone, expected_node_occurrence=count_check) + + # check converted/quantized model + m = convert_fx(m) + count_check = { + ns.call_function(torch.quantize_per_tensor) : 1, + ns.call_module(nnq.Conv2d) : 1, + ns.call_method('dequantize') : 1, + } + self.checkGraphModuleNodes(m, expected_node_occurrence=count_check) + count_check = { + # quantization of input happens in parent module + # quantization of output happens in the quantized conv module + ns.call_function(torch.quantize_per_tensor) : 0, + # dequantization for output happens in parent module + ns.call_method('dequantize') : 0, + } + self.checkGraphModuleNodes(m.standalone, expected_node_occurrence=count_check) + res = m(data) + + # quantize the reference model + ref_m = symbolic_trace(original_ref_m).eval() + ref_m = prepare_fx(ref_m, qconfig_dict) + ref_m(data) + ref_m = convert_fx(ref_m) + ref_res = ref_m(data) + self.assertEqual(res, ref_res) + 
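
The hunks above migrate these tests from the removed one-shot quantize_static_fx/quantize_dynamic_fx helpers to an explicit prepare_fx -> calibrate -> convert_fx flow driven by a qconfig_dict. A minimal sketch of that flow, assuming the FBGEMM backend is available (mirroring the @skipIfNoFBGEMM guards); the toy module and calibration input are illustrative stand-ins, not taken from this PR:

import torch
import torch.nn as nn
from torch.fx import symbolic_trace
from torch.quantization import default_qconfig, prepare_fx, convert_fx

class SmallConv(nn.Module):
    # illustrative float module, not from the PR
    def __init__(self):
        super().__init__()
        self.conv = nn.Conv2d(1, 1, 1)

    def forward(self, x):
        return self.conv(x)

# symbolically trace, insert observers, calibrate, then convert
m = symbolic_trace(SmallConv().eval())
qconfig_dict = {"": default_qconfig}   # global qconfig, as used in the tests above
m = prepare_fx(m, qconfig_dict)        # inserts observers
m(torch.randn(1, 1, 4, 4))             # calibration run
m = convert_fx(m)                      # swaps in quantized modules (e.g. nnq.Conv2d)
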
@skipIfNoFBGEMM def test_qconfig_none(self): class M(torch.nn.Module): @@ -332,21 +428,142 @@ def forward(self, x): m = M().eval() m = symbolic_trace(m) - qconfig_dict = {'': default_qconfig, 'conv2': None} - m = prepare_static_fx(m, qconfig_dict) + qconfig_dict = {"": default_qconfig, + "module_name": [("conv2", None)]} + m = prepare_fx(m, qconfig_dict) data = torch.randn(1, 1, 1, 1) m(data) - m = convert_static_fx(m) + m = convert_fx(m) m(data) # first conv is quantized, second conv is not quantized node_list = [ ns.call_function(torch.quantize_per_tensor), ns.call_module(nnq.Conv2d), - ns.call_method('dequantize'), + ns.call_method("dequantize"), ns.call_module(nn.Conv2d), ] self.checkGraphModuleNodes(m, expected_node_list=node_list) + def test_qconfig_module_type(self): + class M(torch.nn.Module): + def __init__(self): + super(M, self).__init__() + self.conv1 = nn.Conv2d(1, 1, 1) + self.conv2 = nn.Conv2d(1, 1, 1) + + def forward(self, x): + x = self.conv1(x) + x = self.conv2(x) + return x + + m = M().eval() + m = symbolic_trace(m) + qconfig_dict = {"object_type": [(torch.nn.Conv2d, default_qconfig)]} + m = prepare_fx(m, qconfig_dict) + data = torch.randn(1, 1, 1, 1) + m(data) + m = convert_fx(m) + m(data) + # first conv is quantized, second conv is not quantized + node_list = [ + ns.call_function(torch.quantize_per_tensor), + ns.call_module(nnq.Conv2d), + ns.call_module(nnq.Conv2d), + ns.call_method("dequantize"), + ] + self.checkGraphModuleNodes(m, expected_node_list=node_list) + + def test_qconfig_function(self): + class M(torch.nn.Module): + def __init__(self): + super(M, self).__init__() + + def forward(self, x, y): + return x + y + + m = M().eval() + m = symbolic_trace(m) + qconfig_dict = {"object_type": [(operator.add, default_qconfig)]} + m = prepare_fx(m, qconfig_dict) + data = torch.randn(1, 1, 1, 1) + m(data, data) + m = convert_fx(m) + m(data, data) + # first conv is quantized, second conv is not quantized + node_list = [ + ns.call_function(torch.quantize_per_tensor), + ns.call_function(torch.ops.quantized.add), + ns.call_method("dequantize"), + ] + self.checkGraphModuleNodes(m, expected_node_list=node_list) + + def test_qconfig_module_name_regex(self): + class M(torch.nn.Module): + def __init__(self): + super(M, self).__init__() + self.conv1 = nn.Conv2d(1, 1, 1) + self.conv2 = nn.Conv2d(1, 1, 1) + + def forward(self, x): + x = self.conv1(x) + x = self.conv2(x) + return x + + m = M().eval() + m = symbolic_trace(m) + qconfig_dict = {"module_name_regex": [("conv*", default_qconfig)]} + m = prepare_fx(m, qconfig_dict) + data = torch.randn(1, 1, 1, 1) + m(data) + m = convert_fx(m) + m(data) + # first conv is quantized, second conv is not quantized + node_list = [ + ns.call_function(torch.quantize_per_tensor), + ns.call_module(nnq.Conv2d), + ns.call_module(nnq.Conv2d), + ns.call_method("dequantize"), + ] + self.checkGraphModuleNodes(m, expected_node_list=node_list) + + def test_qconfig_precedence(self): + class M(torch.nn.Module): + def __init__(self): + super(M, self).__init__() + self.linear = nn.Linear(1, 1) + self.conv = nn.Conv2d(1, 1, 1) + self.module_conv1 = nn.Conv2d(1, 1, 1) + self.module_conv2 = nn.Conv2d(1, 1, 1) + + def forward(self, x): + # global + x = self.linear(x) + # global + object_type --> object_type + x = self.conv(x) + # global + object_type + module_name_regex --> module_name_regex + x = self.module_conv1(x) + # global + object_type + module_name_regex + module_name --> module_name + x = self.module_conv2(x) + return x + + m = M().eval() + m = 
symbolic_trace(m) + global_qconfig = default_qconfig + object_type_qconfig = default_dynamic_qconfig + module_name_regex_qconfig = float16_dynamic_qconfig + module_name_qconfig = default_qat_qconfig + qconfig_dict = { + "": global_qconfig, + "object_type": [(nn.Conv2d, object_type_qconfig)], + "module_name_regex": [("module_conv*", module_name_regex_qconfig)], + "module_name": [("module_conv2", module_name_qconfig)]} + m = prepare_fx(m, qconfig_dict) + self.assertEqual(m.linear.qconfig, global_qconfig) + self.assertEqual(m.conv.qconfig, object_type_qconfig) + self.assertEqual(m.module_conv1.qconfig, module_name_regex_qconfig) + self.assertEqual(m.module_conv2.qconfig, module_name_qconfig) + + def test_remove_qconfig(self): class M(torch.nn.Module): def __init__(self): @@ -359,10 +576,10 @@ def forward(self, x): m = M().eval() m = symbolic_trace(m) qconfig_dict = {'': default_qconfig} - m = prepare_static_fx(m, qconfig_dict) + m = prepare_fx(m, qconfig_dict) data = torch.randn(1, 1, 1, 1) m(data) - m = convert_static_fx(m) + m = convert_fx(m) m(data) for name, module in m.named_modules(): self.assertFalse(hasattr(module, 'qconfig'), @@ -370,28 +587,8 @@ def forward(self, x): @skipIfNoFBGEMM def test_qat_and_script(self): - class TwoLayerLinear(nn.Module): - def __init__(self): - super(TwoLayerLinear, self).__init__() - self.fc1 = nn.Linear(5, 5) - self.fc2 = nn.Linear(5, 5) - - def forward(self, x): - x = self.fc1(x) - return self.fc2(x) - class Model(nn.Module): - def __init__(self): - super(Model, self).__init__() - self.subm = TwoLayerLinear() - self.fc = nn.Linear(5, 5) - - def forward(self, x): - x = self.subm(x) - x = self.fc(x) - return x - - model = Model() + model = LinearModelWithSubmodule() qengine = torch.backends.quantized.engine qconfig_dict = {'': torch.quantization.get_default_qat_qconfig(qengine)} @@ -429,59 +626,172 @@ def forward(self, x): @skipIfNoFBGEMM def test_save_observer_state_dict(self): - class TwoLayerLinear(nn.Module): - def __init__(self): - super(TwoLayerLinear, self).__init__() - self.fc1 = nn.Linear(5, 5) - self.fc2 = nn.Linear(5, 5) - - def forward(self, x): - x = self.fc1(x) - return self.fc2(x) - - class Model(nn.Module): - def __init__(self): - super(Model, self).__init__() - self.subm = TwoLayerLinear() - self.fc = nn.Linear(5, 5) - - def forward(self, x): - x = self.subm(x) - x = self.fc(x) - return x - - model = Model().eval() + orig = LinearModelWithSubmodule().eval() + model = orig qconfig_dict = {'': torch.quantization.get_default_qconfig('fbgemm')} - # symbolically trace model = symbolic_trace(model) - model = prepare_static_fx(model, qconfig_dict) + model = prepare_fx(model, qconfig_dict) # run it through input x = torch.randn(5, 5) model(x) - quant = convert_static_fx(model) + quant = convert_fx(model) # save state_dict of model - import io + obs_dict = torch.quantization.get_observer_state_dict(model) b = io.BytesIO() - torch.save(model.state_dict(), b) + torch.save(obs_dict, b) b.seek(0) # Load the stats into new model - model_2 = Model().eval() + model_2 = orig model_2 = symbolic_trace(model_2) - model_2 = prepare_static_fx(model_2, qconfig_dict) + model_2 = prepare_fx(model_2, qconfig_dict) loaded_dict = torch.load(b) - model_2.load_state_dict(loaded_dict) + torch.quantization.load_observer_state_dict(model_2, loaded_dict) - quant_2 = convert_static_fx(model_2) + quant_2 = convert_fx(model_2) # Verify that loaded state dict produces same results. 
self.assertEqual(quant(x), quant_2(x)) + @skipIfNoFBGEMM + def test_custom_module_class(self): + class CustomModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = torch.nn.Conv2d(1, 1, 1) + + def forward(self, x): + return self.conv(x) + + class ObservedCustomModule(torch.nn.Module): + def __init__(self, conv): + super().__init__() + self.conv = conv + + def forward(self, x): + return self.conv(x) + + @classmethod + def from_float(cls, float_module): + assert hasattr(float_module, 'qconfig') + observed = cls(float_module.conv) + observed.qconfig = float_module.qconfig + return observed + + class QuantizedCustomModule(torch.nn.Module): + def __init__(self, conv): + super().__init__() + self.conv = conv + + def forward(self, x): + return self.conv(x) + + @classmethod + def from_observed(cls, observed_module): + assert hasattr(observed_module, 'qconfig') + assert hasattr(observed_module, 'activation_post_process') + observed_module.conv.activation_post_process = \ + observed_module.activation_post_process + quantized = cls(nnq.Conv2d.from_float(observed_module.conv)) + return quantized + + class DynamicallyQuantizedCustomModule(torch.nn.Module): + def __init__(self, conv): + super().__init__() + self.conv = conv + + def forward(self, x): + return self.conv(x) + + @classmethod + def from_observed(cls, observed_module): + assert hasattr(observed_module, 'qconfig') + assert hasattr(observed_module, 'activation_post_process') + quantized = cls(nnqd.Conv2d.from_float(observed_module.conv)) + return quantized + + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = torch.nn.Conv2d(1, 1, 1) + self.custom = CustomModule() + + def forward(self, x): + x = self.conv(x) + x = self.custom(x) + return x + + class RefM(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv1 = torch.nn.Conv2d(1, 1, 1) + self.conv2 = torch.nn.Conv2d(1, 1, 1) + + def forward(self, x): + x = self.conv1(x) + x = self.conv2(x) + return x + + data = torch.randn(1, 1, 1, 1) + # instantiate M and RefM and align the parameters + original_m = M() + original_ref_m = RefM() + original_ref_m.conv1.weight = torch.nn.Parameter(original_m.conv.weight.detach()) + original_ref_m.conv1.bias = torch.nn.Parameter(original_m.conv.bias.detach()) + original_ref_m.conv2.weight = torch.nn.Parameter(original_m.custom.conv.weight.detach()) + original_ref_m.conv2.bias = torch.nn.Parameter(original_m.custom.conv.bias.detach()) + + from torch.fx.symbolic_trace import Tracer + + # define a custom tracer to not trace through the custom module + + class CustomTracer(Tracer): + def is_leaf_module(self, m, module_qualified_name): + return (m.__module__.startswith('torch.nn') and + not isinstance(m, torch.nn.Sequential)) or \ + isinstance(m, CustomModule) + + # TODO: add other quant types after mixed mode support + for quant_type in [QuantType.STATIC]: + # register observed and quantized custom module classes + register_observed_custom_module_mapping(CustomModule, ObservedCustomModule) + register_quantized_custom_module_mapping(CustomModule, QuantizedCustomModule) + + m = torch.fx.GraphModule(original_m, CustomTracer().trace(original_m)).eval() + qconfig_dict = {'': default_qconfig} + # check prepared model + m = prepare_fx(m, qconfig_dict) + # calibration + m(data) + # all activation observers are inserted in the top level module + count_check = { + ns.call_module(torch.quantization.MinMaxObserver): 3 + } + self.checkGraphModuleNodes(m, expected_node_occurrence=count_check) + + # check 
converted/quantized model + m = convert_fx(m) + count_check = { + ns.call_function(torch.quantize_per_tensor) : 1, + ns.call_module(nnq.Conv2d) : 1, + ns.call_method('dequantize') : 1, + } + self.checkGraphModuleNodes(m, expected_node_occurrence=count_check) + res = m(data) + + # quantize the reference model + ref_m = symbolic_trace(original_ref_m).eval() + ref_m = prepare_fx(ref_m, qconfig_dict) + ref_m(data) + ref_m = convert_fx(ref_m) + ref_res = ref_m(data) + self.assertEqual(res, ref_res) + class TestQuantizeFxOps(QuantizationTestCase): """Unit tests for individual ops """ @@ -1037,7 +1347,7 @@ def forward(self, x): data = torch.rand(1, 3, 10, 10) # This model is not executable since we just put all ops # in the same forward - m = M() + m = M().eval() original = symbolic_trace(m) # nothing to fuse so skipping the fuse step qconfig_dict = {'': default_qconfig} @@ -1132,7 +1442,7 @@ def forward(self, x): # This model is not executable since we just put all ops # in the same forward - m = M() + m = M().eval() original = symbolic_trace(m) # nothing to fuse so skipping the fuse step qconfig_dict = {'': default_qconfig} @@ -1201,7 +1511,6 @@ def _test_model_impl( if mode != 'static': model.train() - graph_module = fuse_fx(graph_module) prepared = prepare_fx(graph_module, qconfig_dict) if mode == 'ddp': diff --git a/test/quantization/test_quantize_jit.py b/test/quantization/test_quantize_jit.py index 6d94919eee1f..a0fad9b80e89 100644 --- a/test/quantization/test_quantize_jit.py +++ b/test/quantization/test_quantize_jit.py @@ -51,6 +51,7 @@ SkipQuantModel, NestedModel, ConvModel, + ConvTransposeModel, default_per_channel_qconfig, test_only_eval_fn, ConvBnModel, @@ -61,6 +62,7 @@ AnnotatedSkipQuantModel, AnnotatedNestedModel, AnnotatedConvModel, + AnnotatedConvTransposeModel, AnnotatedConvBnModel, ) @@ -74,6 +76,7 @@ # Standard library import itertools import unittest +import io class TestQuantizeJitPasses(QuantizationTestCase): """ Test graph mode quantization passes used by quantize_jit @@ -1361,6 +1364,52 @@ def forward(self, x, y): FileCheck().check("quantized::embedding_bag_byte_rowwise_offsets") \ .run(m.graph) + @skipIfNoFBGEMM + def test_quantize_fork_wait(self): + """ Tests the case where fork and wait calls are in different subgraphs + Calling inline fork-wait only removes the fork call and leaves aten::wait + calls in the graph, with Tensor as input (instead of Future[Tensor]) + """ + class MainModule(nn.Module): + def __init__(self): + super(MainModule, self).__init__() + self.fork_ops = ForkModule() + + def init_values(self, x): + shared_module = self.fork_ops(x) + self.fork_dict = shared_module + + def forward(self, x): + val = torch.jit._wait(self.fork_ops(x)) + return val + + class TestModule(torch.nn.Module): + def __init__(self): + super(TestModule, self).__init__() + + def forward(self, x): + w = torch.ones(5, 5) + b = torch.zeros(5) + return torch.nn.functional.linear(x, w, b) + + class ForkModule(nn.Module): + def __init__(self): + super(ForkModule, self).__init__() + self.test = TestModule() + + def forward(self, x): + fut = torch.jit._fork(self.test.forward, x) + return fut + + model = MainModule().eval() + traced = torch.jit.trace(model, (torch.randn(5, 5),)) + model = prepare_dynamic_jit(traced, {'' : default_qconfig}) + model = convert_dynamic_jit(model) + FileCheck().check("quantized::linear_dynamic") \ + .run(model.graph) + # Make sure model save works + b = io.BytesIO() + torch.jit.save(model, b) class TestQuantizeJitOps(QuantizationTestCase): """ Test graph mode 
post training static quantization works @@ -2683,11 +2732,11 @@ def forward(self, x): else: # for input of FC for dynamic quant assert len(attrs_with_prefix(m, '_observer_')) == 1 - observer_name = 'DynamicQuantObserver = prim::GetAttr[name="_observer_' + observer_name = 'Observer = prim::GetAttr[name="_observer_' FileCheck().check(observer_name) \ .check('prim::GetAttr[name="fc"]') \ .check('prim::CallMethod') \ - .check_not('Observer = prim::GetAttr[name="_observer_') \ + .check_not(observer_name) \ .run(m.graph) @@ -2723,7 +2772,7 @@ def forward(self, x): assert len(attrs_with_prefix(m.sub.fc, '_observer_')) == 1 FileCheck().check('prim::GetAttr[name="sub') \ .check('prim::CallMethod') \ - .check('DynamicQuantObserver = prim::GetAttr[name="_observer_') \ + .check('Observer = prim::GetAttr[name="_observer_') \ .check('prim::CallMethod') \ .check_not('Observer = prim::GetAttr[name="_observer_') \ .run(m.graph) @@ -3124,6 +3173,35 @@ def test_conv(self): inplace=False) self.assertEqual(model_quantized(self.img_data_2d[0][0]), result_eager) + @override_qengines + def test_conv_transpose(self): + r"""Compare the result of quantizing conv_transpose layer in + eager mode and graph mode + """ + if not qengine_is_qnnpack(): + return # Currently only qnnpack is supported + # eager mode + annotated_conv_model = AnnotatedConvTransposeModel( + torch.backends.quantized.engine).eval() + conv_model = ConvTransposeModel().eval() + # copy the weight from eager mode so that we can + # compare the result of the two quantized models later + conv_model.conv.weight = torch.nn.Parameter(annotated_conv_model.conv.weight.detach()) + model_eager = quantize(annotated_conv_model, test_only_eval_fn, self.img_data_2d) + qconfig_dict = {'': get_default_qconfig(torch.backends.quantized.engine)} + model_traced = torch.jit.trace(conv_model, self.img_data_2d[0][0]) + model_script = torch.jit.script(conv_model) + result_eager = model_eager(self.img_data_2d[0][0]) + for model_under_test in [model_traced, model_script]: + model_quantized = quantize_jit( + model_under_test, + qconfig_dict, + test_only_eval_fn, + [self.img_data_2d], + inplace=False) + self.assertEqual(model_quantized(self.img_data_2d[0][0]), + result_eager) + @override_qengines def test_conv_bn(self): r"""Compare the result of quantizing conv + bn layer in diff --git a/test/quantization/test_quantized_op.py b/test/quantization/test_quantized_op.py index 3a0e6f10bf33..ceef43dca51c 100644 --- a/test/quantization/test_quantized_op.py +++ b/test/quantization/test_quantized_op.py @@ -119,7 +119,6 @@ def _get_random_tensor_and_q_params(shapes, rand_scale, torch_type): X_scale = 1e-10 return X, X_scale, X_zero_point - class TestQuantizedOps(TestCase): """Helper function to test quantized activation functions.""" @@ -141,17 +140,17 @@ def _test_activation_function(self, X, fn_name, test_configs): quantized_fn: a list of the quantized functions to be tested reference_fn: the original reference function to be called on the the dequantized X - inplace_kwarg: the additional inplace keyword argument to test in-place + extra_kwargs: the additional keyword arguments for each test entry in ops_under_test, it must have at least the fields - for quantized_fn and reference_fn. If inplace_kwarg is missing, the - quantized function is assumed to be either inplace by default or the - test is not testing an inplace function. + for quantized_fn and reference_fn. output_range: the output range the operator will map to. 
By default, if it is no specified, the range will not be controlled and depend on Xmin and Xmax. change_zero_point: a boolean flag indicating if the zero point parameter should be determined based on torch_type during quantization (see sigmoid/hardsigmoid for examples). By default, if it is not specified, change_zero_point is assumed to be False and zero point will just take on the default value from X. + `output_is_observed`: if specified and is True, we'll append extra + output_scale/output_zero_point keyword argument when calling quantized op """ # Retrives the default parameters from X. X, (scale, zero_point, torch_type) = X @@ -163,15 +162,15 @@ def _test_activation_function(self, X, fn_name, test_configs): for op_group in test_configs: ref_op = op_group['reference_fn'] for q_op in op_group['quantized_fn']: + # Retrieves the inplace keyword arguments + # some functions require inplace=True to test in-place. + extra_kwargs = op_group.get('extra_kwargs', dict()) + output_is_observed = op_group.get('output_is_observed', False) # Quantizes and dequantizes to account for max error. qX = torch.quantize_per_tensor(X, scale=scale, zero_point=zero_point, dtype=torch_type) dqX = qX.dequantize() - dqY_hat = ref_op(dqX.clone()) - - # Retrieves the inplace keyword arguments - # some functions require inplace=True to test in-place. - inplace_kwarg = op_group.get('inplace_kwarg', dict()) + dqY_hat = ref_op(dqX.clone(), **extra_kwargs) # Adjusts output_scale if needed. # The output_scale determines the quantization scale for functions that @@ -195,8 +194,11 @@ def _test_activation_function(self, X, fn_name, test_configs): zero_point=output_zero_point, dtype=torch_type) + if output_is_observed: + extra_kwargs.update({'output_scale': scale, 'output_zero_point': zero_point}) + # Finds qY using in-place or non-in-place quantized operators. - qY = q_op(qX, **inplace_kwarg) + qY = q_op(qX, **extra_kwargs) self.assertEqual(qY, qY_hat, msg='{} - {} failed: ({} vs. 
{})'.format( fn_name, q_op, qY, qY_hat @@ -223,7 +225,7 @@ def test_qrelu(self, X): torch.nn.quantized.functional.relu, ], 'reference_fn': torch.nn.functional.relu, - 'inplace_kwarg': { + 'extra_kwargs': { 'inplace': True } } @@ -281,11 +283,30 @@ def test_qhardsigmoid(self, X): ] self._test_activation_function(X, 'hardsigmoid', hardsigmoid_test_configs) + @override_qengines + @given(X=hu.tensor(shapes=hu.array_shapes(1, 5, 1, 5), + qparams=hu.qparams())) + def test_leaky_relu_observed_output(self, X): + leaky_relu_test_configs = [ + { + 'quantized_fn': [ + torch.ops.quantized.leaky_relu + ], + 'reference_fn': torch.nn.functional.leaky_relu, + 'extra_kwargs': { + 'negative_slope': 0.1, + 'inplace': False, + }, + 'output_is_observed': True, + } + ] + self._test_activation_function(X, 'leaky_relu', leaky_relu_test_configs) + """Tests the correctness of the quantized::relu op.""" @given(X=hu.tensor(shapes=hu.array_shapes(1, 5, 1, 5), qparams=hu.qparams()), alpha=st.floats(0.0, 1.0, allow_nan=False, allow_infinity=False)) - def test_qrelu_leaky(self, X, alpha): + def test_leaky_relu(self, X, alpha): X, (scale, zero_point, torch_type) = X X = torch.from_numpy(X) @@ -907,7 +928,56 @@ def test_channel_shuffle(self, X, groups): self.assertEqual(a_ref, a_hat.dequantize(), msg="torch.nn.functional.channel_shuffle results are off") - """Tests max pool operation on quantized tensors.""" + """Tests 1D max pool operation on quantized tensors.""" + @given(X=hu.tensor(shapes=hu.array_shapes(min_dims=2, max_dims=3, + min_side=1, max_side=10), + qparams=hu.qparams()), + kernel=st.sampled_from((3, 5, 7)), + stride=st.sampled_from((None, 1, 2)), + dilation=st.integers(1, 2), + padding=st.integers(0, 2), + ceil_mode=st.booleans()) + def test_max_pool1d(self, X, kernel, stride, dilation, padding, ceil_mode): + X, (scale, zero_point, torch_type) = X + # Check constraints + assume(kernel // 2 >= padding) # Kernel cannot be overhanging! + iW = X.shape[-1] + oW = pool_output_shape(iW, kernel, padding, stride, dilation, ceil_mode) + assume(oW > 0) + + a = torch.from_numpy(X) + a_pool = torch.nn.functional.max_pool1d(a, kernel_size=kernel, + stride=stride, + padding=padding, + dilation=dilation, + ceil_mode=ceil_mode) + a_ref = torch.quantize_per_tensor(a_pool, scale=scale, + zero_point=zero_point, dtype=torch_type) + a_ref = a_ref.dequantize() + qa = torch.quantize_per_tensor(a, scale=scale, zero_point=zero_point, + dtype=torch_type) + + ops_under_test = { + "torch": torch.max_pool1d, + "nn.functional": torch.nn.functional.max_pool1d, + "nn.quantized.functional": torch.nn.quantized.functional.max_pool1d + } + + for name, op in ops_under_test.items(): + a_hat = op(qa, kernel_size=kernel, stride=stride, padding=padding, + dilation=dilation, ceil_mode=ceil_mode) + self.assertEqual(a_ref, a_hat.dequantize(), + msg="{} results are off".format(name)) + # Test the ops.quantized separately, because None is not treated. 
+ a_hat = torch.ops.quantized.max_pool1d( + qa, kernel_size=_single(kernel), + stride=_single(kernel if stride is None else stride), + padding=_single(padding), dilation=_single(dilation), + ceil_mode=ceil_mode) + self.assertEqual(a_ref, a_hat.dequantize(), + msg="ops.quantized.max_pool1d results are off") + + """Tests 2D max pool operation on quantized tensors.""" @given(X=hu.tensor(shapes=hu.array_shapes(min_dims=3, max_dims=4, min_side=1, max_side=10), qparams=hu.qparams()), @@ -1678,12 +1748,14 @@ def test_cat_nhwc(self, X, relu): torch.testing.assert_allclose(out.dequantize(), ref.dequantize()) self.assertNotEqual(out.stride(), sorted(out.stride())) - @given(X=hu.tensor(shapes=hu.array_shapes(min_dims=3, max_dims=3, - min_side=1, max_side=2), + @given(X=hu.tensor(shapes=hu.array_shapes(min_dims=1, max_dims=5, + min_side=1, max_side=4), qparams=hu.qparams()), - dim=st.integers(1, 2)) + dim=st.integers(-1, 5)) + @override_qengines def test_mean(self, X, dim): X, (scale, zero_point, torch_type) = X + assume(dim < X.ndim) qX = torch.quantize_per_tensor(torch.tensor(X).float(), scale, zero_point, torch_type) Y = torch.mean(qX.dequantize(), dim) @@ -2718,11 +2790,14 @@ def test_qlinear_unpack(self, W, use_channelwise): @unittest.skipIf(sys.platform == "darwin", "Known test failure on Mac.") class TestQuantizedEmbeddingOps(TestCase): - def _test_embedding_bag_unpack_fn(self, pack_fn, unpack_fn, num_embeddings, embedding_dim, bit_rate): + def _test_embedding_bag_unpack_fn(self, pack_fn, unpack_fn, num_embeddings, embedding_dim, bit_rate, optimized_qparams): weights = torch.from_numpy((np.random.random_sample(( num_embeddings, embedding_dim)) + 1).astype(np.float32)) - w_packed = pack_fn(weights) + if bit_rate == 8: + w_packed = pack_fn(weights) + else: + w_packed = pack_fn(weights, optimized_qparams=optimized_qparams) w_unpacked = unpack_fn(w_packed) if bit_rate == 8: @@ -2753,13 +2828,13 @@ def _test_embedding_bag_unpack_fn(self, pack_fn, unpack_fn, num_embeddings, embe conversion_op = "FloatToFused2BitRowwiseQuantized" reverse_conversion_op = "Fused2BitRowwiseQuantizedToFloat" - def get_c2_weights(weights): + def get_c2_weights(weights, engine_str): workspace.ResetWorkspace() workspace.FeedBlob("weights", weights) workspace.RunOperatorOnce( core.CreateOperator( - conversion_op, ["weights"], ["quantized_weights"] + conversion_op, ["weights"], ["quantized_weights"], engine=engine_str ) ) emb_q = workspace.FetchBlob("quantized_weights") @@ -2776,12 +2851,16 @@ def get_c2_weights(weights): ) return torch.from_numpy(emb_q), dequantized_data - w_packed_c2, w_unpacked_c2 = get_c2_weights(weights) + if optimized_qparams: + engine = "GREEDY" + else: + engine = "" + w_packed_c2, w_unpacked_c2 = get_c2_weights(weights, engine) # Compare packed weights against C2. 
- np.testing.assert_equal(w_packed.numpy(), w_packed_c2.numpy()) + np.testing.assert_allclose(w_packed.numpy(), w_packed_c2.numpy(), atol=1e-6, rtol=1e-6) # Compare unpacked weights against C2 - np.testing.assert_equal(w_unpacked.numpy(), w_unpacked_c2.numpy()) + np.testing.assert_allclose(w_unpacked.numpy(), w_unpacked_c2.numpy(), atol=1e-6, rtol=1e-6) """ Tests the correctness of the embedding_bag_8bit pack/unpack op against C2 """ @given(num_embeddings=st.integers(10, 100), @@ -2790,25 +2869,27 @@ def test_embedding_bag_byte_unpack(self, num_embeddings, embedding_dim): pack_fn = torch.ops.quantized.embedding_bag_byte_prepack unpack_fn = torch.ops.quantized.embedding_bag_byte_unpack - self._test_embedding_bag_unpack_fn(pack_fn, unpack_fn, num_embeddings, embedding_dim, bit_rate=8) + self._test_embedding_bag_unpack_fn(pack_fn, unpack_fn, num_embeddings, embedding_dim, 8, False) """ Tests the correctness of the embedding_bag_4bit pack/unpack op against C2 """ @given(num_embeddings=st.integers(10, 100), - embedding_dim=st.integers(5, 50).filter(lambda x: x % 4 == 0),) - def test_embedding_bag_4bit_unpack(self, num_embeddings, embedding_dim): + embedding_dim=st.integers(5, 50).filter(lambda x: x % 4 == 0), + optimized_qparams=st.booleans(),) + def test_embedding_bag_4bit_unpack(self, num_embeddings, embedding_dim, optimized_qparams): pack_fn = torch.ops.quantized.embedding_bag_4bit_prepack unpack_fn = torch.ops.quantized.embedding_bag_4bit_unpack - self._test_embedding_bag_unpack_fn(pack_fn, unpack_fn, num_embeddings, embedding_dim, bit_rate=4) + self._test_embedding_bag_unpack_fn(pack_fn, unpack_fn, num_embeddings, embedding_dim, 4, optimized_qparams) """ Tests the correctness of the embedding_bag_2bit pack/unpack op against C2 """ @given(num_embeddings=st.integers(10, 100), - embedding_dim=st.integers(5, 50).filter(lambda x: x % 8 == 0),) - def test_embedding_bag_2bit_unpack(self, num_embeddings, embedding_dim): + embedding_dim=st.integers(5, 50).filter(lambda x: x % 8 == 0), + optimized_qparams=st.booleans(),) + def test_embedding_bag_2bit_unpack(self, num_embeddings, embedding_dim, optimized_qparams): pack_fn = torch.ops.quantized.embedding_bag_2bit_prepack unpack_fn = torch.ops.quantized.embedding_bag_2bit_unpack - self._test_embedding_bag_unpack_fn(pack_fn, unpack_fn, num_embeddings, embedding_dim, bit_rate=2) + self._test_embedding_bag_unpack_fn(pack_fn, unpack_fn, num_embeddings, embedding_dim, 2, optimized_qparams) def embedding_bag_rowwise_offsets_run( self, bit_rate, num_embeddings, diff --git a/test/quantization/test_quantized_tensor.py b/test/quantization/test_quantized_tensor.py index fc3aa3c655eb..e919deb9d2bd 100644 --- a/test/quantization/test_quantized_tensor.py +++ b/test/quantization/test_quantized_tensor.py @@ -67,6 +67,75 @@ def _calculate_dynamic_qparams(X, dtype, reduce_range=False): def get_supported_device_types(): return ['cpu', 'cuda'] if torch.cuda.is_available() and not TEST_WITH_ROCM else ['cpu'] +# Note we explicitly cast variables to np.float32 in a couple of places to avoid +# the default casting in Python often resuling in double precision and to make +# sure we're doing the same numerics as C++ code. 
+def param_search_greedy(x, bit_rate, n_bins=200, ratio=0.16): + xmin, xmax = np.min(x), np.max(x) + stepsize = (xmax - xmin) / np.float32(n_bins) + min_bins = np.float32(n_bins) * (np.float32(1) - np.float32(ratio)) + xq, loss = _compress_uniform_simplified(x, bit_rate, xmin, xmax) + + solutions = [] # [(left, right, loss)] # local optima solution + + cur_min, cur_max, cur_loss = xmin, xmax, loss + thr = min_bins * stepsize + while cur_min + thr < cur_max: + # move left + xq, loss1 = _compress_uniform_simplified( + x, bit_rate, cur_min + stepsize, cur_max + ) + # move right + xq, loss2 = _compress_uniform_simplified( + x, bit_rate, cur_min, cur_max - stepsize + ) + + if cur_loss < loss1 and cur_loss < loss2: + # found a local optima + solutions.append((cur_min, cur_max, cur_loss)) + if loss1 < loss2: + cur_min, cur_max, cur_loss = cur_min + stepsize, cur_max, loss1 + else: + cur_min, cur_max, cur_loss = cur_min, cur_max - stepsize, loss2 + if len(solutions): + best = solutions[0] + for solution in solutions: + if solution[-1] < best[-1]: + best = solution + return best[1], best[0] # xmax, xmin + return xmax, xmin + + +def _compress_uniform_simplified(X, bit_rate, xmin, xmax, fp16_scale_bias=True): + # affine transform to put Xq in [0,2**bit_rate - 1] + # Xq = (2 ** bit_rate - 1) * (Xq - xmin) / data_range + if fp16_scale_bias: + xmin = xmin.astype(np.float16).astype(np.float32) + data_range = xmax - xmin + scale = np.where( + data_range == 0, np.float32(1), data_range / np.float32(2 ** bit_rate - 1) + ) + if fp16_scale_bias: + scale = scale.astype(np.float16).astype(np.float32) + inverse_scale = np.float32(1) / scale + Xq = np.clip(np.round((X - xmin) * inverse_scale), 0, np.float32(2 ** bit_rate - 1)) + Xq = Xq * scale + xmin + + # Manually compute loss instead of using np.linalg.norm to use the same + # accumulation order used by C++ code + vlen = 8 + loss_v = np.zeros(vlen).astype(np.float32) + for i in range(len(Xq) // vlen * vlen): + loss_v[i % vlen] += (X[i] - Xq[i]) * (X[i] - Xq[i]) + loss = np.float32(0) + for i in range(vlen): + loss += loss_v[i] + for i in range(len(Xq) // vlen * vlen, len(Xq)): + loss += (X[i] - Xq[i]) * (X[i] - Xq[i]) + loss = np.sqrt(loss) + + return Xq, loss + class TestQuantizedTensor(TestCase): def test_qtensor(self): num_elements = 10 @@ -103,6 +172,36 @@ def test_qtensor(self): "quantization_scheme=torch.per_tensor_affine, " + "scale=1.0, zero_point=2)") + def test_qtensor_sub_byte(self): + num_elements = 10 + scale = 1.0 + zero_point = 2 + for dtype in [torch.quint4x2]: + r = torch.ones((5, 2), dtype=torch.float) + qr = torch.quantize_per_tensor(r, scale, zero_point, dtype) + self.assertEqual(qr.q_scale(), scale) + self.assertEqual(qr.q_zero_point(), zero_point) + self.assertTrue(qr.is_quantized) + self.assertFalse(r.is_quantized) + self.assertEqual(qr.storage().size(), 5) + + int_repr = qr.int_repr() + for num in int_repr[0:5]: + self.assertEqual(num, 51) # Packed entries, each of value 3, i.e. 
00110011 + + # Test tensor creation + q = torch._empty_affine_quantized([num_elements], scale=scale, zero_point=zero_point, + dtype=torch.quint4x2) + self.assertEqual(q.storage().size(), 5) + + # Test save/load + with tempfile.NamedTemporaryFile() as f: + torch.save(qr, f) + f.seek(0) + loaded_q = torch.load(f) + loaded_int_repr = loaded_q.int_repr()[0:5] + self.assertEqual(int_repr[0:5], loaded_int_repr) + def test_qtensor_float_assignment(self): # Scalar Tensor # item @@ -216,15 +315,10 @@ def test_qtensor_dtypes(self): r = torch.rand(3, 2, dtype=torch.float) * 4 - 2 scale = 0.2 zero_point = 2 - qr = torch.quantize_per_tensor(r, scale, zero_point, torch.qint8) - rqr = qr.dequantize() - self.assertTrue(np.allclose(r.numpy(), rqr.numpy(), atol=2 / scale)) - qr = torch.quantize_per_tensor(r, scale, zero_point, torch.quint8) - rqr = qr.dequantize() - self.assertTrue(np.allclose(r.numpy(), rqr.numpy(), atol=2 / scale)) - qr = torch.quantize_per_tensor(r, scale, zero_point, torch.qint32) - rqr = qr.dequantize() - self.assertTrue(np.allclose(r.numpy(), rqr.numpy(), atol=2 / scale)) + for dtype in [torch.qint8, torch.quint8, torch.qint32, torch.quint4x2]: + qr = torch.quantize_per_tensor(r, scale, zero_point, dtype) + rqr = qr.dequantize() + self.assertTrue(np.allclose(r.numpy(), rqr.numpy(), atol=2 / scale)) def _test_quantize_per_channel(self, r, scales, zero_points, axis, float_params): @@ -335,6 +429,52 @@ def quantize_ref(data, scales, zero_points): zero_points = torch.tensor([0.1, 0.2, 1.], dtype=torch.float) self._test_quantize_per_channel(r, scales, zero_points, 0, True) + def test_quantize_per_channel_sub_byte(self): + """ Tests the per channel quantization scheme for 4-bit qtensors. + The scale and zero point for this have to be in floating point. """ + r = torch.rand(3, 2, dtype=torch.float) * 4 + scales = torch.tensor([0.2, 0.3, 0.1], dtype=torch.float) + zero_points = torch.tensor([0.1, 0.2, 0.3], dtype=torch.float) + qr = torch.quantize_per_channel(r, scales, zero_points, 0, torch.quint4x2) + dequant_tensor = qr.dequantize() + + def _get_qranges(bit_width): + if bit_width == 4: + return 0, 15 + + def _quantize_per_channel_sub_byte_ref(data, scales, zero_points, axis, bit_width): + dims = data.size() + data = data.view(-1, dims[axis], np.prod(dims[axis + 1:])) + qtensor_size = math.ceil(data.numel() / 2) + res = torch.empty(qtensor_size, dtype=torch.uint8) + elem_per_byte = 8 / bit_width + quant_min, quant_max = _get_qranges(bit_width) + for i in range(data.size()[0]): + for j in range(data.size()[1]): + for k in range(data.size()[2]): + inv_scale = 1.0 / scales[j] + index = i * data.size()[1] * data.size()[2] + j * data.size()[2] + k + qvalue = np.clip( + np.round(data[i][j][k] * inv_scale + zero_points[j]), quant_min, quant_max).to(dtype=torch.int) + res_idx = int(index / elem_per_byte) + if (index % elem_per_byte == 0): + res[res_idx] = qvalue + else: + res[res_idx] |= (qvalue << ((index % elem_per_byte) * bit_width)) + return res + + ref_res = _quantize_per_channel_sub_byte_ref(r, scales, zero_points, 0, 4) + self.assertTrue(np.allclose(qr.int_repr(), ref_res)) + self.assertTrue(np.allclose(r.numpy(), dequant_tensor.numpy(), atol=1 / np.min(scales.numpy()))) + + # Check 4D tensor with non-zero axis. 
+ r = torch.rand(3, 2, 4, 5, dtype=torch.float) * 4 + scales = torch.tensor([0.2, 0.03], dtype=torch.float) + zero_points = torch.tensor([0.1, 0.2], dtype=torch.float) + qr = torch.quantize_per_channel(r, scales, zero_points, axis=1, dtype=torch.quint4x2) + ref_res = _quantize_per_channel_sub_byte_ref(r, scales, zero_points, 1, 4) + self.assertTrue(np.allclose(qr.int_repr(), ref_res)) + def test_qtensor_permute(self): scale = 0.02 zero_point = 1 @@ -422,7 +562,9 @@ def test_qtensor_per_channel_load_save(self): scales = torch.rand(10, dtype=torch.double) * 0.02 + 0.01 zero_points = torch.round(torch.rand(10) * 20 + 1).to(torch.long) # quint32, cuda is not supported yet - for dtype in [torch.quint8, torch.qint8]: + for dtype in [torch.quint8, torch.qint8, torch.quint4x2]: + if dtype == torch.quint4x2: + zero_points = torch.ones(10, dtype=torch.float) qr = torch.quantize_per_channel(r, scales, zero_points, 1, dtype) with tempfile.NamedTemporaryFile() as f: # Serializing and Deserializing Tensor @@ -745,3 +887,11 @@ def test_fp16_saturate_op(self): ref[0] = torch.ones(5) * -65504 y = torch._saturate_weight_to_fp16(x) self.assertEqual(y, ref) + + def test_choose_qparams_optimized(self): + for bit_width in [4, 2]: + x = torch.randn(64, dtype=torch.float) + y = torch.choose_qparams_optimized(x, numel=64, n_bins=200, ratio=0.16, bit_width=bit_width) + ref = param_search_greedy(x.numpy(), bit_rate=bit_width) + self.assertEqual(y[0].numpy(), ref[0]) + self.assertEqual(y[1].numpy(), ref[1]) diff --git a/test/quantization/test_workflow_module.py b/test/quantization/test_workflow_module.py index 817e54460e07..6d1dd2b1b698 100644 --- a/test/quantization/test_workflow_module.py +++ b/test/quantization/test_workflow_module.py @@ -5,7 +5,6 @@ PerChannelMinMaxObserver, MovingAverageMinMaxObserver, MovingAveragePerChannelMinMaxObserver, - MinMaxDynamicQuantObserver, HistogramObserver, RecordingObserver, PlaceholderObserver, @@ -16,6 +15,7 @@ default_per_channel_weight_observer, get_observer_dict, prepare, + QConfig, ) from torch.quantization._learnable_fake_quantize import ( @@ -44,6 +44,7 @@ QuantizationTestCase, AnnotatedSingleLayerLinearModel, test_only_eval_fn, + SingleLayerLinearModel, ) from torch.testing._internal.common_quantized import ( @@ -265,25 +266,6 @@ def test_per_tensor_observers(self, qdtype, qscheme, reduce_range): self.assertEqual(myobs.calculate_qparams(), loaded_obs.calculate_qparams()) - @given(X=hu.tensor(shapes=hu.array_shapes(min_dims=2, max_dims=4, - min_side=1, max_side=10), - qparams=hu.qparams()), - reduce_range=st.booleans()) - def test_per_tensor_dynamic_quant_observers(self, X, reduce_range): - - X, (scale, zero_point, torch_type) = X - x = torch.from_numpy(X) - - obs = MinMaxDynamicQuantObserver(dtype=torch.quint8, reduce_range=reduce_range) - - result = obs(x) - qparams = obs.calculate_qparams() - ref = torch._choose_qparams_per_tensor(x, reduce_range) - - self.assertEqual(ref[0], qparams[0]) - self.assertEqual(ref[1], qparams[1]) - - @given(qdtype=st.sampled_from((torch.qint8, torch.quint8)), qscheme=st.sampled_from((torch.per_channel_affine, torch.per_channel_symmetric, torch.per_channel_affine_float_qparams)), ch_axis=st.sampled_from((0, 1, 2, 3)), reduce_range=st.booleans()) @@ -394,7 +376,7 @@ def test_per_channel_observers(self, qdtype, qscheme, ch_axis, reduce_range): def test_observer_scriptable(self): - obs_list = [MinMaxObserver(), MovingAverageMinMaxObserver(), MinMaxDynamicQuantObserver()] + obs_list = [MinMaxObserver(), MovingAverageMinMaxObserver()] for obs in 
obs_list: scripted = torch.jit.script(obs) @@ -423,7 +405,7 @@ def test_state_dict_respects_device_affinity(self): [device_cpu, device_cuda], [device_cpu, device_cuda], [MinMaxObserver, MovingAverageMinMaxObserver, - MinMaxDynamicQuantObserver, PerChannelMinMaxObserver, + PerChannelMinMaxObserver, MovingAveragePerChannelMinMaxObserver, # TODO: enable this (separate PR) # HistogramObserver, @@ -473,6 +455,32 @@ def test_histogram_observer_save_load_state_dict(self): self.assertEqual(obs2.max_val.shape, torch.Size([])) + def test_save_load_state_dict_script(self): + """ + Tests that we can save and load state_dict for observers that are scripted + in a quantized model. + """ + obs_list = [MinMaxObserver, MovingAverageMinMaxObserver, + PerChannelMinMaxObserver, + MovingAveragePerChannelMinMaxObserver, HistogramObserver] + + for obs in obs_list: + model = SingleLayerLinearModel().eval() + qconfig = QConfig(activation=default_observer, weight=obs) + qconfig_dict = {'' : qconfig} + scripted = torch.jit.script(model) + scripted = torch.quantization.prepare_jit(scripted, qconfig_dict) + x = torch.rand(5, 5) + scripted(x) + obs_dict = torch.quantization.get_observer_state_dict(scripted) + + # Load stats + scripted_2 = torch.jit.script(model) + scripted_2 = torch.quantization.prepare_jit(scripted_2, qconfig_dict) + torch.quantization.load_observer_state_dict(scripted_2, obs_dict) + # Verify that state_dict matches exactly with original one. + self.assertEqual(scripted.state_dict(), scripted_2.state_dict()) + # HistogramObserver that works like it does on master class _ReferenceHistogramObserver(HistogramObserver): def __init__(self, *args, **kwargs): @@ -1417,7 +1425,6 @@ def test_observers_preserve_buffers(self): observer_types = [ torch.quantization.MinMaxObserver.with_args(dtype=torch.qint8), torch.quantization.MovingAverageMinMaxObserver.with_args(dtype=torch.qint8), - torch.quantization.MinMaxDynamicQuantObserver.with_args(dtype=torch.qint8), torch.quantization.PerChannelMinMaxObserver.with_args(dtype=torch.qint8), torch.quantization.MovingAveragePerChannelMinMaxObserver.with_args(dtype=torch.qint8), torch.quantization.HistogramObserver.with_args(dtype=torch.qint8), @@ -1536,6 +1543,21 @@ def forward(self, x): isinstance(fused_model.conv.bn, nn.SyncBatchNorm), "Expected BN to be converted to SyncBN") + def test_syncbn_preserves_qconfig(self): + """ + Makes sure that if a BatchNorm is not fused and a qconfig exists, + convering the module to SyncBatchNorm preserves the qconfig. 
+ """ + m = nn.Sequential( + nn.Conv2d(1, 1, 1), + nn.BatchNorm2d(1), + ) + m[1].qconfig = torch.quantization.default_qconfig + m = torch.nn.SyncBatchNorm.convert_sync_batchnorm(m) + self.assertTrue( + hasattr(m[1], "qconfig"), + "missing qconfig after SyncBatchNorm conversion") + @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") @override_qengines diff --git a/test/run_test.py b/test/run_test.py index 606e20a6f723..2af7405e300b 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -13,7 +13,7 @@ import torch import torch._six from torch.utils import cpp_extension -from torch.testing._internal.common_utils import TEST_WITH_ROCM, shell +from torch.testing._internal.common_utils import TEST_WITH_ROCM, shell, FILE_SCHEMA import torch.distributed as dist from typing import Dict, Optional @@ -41,6 +41,7 @@ 'test_foreach', 'test_indexing', 'test_jit', + 'test_linalg', 'test_logging', 'test_mkldnn', 'test_multiprocessing', @@ -89,7 +90,8 @@ 'test_determination', 'test_futures', 'test_fx', - 'test_functional_autograd_benchmark' + 'test_functional_autograd_benchmark', + 'test_package', ] WINDOWS_BLOCKLIST = [ @@ -98,7 +100,6 @@ 'distributed/rpc/test_process_group_agent', 'distributed/rpc/test_tensorpipe_agent', 'distributed/test_distributed_fork', - 'distributed/test_distributed_spawn', ] ROCM_BLOCKLIST = [ @@ -109,7 +110,6 @@ 'test_determination', 'test_multiprocessing', 'test_jit_legacy', - 'test_tensorexpr', 'test_type_hints', 'test_openmp', ] @@ -200,6 +200,15 @@ PYTORCH_COLLECT_COVERAGE = bool(os.environ.get("PYTORCH_COLLECT_COVERAGE")) +JIT_EXECUTOR_TESTS = [ + 'test_jit_cuda_fuser_profiling', + 'test_jit_cuda_fuser_legacy', + 'test_jit_profiling', + 'test_jit_legacy', + 'test_jit_fuser_legacy', + 'test_jit_fuser_te', + 'test_tensorexpr'] + def print_to_stderr(message): print(message, file=sys.stderr) @@ -305,9 +314,13 @@ def test_distributed(test_module, test_directory, options): 'MPI not available -- MPI backend tests will be skipped') config = DISTRIBUTED_TESTS_CONFIG for backend, env_vars in config.items(): + if sys.platform == 'win32' and backend != 'gloo': + continue if backend == 'mpi' and not mpi_available: continue for with_init_file in {True, False}: + if sys.platform == 'win32' and not with_init_file: + continue tmp_dir = tempfile.mkdtemp() if options.verbose: init_str = "with {} init_method" @@ -321,9 +334,9 @@ def test_distributed(test_module, test_directory, options): os.environ.update(env_vars) if with_init_file: if test_module in ["test_distributed_fork", "test_distributed_spawn"]: - init_method = 'file://{}/'.format(tmp_dir) + init_method = f'{FILE_SCHEMA}{tmp_dir}/' else: - init_method = 'file://{}/shared_init_file'.format(tmp_dir) + init_method = f'{FILE_SCHEMA}{tmp_dir}/shared_init_file' os.environ['INIT_METHOD'] = init_method try: os.mkdir(os.path.join(tmp_dir, 'barrier')) @@ -446,6 +459,19 @@ def parse_args(): nargs='*', help='additional arguments passed through to unittest, e.g., ' 'python run_test.py -i sparse -- TestSparse.test_factory_size_check') + parser.add_argument( + '--shard', + nargs=2, + type=int, + help='runs a shard of the tests (taking into account other selections), e.g., ' + '--shard 2 3 will break up the selected tests into 3 shards and run the tests ' + 'in the 2nd shard (the first number should not exceed the second)', + ) + parser.add_argument( + '--exclude-jit-executor', + action='store_true', + help='exclude tests that are run for a specific jit config' + ) return 
parser.parse_args() @@ -513,6 +539,17 @@ def get_selected_tests(options): last_index = find_test_index(options.last, selected_tests, find_last_index=True) selected_tests = selected_tests[:last_index + 1] + if options.shard: + assert len(options.shard) == 2, "Unexpected shard format" + assert min(options.shard) > 0, "Shards must be positive numbers" + which_shard, num_shards = options.shard + assert which_shard <= num_shards, "Selected shard must be less or equal that total number of shards" + assert num_shards <= len(selected_tests), f"Number of shards must be less than {len(selected_tests)}" + selected_tests = selected_tests[which_shard - 1 :: num_shards] + + if options.exclude_jit_executor: + options.exclude.extend(JIT_EXECUTOR_TESTS) + selected_tests = exclude_tests(options.exclude, selected_tests) if sys.platform == 'win32' and not options.ignore_win_blocklist: diff --git a/test/test_autograd.py b/test/test_autograd.py index 9d037fd7c138..6bd6925e015f 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -31,7 +31,7 @@ from torch.utils.checkpoint import checkpoint from torch.testing._internal.common_utils import (TEST_MKL, TEST_WITH_ROCM, TestCase, run_tests, skipIfNoLapack, suppress_warnings, slowTest, - load_tests, random_symmetric_pd_matrix, random_symmetric_matrix, + load_tests, random_symmetric_matrix, IS_WINDOWS, IS_MACOS, CudaMemoryLeakCheck) from torch.autograd import Variable, Function, detect_anomaly from torch.autograd.function import InplaceFunction @@ -1001,6 +1001,53 @@ def gen_enable_grad(): for _ in gen_enable_grad(): self.assertEqual(torch.is_grad_enabled(), False) + def test_set_grad_generator_functions_recursive(self): + # enable_grad_decorator_recursive and no_grad_decorator_recursive call each other + # recursively, to ensure that the decorators preserve the caller's setting + @torch.enable_grad() + def enable_grad_decorator_recursive(depth): + self.assertTrue(torch.is_grad_enabled()) + if depth > 0: + no_grad_decorator_recursive(depth - 1) + self.assertTrue(torch.is_grad_enabled()) + + @torch.no_grad() + def no_grad_decorator_recursive(depth): + self.assertFalse(torch.is_grad_enabled()) + if depth > 0: + enable_grad_decorator_recursive(depth - 1) + self.assertFalse(torch.is_grad_enabled()) + + # enable_grad_context_manager_recursive and no_grad_context_manager_recursive call + # each other recursively, to ensure that the decorators preserve the caller's setting + def enable_grad_context_manager_recursive(depth): + with torch.enable_grad(): + self.assertTrue(torch.is_grad_enabled()) + if depth > 0: + no_grad_context_manager_recursive(depth - 1) + self.assertTrue(torch.is_grad_enabled()) + + def no_grad_context_manager_recursive(depth): + with torch.no_grad(): + self.assertFalse(torch.is_grad_enabled()) + if depth > 0: + enable_grad_context_manager_recursive(depth - 1) + self.assertFalse(torch.is_grad_enabled()) + + with torch.enable_grad(): + self.assertTrue(torch.is_grad_enabled()) + enable_grad_decorator_recursive(10) + self.assertTrue(torch.is_grad_enabled()) + enable_grad_context_manager_recursive(10) + self.assertTrue(torch.is_grad_enabled()) + + with torch.no_grad(): + self.assertFalse(torch.is_grad_enabled()) + enable_grad_decorator_recursive(10) + self.assertFalse(torch.is_grad_enabled()) + enable_grad_context_manager_recursive(10) + self.assertFalse(torch.is_grad_enabled()) + def test_no_grad_python_function(self): """Python Functions should respect grad mode.""" x = torch.ones(5, 5, requires_grad=True) @@ -2454,22 +2501,28 @@ def 
test_var_mean_differentiable(self): @skipIfNoLapack def test_cholesky(self): def func(root, upper): - x = torch.matmul(root, root.transpose(-1, -2)) + 1e-05 + x = 0.5 * (root + root.transpose(-1, -2).conj()) return torch.cholesky(x, upper) - def run_test(upper, dims): - root = torch.rand(*dims, requires_grad=True) + def run_test(upper, dims, dtype): + root = torch.rand(*dims, dtype=dtype, requires_grad=True) + root = root + torch.eye(dims[-1]) gradcheck(func, [root, upper]) - gradgradcheck(func, [root, upper]) + # TODO: gradgradcheck does not work correctly yet for complex + if not dtype.is_complex: + gradgradcheck(func, [root, upper]) - root = random_symmetric_pd_matrix(dims[-1], *dims[:-2]).requires_grad_() + root = torch.rand(*dims, dtype=dtype) + root = torch.matmul(root, root.transpose(-1, -2).conj()) + root.requires_grad_() chol = root.cholesky().sum().backward() - self.assertEqual(root.grad, root.grad.transpose(-1, -2)) # Check the gradient is symmetric + self.assertEqual(root.grad, root.grad.transpose(-1, -2).conj()) # Check the gradient is hermitian - for upper, dims in product([True, False], [(3, 3), (4, 3, 2, 2)]): - run_test(upper, dims) - run_test(upper, dims) + for upper, dims, dtype in product([True, False], + [(3, 3), (4, 3, 2, 2)], + [torch.double, torch.cdouble]): + run_test(upper, dims, dtype) @skipIfNoLapack def test_cholesky_solve(self): @@ -2545,6 +2598,67 @@ def run_test(upper, dims): for upper, dims in product([True, False], [(3, 3), (5, 3, 3), (4, 3, 2, 2)]): run_test(upper, dims) + @slowTest + @skipIfNoLapack + def test_lobpcg(self): + + def func(k, A, largest=True, B=None): + X_shape = list(A.shape) + X_shape[-1] = k + X = torch.eye(A.size(-2), k, dtype=A.dtype, device=A.device) + if A.dim() > 2: + X = X.expand(X_shape) + + D, U = torch.lobpcg(A=A, k=k, B=B, X=X) + + # LOBPCG uses a random initial eigenspace approximation + # if parameter `X` is not provided. + # This may cause a non-deterministic behavior + # when it comes to the sign of an eigenvector + # (note if v is an eigenvector, so is -v), + # hence we eliminate this non-determinism + # by making sure that each column of U + # gets multiplied by the sign of its max (in absolute value) element. + # Also, gradcheck changes the content of the input by +/- eps (default to 1e-06) + # to compute the numerical gradient which can also cause the signs to flip. + _, idx = U.abs().max(-2, keepdim=True) + sign = U.gather(-2, idx).sign() + U = U * sign + return D, U + + def run_symeig_test(k, sizes, largest=True): + A = torch.rand(*sizes).double() + A = A.matmul(A.transpose(-1, -2)) / 10 + A.requires_grad_(True) + + gradcheck(lambda A: func(k, A, largest), A) + + # Custom gradient vectors for better stability due to some + # non-determinism in the lobpcg's forward. + # Note it is not required if symeig is in forward instead (tested). + D_grad = torch.rand(*A.shape[:-2], k) / 100 + U_grad = torch.rand(*A.shape[:-1], k) / 100 + gradgradcheck(lambda A: func(k, A, largest), A, [D_grad, U_grad], atol=1e-4) + + # check whether A.grad is symmetric + A = A.detach().requires_grad_(True) + D, U = func(k, A, largest) + (D.sum() + U.sum()).backward() + self.assertEqual(A.grad, A.grad.transpose(-1, -2)) + + # the tests below take about 1-2 minutes to finish, + # but we want to be extra sure that the backward is correct. 
+ for largest in [True, False]: + run_symeig_test(1, (6, 6), largest=largest) + run_symeig_test(1, (2, 6, 6), largest=largest) + run_symeig_test(1, (2, 2, 6, 6), largest=largest) + run_symeig_test(2, (6, 6), largest=largest) + run_symeig_test(2, (2, 6, 6), largest=largest) + run_symeig_test(2, (2, 2, 6, 6), largest=largest) + run_symeig_test(3, (9, 9), largest=largest) + run_symeig_test(3, (2, 9, 9), largest=largest) + run_symeig_test(3, (2, 2, 9, 9), largest=largest) + @skipIfNoLapack def test_cholesky_inverse(self): def _test_with_size(upper, dims): @@ -4527,6 +4641,33 @@ def test(inp, inp_dtype, out_dtype): test(inp, torch.float, torch.double) test(inp, torch.double, torch.float) + def test_nan_to_num(self): + a = torch.randn(3, 3, 3, 3) + with torch.no_grad(): + a[torch.rand_like(a) < 0.2] = float('nan') + a[torch.rand_like(a) < 0.2] = float('inf') + a[torch.rand_like(a) < 0.2] = -float('inf') + + a.requires_grad = True + + gradcheck(lambda x: x.nan_to_num(), a) + gradgradcheck(lambda x: x.nan_to_num(), a) + + gradcheck(lambda x: x.nan_to_num(nan=1.2), a) + gradgradcheck(lambda x: x.nan_to_num(nan=1.2), a) + + gradcheck(lambda x: x.nan_to_num(nan=1.2, posinf=2.0), a) + gradgradcheck(lambda x: x.nan_to_num(nan=1.2, posinf=2.0), a) + + gradcheck(lambda x: x.nan_to_num(nan=1.2, posinf=2.0, neginf=-2.0), a) + gradgradcheck(lambda x: x.nan_to_num(nan=1.2, posinf=2.0, neginf=-2.0), a) + + gradcheck(lambda x: x.nan_to_num(posinf=2.0, neginf=-2.0), a) + gradgradcheck(lambda x: x.nan_to_num(posinf=2.0, neginf=-2.0), a) + + gradcheck(lambda x: x.nan_to_num(neginf=-2.0), a) + gradgradcheck(lambda x: x.nan_to_num(neginf=-2.0), a) + def test_custom_function_error(self): class BadFw(Function): @staticmethod @@ -4690,10 +4831,11 @@ def run_functional_checks(test_case, test_name, name, apply_fn, run_grad_checks, complex_list = ['t', 'view', 'reshape', 'reshape_as', 'view_as', 'roll', 'clone', 'repeat', 'expand', 'flip', 'fliplr', 'flipud', 'rot90', 'transpose', 'permute', 'squeeze', 'unsqueeze', 'resize', 'resize_as', 'tril', 'triu', - 'chunk', 'split', 'split_with_sizes', 'repeat', 'expand', 'zero_', 'round', + 'chunk', 'split', 'split_with_sizes', 'repeat', 'expand', 'zero_', 'eq_', 'ne_', 'add', '__radd__', 'sum', 'conj', 'sin', 'cos', 'mul', 'sinh', - 'cosh', '__rmul__'] + separate_complex_tests + 'cosh', '__rmul__', 'sgn', 'abs', 'dot', 'vdot'] + separate_complex_tests +# TODO(@anjali411): add tests for 'sub', 'div # TODO(@anjali411): add the commented tests back after updating the formula based on tensorflow definition - @anjali411 # complex_list += ['fill_', 't', '__rdiv__', 'tanh'] @@ -4817,7 +4959,9 @@ def fn(*inputs): 'broadcast_all' in test_name or 'atanh' in test_name or 'acosh' in test_name or - 'asinh' in test_name) + 'asinh' in test_name or + 'abs_complex' in test_name or + 'abs_scalar_complex' in test_name) if hasattr(torch.ones(1), inplace_name) and not skip_inplace: output_variable = getattr(self_variable, name)(*args_variable, **kwargs_variable) if not isinstance(output_variable, tuple): @@ -4864,7 +5008,10 @@ def fn(*inputs): inplace_name = name + '_' # can't broadcast inplace to left hand side broadcast_skip_inplace = 'broadcast_lhs' in test_name or 'broadcast_all' in test_name - if hasattr(torch.ones(1), inplace_name) and not broadcast_skip_inplace: + # skip C -> R inplace tests + skip_c_to_r_inplace = 'abs_complex' in test_name or 'abs_scalar_complex' in test_name + skip_inplace = broadcast_skip_inplace or skip_c_to_r_inplace + if hasattr(torch.ones(1), inplace_name) and not 
skip_inplace: check(inplace_name) assert not hasattr(TestAutograd, test_name), 'Two tests have the same name: ' + test_name @@ -5936,11 +6083,13 @@ class TestAutogradDeviceType(TestCase): def test_min_max_median_backprops_to_all_values(self, device): for f in [torch.min, torch.max, torch.median]: - x = torch.tensor([1., 0., 1., 0., 1., 0.], device=device, requires_grad=True) - y = f(x) - y.backward() - self.assertEqual(x.grad.sum(), 1.) - self.assertEqual((x.grad == 1 / 3).sum(), 3) + x1 = torch.tensor([1., 0., 1., 0., 1., 0.], device=device, requires_grad=True) + x2 = torch.tensor([float('nan'), float('nan'), float('nan')], requires_grad=True) + for x in [x1, x2]: + y = f(x) + y.backward() + self.assertEqual(x.grad.sum(), 1.) + self.assertEqual((x.grad == 1 / 3).sum(), 3) # skip this test if running on rocm, because in cdist # we use __shfl_down_sync on CUDA for fast reduction @@ -5990,6 +6139,18 @@ def _test_euclidean_large_cdist(sizex, sizey=None): _test_cdist_for_size((1, 1), (S, 1)) _test_euclidean_large_cdist((2000, 5)) + # Ensure that cdist backward with p<1 does not produce NaNs + def test_cdist_grad_p_lt_1_no_nan(self, device): + for p in [0.99, 0.7, 0.5, 0.1, 0.01]: + x = torch.randn(1, 2, device=device) + y = x.clone().detach() + torch.tensor([[1., 0.]], device=device) + x.requires_grad = True + y.requires_grad = True + result = torch.cdist(x, y, p=p) + result.backward(torch.ones_like(result)) + self.assertFalse(torch.isnan(x.grad).any()) + self.assertFalse(torch.isnan(y.grad).any()) + def test_cdist_same_inputs(self, device): # Test to detect issues in cdist gradient calculation # When the distances are 0 diff --git a/test/test_cuda.py b/test/test_cuda.py index 011e8c374645..498fd199066f 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -1,6 +1,7 @@ import collections import io import tempfile +from typing import NamedTuple import unittest import sys from itertools import repeat, chain, product @@ -14,6 +15,7 @@ import torch.cuda import torch.cuda.comm as comm from torch import multiprocessing as mp +from torch.nn.parallel import scatter_gather from torch._six import inf, nan, container_abcs from test_torch import AbstractTestCases @@ -21,7 +23,7 @@ from torch.testing._internal.common_methods_invocations import tri_tests_args, tri_large_tests_args, \ _compare_trilu_indices, _compare_large_trilu_indices from torch.testing._internal.common_utils import TestCase, get_gpu_type, freeze_rng_state, run_tests, \ - NO_MULTIPROCESSING_SPAWN, skipIfRocm, load_tests, \ + NO_MULTIPROCESSING_SPAWN, skipIfRocm, load_tests, IS_SANDCASTLE, \ slowTest, skipCUDANonDefaultStreamIf, TEST_WITH_ROCM, TEST_NUMPY from torch.testing._internal.autocast_test_lists import AutocastTestLists @@ -279,6 +281,18 @@ def assert_change(comp=1, empty_cache=False, reset_peak=False): assert_change(0, empty_cache=True) assert_change(0, reset_peak=True) + @skipIfRocm + def test_cudart_register(self): + t = torch.ones(20) + self.assertFalse(t.is_pinned()) + cudart = torch.cuda.cudart() + r = cudart.cudaHostRegister(t.data_ptr(), t.numel() * t.element_size(), 0) + self.assertEqual(r, 0) + self.assertTrue(t.is_pinned()) + r = cudart.cudaHostUnregister(t.data_ptr()) + self.assertEqual(r, 0) + self.assertFalse(t.is_pinned()) + def test_memory_stats(self): gc.collect() torch.cuda.empty_cache() @@ -1720,6 +1734,7 @@ def test_streaming_backwards_device_transfer(self): self.assertTrue(b.grad.sum().item() == 4 * size) @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") + @unittest.skipIf(IS_SANDCASTLE,
"Does not work on Sandcastle") def test_cuda_init_race(self): # See https://github.com/pytorch/pytorch/issues/16559 import subprocess @@ -1736,32 +1751,102 @@ def worker(rank): t2.start() """]) - def test_grad_scaling_builtins(self, device="cuda", dtype=torch.float): - inv_scale = torch.tensor([0.25], dtype=dtype, device=device) + def test_grad_scaling_unscale(self, dtype=torch.float): + inv_scale = torch.full((1,), 0.25, dtype=torch.float, device="cuda:0") + found_inf = torch.full((1,), 0.0, dtype=torch.float, device="cuda:0") + + size = 10 + g = torch.full((size, size), 4.0, dtype=dtype, device="cuda:0") + ginf = g.clone() + ginf[2, 2] = float('inf') + gnan = g.clone() + gnan[2, 2] = float('nan') + + # Tries selected combinations of + # - contiguous grads + # - g.clone().t() which is not contiguous but still non overlapping and dense + # - variants of g.clone()[:, :5] which are not non overlapping and dense + # Non overlapping and dense grads route into a multi tensor apply kernel, + # others use a fallback per-tensor kernel, so we should try both. + cases = ( + ([g.clone(), g.clone()], False), + ([g.clone(), g.clone().t()], False), + ([g.clone(), g.clone()[:, :5]], False), + ([g.clone()[:, :5], g.clone()[:, :5]], False), + ([g.clone(), ginf.clone()], True), + ([g.clone(), gnan.clone()], True), + ([g.clone(), ginf.clone()[:, :5]], True), + ([g.clone(), gnan.clone()[:, :5]], True), + ([ginf.clone(), g.clone()[:, :5]], True), + ([ginf.clone()[:, :5], g.clone()[:, :5]], True), + ) + + for grads, has_inf in cases: + found_inf.zero_() + torch._amp_foreach_non_finite_check_and_unscale_(grads, found_inf, inv_scale) + if has_inf: + self.assertEqual(found_inf, 1.0) + else: + self.assertEqual(found_inf, 0.0) + for grad in grads: + self.assertTrue(torch.allclose(grad, torch.ones_like(grad), atol=1e-7)) - found_inf = torch.tensor([0.0], dtype=dtype, device=device) - g = torch.tensor([4.0], dtype=dtype, device=device) - torch._amp_non_finite_check_and_unscale_(g, found_inf, inv_scale) - self.assertEqual(found_inf, 0.0) - self.assertTrue(torch.allclose(g, torch.ones(10, dtype=torch.float32, device="cuda"), atol=1e-7)) + # Passing lists with mismatched devices or dtypes to a raw + # _amp_foreach_non_finite_check_and_unscale_ call should raise errors. + with self.assertRaisesRegex(RuntimeError, r"must have the same dtype"): + torch._amp_foreach_non_finite_check_and_unscale_([g.clone(), g.to(dtype=torch.float16)], + found_inf, + inv_scale) - found_inf.zero_() - g = torch.tensor([float('inf')], dtype=dtype, device=device) - torch._amp_non_finite_check_and_unscale_(g, found_inf, inv_scale) - self.assertEqual(found_inf, 1.0) + if TEST_MULTIGPU: + with self.assertRaisesRegex(RuntimeError, r"scaled_grads must be on the same device."): + torch._amp_foreach_non_finite_check_and_unscale_([g.clone(), g.to(device="cuda:1")], + found_inf, + inv_scale) + + # Creates a list of grads with mismatched dtypes and devices, to ensure + # scaler._unscale_grads_ organizes grads by dtype and device before calling + # _amp_foreach_non_finite_check_and_unscale_ on each set. + # If inject_inf >= 0, writes an inf into one grad for _unscale_grads_ to find. 
+ def perfect_storm_grads(inject_inf): + grads = [g.clone(), g.clone()[:, :5], g.to(dtype=torch.float16), g.to(dtype=torch.float16)] + if TEST_MULTIGPU: + grads += [g.to(device="cuda:1"), + g.to(device="cuda:1")[:, :5], + g.to(device="cuda:1", dtype=torch.float16), + g.to(device="cuda:1", dtype=torch.float16)] + if inject_inf >= 0: + grads[inject_inf][2, 2] = float('inf') + return grads - found_inf.zero_() - g = torch.tensor([float('nan')], dtype=dtype, device=device) - torch._amp_non_finite_check_and_unscale_(g, found_inf, inv_scale) - self.assertEqual(found_inf, 1.0) + scaler = torch.cuda.amp.GradScaler() + dummy_params = [torch.empty_like(g) for g in perfect_storm_grads(-1)] + dummy_opt = torch.optim.SGD(dummy_params, lr=1.) + + # Ensures the inf/nan checking can find an inf injected onto any grad in the perfect storm. + for inject_inf in range(-1, len(dummy_params)): + found_inf = torch.full((1,), 0.0, dtype=torch.float, device="cuda:0") + grads = perfect_storm_grads(inject_inf) + for i, p in enumerate(dummy_params): + p.grad = grads[i] + found_inf_per_device = scaler._unscale_grads_(dummy_opt, inv_scale, found_inf, True) + if inject_inf < 0: + # No inf was injected, ensures unscaling worked normally. + self.assertTrue(sum(v.item() for v in found_inf_per_device.values()) == 0) + for grad in grads: + self.assertTrue(torch.allclose(grad, torch.ones_like(grad), atol=1e-7)) + else: + # inf was injected, ensures inf was found. + self.assertTrue(sum(v.item() for v in found_inf_per_device.values()) == 1) + def test_grad_scaling_update_scale(self, device="cuda", dtype=torch.float): growth = 2.0 backoff = 0.25 growth_interval = 2 - scale = torch.tensor([4.0], dtype=dtype, device=device) - growth_tracker = torch.tensor([0], dtype=torch.int32, device=device) + scale = torch.full((1,), 4.0, dtype=dtype, device=device) + growth_tracker = torch.full((1,), 0.0, dtype=torch.int32, device=device) + found_inf = torch.full((1,), 0.0, dtype=torch.float, device="cuda:0") - found_inf.zero_() # Simulates 2 consecutive unskipped iterations scale = torch._amp_update_scale(growth_tracker, scale, found_inf, growth, backoff, growth_interval) self.assertEqual(growth_tracker, 1) @@ -1779,7 +1864,7 @@ def test_grad_scaling_builtins(self, device="cuda", dtype=torch.float): def test_grad_scaling_unscale_sparse(self, device="cuda", dtype=torch.float): scaler = torch.cuda.amp.GradScaler() - inv_scale = torch.tensor([0.25], dtype=dtype, device=device) + inv_scale = torch.full((1,), 0.25, dtype=dtype, device=device) found_inf = torch.empty((1,), dtype=dtype, device=device) cur = found_inf.device @@ -1842,6 +1927,7 @@ def test_grad_scaling_device_as_key(self): # are treated as identical keys by dicts. GradScaler relies on this behavior, and may # error otherwise in a way that's difficult to detect (a silent performance hit). d = {} + t = torch.empty((1,), device="cuda:0") dev0a = torch.device("cuda:0") dev0b = torch.device("cuda:0") dev1a = torch.device("cuda:1") @@ -1854,6 +1940,9 @@ def test_grad_scaling_device_as_key(self): d[dev0b] = "0b" self.assertTrue(len(d) == 1) self.assertTrue(d[dev0a] == "0b") + d[t.device] = "t" + self.assertTrue(len(d) == 1) + self.assertTrue(d[dev0a] == "t") d[dev1a] = "1a" d[dev1b] = "1b" @@ -1863,8 +1952,8 @@ def test_grad_scaling_device_as_key(self): @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") def test_grad_scaling_scale(self): scaler = torch.cuda.amp.GradScaler(init_scale=2.) 
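# The "perfect storm" case above relies on scaler._unscale_grads_ bucketing gradients by
# (device, dtype) before each foreach kernel launch. A rough standalone sketch of that
# bucketing idea (group_by_device_and_dtype is an illustrative helper, not GradScaler's
# actual implementation):
from collections import defaultdict
import torch

def group_by_device_and_dtype(grads):
    buckets = defaultdict(list)
    for g in grads:
        buckets[(g.device, g.dtype)].append(g)
    return buckets

grads = [torch.ones(2, 2), torch.ones(2, 2, dtype=torch.float16), torch.ones(3)]
for (device, dtype), bucket in group_by_device_and_dtype(grads).items():
    # each homogeneous bucket could then be handed to
    # torch._amp_foreach_non_finite_check_and_unscale_ in a single call
    assert all(g.device == device and g.dtype == dtype for g in bucket)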
- t0 = torch.tensor([4.0], dtype=torch.float32, device="cuda:0") - t1 = torch.tensor([4.0], dtype=torch.float32, device="cuda:1") + t0 = torch.full((1,), 4.0, dtype=torch.float32, device="cuda:0") + t1 = torch.full((1,), 4.0, dtype=torch.float32, device="cuda:1") # Create some nested iterables of tensors on different devices. outputs = (t1.clone(), (t0.clone(), t1.clone()), [t0.clone(), (t1.clone(), t0.clone())]) outputs = scaler.scale(outputs) @@ -1882,7 +1971,7 @@ def test_grad_scaling_state_dict(self): if lazy_init_scale: # Dummy scale() call to ensure the scale tensor is lazily initialized. - s1.scale(torch.tensor([4.0], dtype=torch.float32, device="cuda:0")) + s1.scale(torch.full((1,), 4.0, dtype=torch.float32, device="cuda:0")) self.assertTrue(isinstance(s1._scale, torch.cuda.FloatTensor)) s1.load_state_dict(s0.state_dict()) @@ -2393,7 +2482,7 @@ def cast(val, to_type): "{} not found as an attribute on either Tensor or the requested module {}".format( op, module)) - # Accounts for ops that return tuples and other non-Tensors. + # Accounts for ops that return Tensors, iterables, and other non-Tensors. # For example, lstm_cell returns a tuple and equal returns bool. def compare(first, second): if isinstance(first, torch.Tensor): @@ -3047,6 +3136,48 @@ def test_matmul_device_mismatch(self): with self.assertRaisesRegex(RuntimeError, "expected (it|them) to be on GPU"): torch.addmm(s, m1, m2) + @unittest.skipIf(not TEST_MULTIGPU, "Test needs multiple GPUs") + def test_scatter_namedtuple(self): + # tests ability to scatter namedtuples and retrieve a list where each + # element is of the expected namedtuple type. + fields = ("a", "b") + TestNamedTupleInput_0 = collections.namedtuple("NamedTuple", fields) + num_gpus = torch.cuda.device_count() + a = torch.rand(num_gpus * 2, device=0) + b = torch.rand(num_gpus * 2, device=0) + a_tensors_for_gpu = [a[2 * i : 2 * i + 2].to(i) for i in range(num_gpus)] + b_tensors_for_gpu = [b[2 * i : 2 * i + 2].to(i) for i in range(num_gpus)] + + inp = TestNamedTupleInput_0(a, b) + target_gpus = [torch.device(i) for i in range(num_gpus)] + scatter_out = scatter_gather.scatter(inp, target_gpus) + + for i, x in enumerate(scatter_out): + self.assertTrue(isinstance(x, type(inp))) + self.assertEqual(x._fields, fields) + expected_a = a_tensors_for_gpu[i] + expected_b = b_tensors_for_gpu[i] + self.assertEqual(expected_a, x.a) + self.assertEqual(expected_b, x.b) + + class TestNamedTupleInput_1(NamedTuple): + a: torch.tensor + b: torch.tensor + + a = torch.rand(num_gpus * 2, device=0) + b = torch.rand(num_gpus * 2, device=0) + a_tensors_for_gpu = [a[2 * i : 2 * i + 2].to(i) for i in range(num_gpus)] + b_tensors_for_gpu = [b[2 * i : 2 * i + 2].to(i) for i in range(num_gpus)] + inp = TestNamedTupleInput_1(a, b) + + scatter_out = scatter_gather.scatter(inp, target_gpus) + for i, x in enumerate(scatter_out): + self.assertTrue(isinstance(x, type(inp))) + self.assertEqual(x._fields, fields) + expected_a = a_tensors_for_gpu[i] + expected_b = b_tensors_for_gpu[i] + self.assertEqual(expected_a, x.a) + self.assertEqual(expected_b, x.b) if __name__ == '__main__': run_tests() diff --git a/test/test_dataloader.py b/test/test_dataloader.py index ce23593ec7bc..9074cc3c0b7d 100644 --- a/test/test_dataloader.py +++ b/test/test_dataloader.py @@ -11,8 +11,10 @@ import itertools import warnings import tempfile +import random from torch import multiprocessing as mp -from torch.utils.data import _utils, Dataset, IterableDataset, TensorDataset, DataLoader, ConcatDataset, ChainDataset +from 
torch.utils.data import (_utils, Dataset, IterableDataset, TensorDataset, DataLoader, ConcatDataset, + ChainDataset, BufferedShuffleDataset) from torch.utils.data._utils import MP_STATUS_CHECK_INTERVAL from torch.utils.data.dataset import random_split from torch._utils import ExceptionWrapper @@ -710,6 +712,10 @@ def init_fn(worker_id): torch.manual_seed(12345) +def shuffle_ds_init_fn(worker_id): + random.seed(123) + + # used with test_error_in_init class ErrorIterableDataset(IterableDataset): def __iter__(self): @@ -1213,6 +1219,37 @@ def test_chain_iterable_style_dataset(self): with self.assertRaisesRegex(AssertionError, "ChainDataset only supports IterableDataset"): list(iter(ChainDataset([dataset1, self.dataset]))) + def test_buffer_shuffle_dataset(self): + dataset = CountingIterableDataset(20) + expected = list(range(20)) + buffer_sizes = [5, 20, 25] + for num_workers in [0, 1]: + # Buffer Size <= 1: Not shuffled dataset + fetched_nos = list(self._get_data_loader(BufferedShuffleDataset(dataset, 1), num_workers=num_workers)) + self.assertEqual(len(fetched_nos), len(expected)) + for e, d in zip(expected, fetched_nos): + self.assertIsInstance(d, torch.Tensor) + self.assertEqual(e, d) + # Buffer Size > 1: Shuffled dataset + for buffer_size in buffer_sizes: + fetched = sorted(list(self._get_data_loader(BufferedShuffleDataset(dataset, buffer_size), num_workers=num_workers))) + self.assertEqual(len(fetched), len(expected)) + for e, d in zip(expected, fetched): + self.assertIsInstance(d, torch.Tensor) + self.assertEqual(e, d) + # Random Seed for single process + random.seed(123) + fetched_seed1 = list(self._get_data_loader(BufferedShuffleDataset(dataset, buffer_size), num_workers=num_workers, + worker_init_fn=shuffle_ds_init_fn)) + random.seed(123) + fetched_seed2 = list(self._get_data_loader(BufferedShuffleDataset(dataset, buffer_size), num_workers=num_workers, + worker_init_fn=shuffle_ds_init_fn)) + self.assertEqual(len(fetched_seed1), len(fetched_seed2)) + for d1, d2 in zip(fetched_seed1, fetched_seed2): + self.assertIsInstance(d1, torch.Tensor) + self.assertIsInstance(d2, torch.Tensor) + self.assertEqual(d1, d2) + def test_multiprocessing_contexts(self): reference = [ torch.arange(3), diff --git a/test/test_dispatch.py b/test/test_dispatch.py index ec9fd20797e3..45480d8916f0 100644 --- a/test/test_dispatch.py +++ b/test/test_dispatch.py @@ -229,11 +229,11 @@ def test_def(self): # m.impl("test_def", [](const Tensor& x) { return x }) lambda m: m.impl_t_t("foo"), # m.impl("test_def", kCPU, [](const Tensor& x) { return x }) - lambda m: m.impl_t_t("foo", dispatch="cpu"), + lambda m: m.impl_t_t("foo", dispatch="CPU"), # m.impl("test_def", kAutograd, [](const Tensor& x) { return x }) - lambda m: m.impl_t_t("foo", dispatch="autograd"), + lambda m: m.impl_t_t("foo", dispatch="Autograd"), # m.impl("test_def", kAutogradCPU, [](const Tensor& x) { return x }) - lambda m: m.impl_t_t("foo", dispatch="autogradcpu") + lambda m: m.impl_t_t("foo", dispatch="AutogradCPU") ]).state self.assertExpectedInline(state, '''\ name: test::foo @@ -262,11 +262,11 @@ def test_def_with_inference(self): # m.def("foo", [](const Tensor & x) { return x }) lambda m: m.def_name_t_t("foo"), # m.impl("foo", torch::kCPU, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "cpu"), + lambda m: m.impl_t_t("foo", "CPU"), # m.impl("foo", torch::kAutograd, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "autograd"), + lambda m: m.impl_t_t("foo", "Autograd"), # m.impl("foo", torch::kAutogradCPU, [](const 
Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "autogradcpu") + lambda m: m.impl_t_t("foo", "AutogradCPU") ]).state self.assertExpectedInline(state, '''\ name: test::foo @@ -296,11 +296,11 @@ def test_impl_only(self): # m.impl("foo", [](const Tensor& x) { return x }) lambda m: m.impl_t_t("foo"), # m.impl("foo", torch::kCPU, [](const Tensor& x) { return x }) - lambda m: m.impl_t_t("foo", "cpu"), + lambda m: m.impl_t_t("foo", "CPU"), # m.impl("foo", torch::kAutograd, [](const Tensor& x) { return x }) - lambda m: m.impl_t_t("foo", "autograd"), + lambda m: m.impl_t_t("foo", "Autograd"), # m.impl("foo", torch::kAutogradCPU, [](const Tensor& x) { return x }) - lambda m: m.impl_t_t("foo", "autogradcpu") + lambda m: m.impl_t_t("foo", "AutogradCPU") ]).state self.assertExpectedInline(state, '''\ name: test::foo @@ -316,13 +316,13 @@ def test_computed_table(self): # m.def("foo", [](const Tensor & x) { return x }) lambda m: m.def_name_t_t("foo"), # m.impl("foo", torch::kCPU, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "cpu", debug="fn_cpu"), + lambda m: m.impl_t_t("foo", "CPU", debug="fn_cpu"), # m.impl("foo", torch::kCUDA, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "xla", debug="fn_xla"), + lambda m: m.impl_t_t("foo", "XLA", debug="fn_xla"), # m.impl("foo", torch::kAutograd, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "autograd", debug="fn_autograd"), + lambda m: m.impl_t_t("foo", "Autograd", debug="fn_autograd"), # m.impl("foo", torch::kAutogradCPU, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "autogradcpu", debug="fn_autogradcpu") + lambda m: m.impl_t_t("foo", "AutogradCPU", debug="fn_autogradcpu") ]) state, table = result.state, result.table self.assertExpectedInline(state, '''\ @@ -351,12 +351,12 @@ def test_computed_table(self): ''') def test_computed_table_with_cpu_catchall(self): - global_m = C._dispatch_library("IMPL", "_", "autogradcpu") + global_m = C._dispatch_library("IMPL", "_", "AutogradCPU") result = self.commute("foo", [ # m.def("foo", [](const Tensor & x) { return x }) lambda m: m.def_name_t_t("foo"), # m.impl("foo", torch::kCPU, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "cpu"), + lambda m: m.impl_t_t("foo", "CPU"), ]) state, table = result.state, result.table self.assertExpectedInline(state, '''\ @@ -382,12 +382,12 @@ def test_computed_table_with_cpu_catchall(self): ''') def test_computed_table_with_math(self): - global_m = C._dispatch_library("IMPL", "_", "autogradcpu") + global_m = C._dispatch_library("IMPL", "_", "AutogradCPU") result = self.commute("foo", [ # m.def("foo(Tensor x) -> Tensor") lambda m: m.def_("foo(Tensor x) -> Tensor"), # m.impl("foo", torch::kMath, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "math"), + lambda m: m.impl_t_t("foo", "Math"), ]) state, table = result.state, result.table self.assertExpectedInline(state, '''\ @@ -412,14 +412,14 @@ def test_computed_table_with_math(self): ''') def test_computed_table_with_cpu_math(self): - global_m = C._dispatch_library("IMPL", "_", "autogradcpu") + global_m = C._dispatch_library("IMPL", "_", "AutogradCPU") result = self.commute("foo", [ # m.def("foo(Tensor x) -> Tensor") lambda m: m.def_("foo(Tensor x) -> Tensor"), # m.impl("foo", torch::kCPU, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "cpu", debug="fn_cpu"), + lambda m: m.impl_t_t("foo", "CPU", debug="fn_cpu"), # m.impl("foo", torch::kMath, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", 
"math", debug="fn_math"), + lambda m: m.impl_t_t("foo", "Math", debug="fn_math"), ]) state, table = result.state, result.table self.assertExpectedInline(state, '''\ @@ -445,12 +445,12 @@ def test_computed_table_with_cpu_math(self): ''') def test_computed_table_with_autograd(self): - global_m = C._dispatch_library("IMPL", "_", "autogradcpu") + global_m = C._dispatch_library("IMPL", "_", "AutogradCPU") result = self.commute("foo", [ # m.def("foo(Tensor x) -> Tensor") lambda m: m.def_("foo(Tensor x) -> Tensor"), # m.impl("foo", torch::kAutograd, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "autograd"), + lambda m: m.impl_t_t("foo", "Autograd"), ]) state, table = result.state, result.table self.assertExpectedInline(state, '''\ @@ -476,11 +476,11 @@ def test_computed_table_with_cpu_autograd_math_catchall(self): # m.def("foo", [](const Tensor & x) { return x }) lambda m: m.def_name_t_t("foo"), # m.impl("foo", torch::kCPU, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "cpu", debug="fn_cpu"), + lambda m: m.impl_t_t("foo", "CPU", debug="fn_cpu"), # m.impl("foo", torch::kAutograd, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "autograd", debug="fn_autograd"), + lambda m: m.impl_t_t("foo", "Autograd", debug="fn_autograd"), # m.impl("foo", torch::kMath, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "math", debug="fn_math"), + lambda m: m.impl_t_t("foo", "Math", debug="fn_math"), ]) state, table = result.state, result.table self.assertExpectedInline(state, '''\ @@ -512,9 +512,9 @@ def test_computed_table_with_cpu_autograd_catchall(self): # m.def("foo", [](const Tensor & x) { return x }) lambda m: m.def_name_t_t("foo"), # m.impl("foo", torch::kCPU, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "cpu", debug="fn_cpu"), + lambda m: m.impl_t_t("foo", "CPU", debug="fn_cpu"), # m.impl("foo", torch::kAutograd, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "autograd", debug="fn_autograd"), + lambda m: m.impl_t_t("foo", "Autograd", debug="fn_autograd"), ]) state, table = result.state, result.table self.assertExpectedInline(state, '''\ @@ -538,6 +538,39 @@ def test_computed_table_with_cpu_autograd_catchall(self): AutogradCPU: fn_autograd [autograd kernel] AutogradCUDA: fn_autograd [autograd kernel] AutogradXLA: fn_autograd [autograd kernel] +''') + + def test_computed_table_with_ambiguous_autogradother(self): + result = self.commute("foo", [ + # m.def("foo", [](const Tensor & x) { return x }) + lambda m: m.def_name_t_t("foo"), + # m.impl("foo", torch::kMath, [](const Tensor & x) { return x }) + lambda m: m.impl_t_t("foo", "Math", debug="fn_math"), + # m.impl("foo", torch::kQuantizedCPU, [](const Tensor & x) { return x }) + lambda m: m.impl_t_t("foo", "QuantizedCPU", debug="fn_quantizedcpu"), + ]) + state, table = result.state, result.table + self.assertExpectedInline(state, '''\ +name: test::foo +schema: test::foo(Tensor _0) -> (Tensor _0) +debug: registered at /dev/null:0 +alias analysis kind: CONSERVATIVE +QuantizedCPU: fn_quantizedcpu :: (Tensor _0) -> (Tensor _0) [ boxed unboxed ] +Math[alias]: fn_math :: (Tensor _0) -> (Tensor _0) [ boxed unboxed ] +catchall: default_def_name_t_t :: (Tensor _0) -> (Tensor _0) [ boxed unboxed ] +''') + + # computed dispatch table is too big, so we only check on a few entries we're interested in. 
+ extracted_table = extract_dispatch_table_with_keys(table, dispatch_keys_to_check) + + self.assertExpectedInline(extracted_table, '''\ +CPU: fn_math [math kernel] +CUDA: fn_math [math kernel] +XLA: fn_math [math kernel] +AutogradOther: ambiguous_autogradother [ambiguous autogradother] +AutogradCPU: fn_math [math kernel] +AutogradCUDA: fn_math [math kernel] +AutogradXLA: fn_math [math kernel] ''') # Can't do this yet for BC reasons @@ -631,7 +664,7 @@ def test_multiple_def_alias_mismatch(self): ) def test_multiple_fallback(self): - global_m = C._dispatch_library("IMPL", "_", "xla") + global_m = C._dispatch_library("IMPL", "_", "XLA") global_m.fallback_fallthrough(), try: global_m.fallback_fallthrough(), diff --git a/test/test_foreach.py b/test/test_foreach.py index 8369ba5b9be5..683b4fe28167 100644 --- a/test/test_foreach.py +++ b/test/test_foreach.py @@ -1,24 +1,47 @@ import torch import unittest -from torch.testing._internal.common_utils import TestCase, run_tests +from torch.testing._internal.common_utils import TestCase, run_tests, TEST_WITH_ROCM from torch.testing._internal.common_device_type import instantiate_device_type_tests, dtypes, skipCUDAIfRocm class TestForeach(TestCase): - bin_ops = [ + foreach_bin_ops = [ torch._foreach_add, - torch._foreach_add_, torch._foreach_sub, - torch._foreach_sub_, torch._foreach_mul, - torch._foreach_mul_, torch._foreach_div, + ] + + foreach_bin_ops_ = [ + torch._foreach_add_, + torch._foreach_sub_, + torch._foreach_mul_, torch._foreach_div_, ] + foreach_bin_ops_sl = [ + torch._foreach_add_scalar_list, + torch._foreach_sub_scalar_list, + torch._foreach_mul_scalar_list, + torch._foreach_div_scalar_list, + ] + + foreach_bin_ops_sl_ = [ + torch._foreach_add_scalar_list_, + torch._foreach_sub_scalar_list_, + torch._foreach_mul_scalar_list_, + torch._foreach_div_scalar_list_, + ] + + torch_bin_ops = [ + torch.add, + torch.sub, + torch.mul, + torch.div, + ] + def _get_test_data(self, device, dtype, N): if dtype in [torch.bfloat16, torch.bool, torch.float16]: tensors = [torch.randn(N, N, device=device).to(dtype) for _ in range(N)] - elif dtype in torch.testing.get_all_int_dtypes(): tensors = [torch.randint(1, 100, (N, N), device=device, dtype=dtype) for _ in range(N)] else: @@ -26,50 +49,83 @@ def _get_test_data(self, device, dtype, N): return tensors - def _test_bin_op_list(self, device, dtype, foreach_op, foreach_op_, torch_op, N=20): - tensors1 = self._get_test_data(device, dtype, N) - tensors2 = self._get_test_data(device, dtype, N) - - expected = [torch_op(tensors1[i], tensors2[i]) for i in range(N)] - res = foreach_op(tensors1, tensors2) - foreach_op_(tensors1, tensors2) - self.assertEqual(res, tensors1) - self.assertEqual(tensors1, expected) - - def _test_unary_op(self, device, dtype, foreach_op, foreach_op_, torch_op, N=20): - tensors1 = self._get_test_data(device, dtype, N) - expected = [torch_op(tensors1[i]) for i in range(N)] - res = foreach_op(tensors1) - foreach_op_(tensors1) - self.assertEqual(res, tensors1) - self.assertEqual(tensors1, expected) - - def _test_pointwise_op(self, device, dtype, foreach_op, foreach_op_, torch_op, N=20): - tensors = self._get_test_data(device, dtype, N) - tensors1 = self._get_test_data(device, dtype, N) - tensors2 = self._get_test_data(device, dtype, N) - value = 2 - - expected = [torch_op(tensors[i], tensors1[i], tensors2[i], value=value) for i in range(N)] - - res = foreach_op(tensors, tensors1, tensors2, value) - foreach_op_(tensors, tensors1, tensors2, value) - self.assertEqual(res, tensors) - 
self.assertEqual(tensors, expected) - - def _test_bin_op_list_alpha(self, device, dtype, foreach_op, foreach_op_, torch_op, N=20): - tensors1 = self._get_test_data(device, dtype, N) - tensors2 = self._get_test_data(device, dtype, N) - alpha = 2 - - expected = [torch_op(tensors1[i], torch.mul(tensors2[i], alpha)) for i in range(N)] - res = foreach_op(tensors1, tensors2, alpha) - foreach_op_(tensors1, tensors2, alpha) - self.assertEqual(res, tensors1) - - if dtype == torch.bool: - expected = [e.to(torch.bool) for e in expected] - self.assertEqual(tensors1, expected) + def _test_bin_op_list(self, device, dtype, foreach_op, foreach_op_, torch_op): + for N in [30, 300]: + tensors1 = self._get_test_data(device, dtype, N) + tensors2 = self._get_test_data(device, dtype, N) + + # Mimics cuda kernel dtype flow. With fp16/bf16 input, runs in fp32 and casts output back to fp16/bf16. + control_dtype = torch.float32 if (self.device_type == 'cuda' and + (dtype is torch.float16 or dtype is torch.bfloat16)) else dtype + expected = [torch_op(tensors1[i].to(dtype=control_dtype), + tensors2[i].to(dtype=control_dtype)).to(dtype=dtype) for i in range(N)] + res = foreach_op(tensors1, tensors2) + foreach_op_(tensors1, tensors2) + self.assertEqual(res, tensors1) + if (dtype is torch.float16 or dtype is torch.bfloat16) and TEST_WITH_ROCM: + self.assertEqual(tensors1, expected, atol=1.e-3, rtol=self.dtype_precisions[dtype][0]) + else: + self.assertEqual(tensors1, expected) + + def _test_unary_op(self, device, dtype, foreach_op, foreach_op_, torch_op): + for N in [30, 300]: + tensors1 = self._get_test_data(device, dtype, N) + # Mimics cuda kernel dtype flow. With fp16/bf16 input, runs in fp32 and casts output back to fp16/bf16. + control_dtype = torch.float32 if (self.device_type == 'cuda' and + (dtype is torch.float16 or dtype is torch.bfloat16)) else dtype + expected = [torch_op(tensors1[i].to(dtype=control_dtype)).to(dtype=dtype) for i in range(N)] + res = foreach_op(tensors1) + foreach_op_(tensors1) + self.assertEqual(res, tensors1) + if (dtype is torch.float16 or dtype is torch.bfloat16) and TEST_WITH_ROCM: + self.assertEqual(tensors1, expected, atol=1.e-3, rtol=self.dtype_precisions[dtype][0]) + else: + self.assertEqual(tensors1, expected) + + def _test_pointwise_op(self, device, dtype, foreach_op, foreach_op_, torch_op): + for N in [30, 300]: + tensors = self._get_test_data(device, dtype, N) + tensors1 = self._get_test_data(device, dtype, N) + tensors2 = self._get_test_data(device, dtype, N) + value = 2 + + # Mimics cuda kernel dtype flow. With fp16/bf16 input, runs in fp32 and casts output back to fp16/bf16. 
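# The "control dtype" comments in these tests describe a common pattern: for half/bfloat16
# inputs on CUDA the foreach kernels accumulate in float32, so the reference value is
# computed at float32 and cast back before comparison. A condensed sketch (the device
# checks done by the real tests are omitted here):
import torch

t = torch.randn(4, 4).half()
control = torch.float32 if t.dtype in (torch.float16, torch.bfloat16) else t.dtype
expected = (t.to(control) + 3.3).to(t.dtype)  # reference computed at higher precision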
+ control_dtype = torch.float32 if (self.device_type == 'cuda' and + (dtype is torch.float16 or dtype is torch.bfloat16)) else dtype + expected = [torch_op(tensors[i].to(dtype=control_dtype), + tensors1[i].to(dtype=control_dtype), + tensors2[i].to(dtype=control_dtype), value=value).to(dtype=dtype) for i in range(N)] + + res = foreach_op(tensors, tensors1, tensors2, value) + foreach_op_(tensors, tensors1, tensors2, value) + self.assertEqual(res, tensors) + if (dtype is torch.float16 or dtype is torch.bfloat16) and TEST_WITH_ROCM: + self.assertEqual(tensors, expected, atol=1.e-3, rtol=self.dtype_precisions[dtype][0]) + else: + self.assertEqual(tensors, expected) + + def _test_bin_op_list_alpha(self, device, dtype, foreach_op, foreach_op_, torch_op): + for N in [30, 300]: + tensors1 = self._get_test_data(device, dtype, N) + tensors2 = self._get_test_data(device, dtype, N) + alpha = 2 + + # Mimics cuda kernel dtype flow. With fp16/bf16 input, runs in fp32 and casts output back to fp16/bf16. + control_dtype = torch.float32 if (self.device_type == 'cuda' and + (dtype is torch.float16 or dtype is torch.bfloat16)) else dtype + expected = [torch_op(tensors1[i].to(dtype=control_dtype), + torch.mul(tensors2[i].to(dtype=control_dtype), + alpha)).to(dtype=dtype) for i in range(N)] + res = foreach_op(tensors1, tensors2, alpha=alpha) + foreach_op_(tensors1, tensors2, alpha=alpha) + self.assertEqual(res, tensors1) + + if dtype == torch.bool: + expected = [e.to(torch.bool) for e in expected] + if (dtype is torch.float16 or dtype is torch.bfloat16) and TEST_WITH_ROCM: + self.assertEqual(tensors1, expected, atol=1.e-3, rtol=self.dtype_precisions[dtype][0]) + else: + self.assertEqual(tensors1, expected) # # Unary ops @@ -88,7 +144,7 @@ def test_exp(self, device, dtype): @skipCUDAIfRocm @dtypes(*torch.testing.get_all_dtypes(include_bfloat16=False, include_bool=False, include_complex=False)) def test_addcmul(self, device, dtype): - if device == 'cpu': + if self.device_type == 'cpu': if dtype == torch.half: with self.assertRaisesRegex(RuntimeError, r"\"addcmul_cpu_out\" not implemented for \'Half\'"): self._test_pointwise_op(device, dtype, torch._foreach_addcmul, @@ -105,7 +161,7 @@ def test_addcdiv(self, device, dtype): self._test_pointwise_op(device, dtype, torch._foreach_addcdiv, torch._foreach_addcdiv_, torch.addcdiv) return - if device == 'cpu': + if self.device_type == 'cpu': if dtype == torch.half: with self.assertRaisesRegex(RuntimeError, r"\"addcdiv_cpu_out\" not implemented for \'Half\'"): self._test_pointwise_op(device, dtype, torch._foreach_addcdiv, @@ -118,83 +174,398 @@ def test_addcdiv(self, device, dtype): # @dtypes(*torch.testing.get_all_dtypes()) def test_int_scalar(self, device, dtype): - tensors = [torch.zeros(10, 10, device=device, dtype=dtype) for _ in range(10)] - int_scalar = 1 - - # bool tensor + 1 will result in int64 tensor - if dtype == torch.bool: - expected = [torch.ones(10, 10, device=device, dtype=torch.int64) for _ in range(10)] - else: - expected = [torch.ones(10, 10, device=device, dtype=dtype) for _ in range(10)] - - res = torch._foreach_add(tensors, int_scalar) - self.assertEqual(res, expected) - - if dtype in [torch.bool]: - with self.assertRaisesRegex(RuntimeError, - "result type Long can't be cast to the desired output type Bool"): - torch._foreach_add_(tensors, int_scalar) - else: - torch._foreach_add_(tensors, int_scalar) - self.assertEqual(res, tensors) + for N in [30, 300]: + for foreach_bin_op, foreach_bin_op_, torch_bin_op in zip(self.foreach_bin_ops, + 
self.foreach_bin_ops_, + self.torch_bin_ops): + tensors = self._get_test_data(device, dtype, N) + scalar = 3 + expected = [torch_bin_op(t, scalar) for t in tensors] + + res = foreach_bin_op(tensors, scalar) + + if dtype == torch.bool: + self.assertEqual(res, expected) + + with self.assertRaisesRegex(RuntimeError, "can't be cast to the desired output type"): + foreach_bin_op_(tensors, scalar) + return + + + if foreach_bin_op_ == torch._foreach_div_ and dtype in torch.testing.integral_types() and self.device_type == "cpu": + with self.assertRaisesRegex(RuntimeError, + "can't be cast to the desired output type"): + foreach_bin_op_(tensors, scalar) + return + + # TODO[type promotion]: Fix once type promotion is enabled. + if dtype in torch.testing.integral_types() and self.device_type == 'cuda': + self.assertEqual(res, [e.to(dtype) for e in expected]) + + foreach_bin_op_(tensors, scalar) + self.assertEqual(tensors, [e.to(dtype) for e in expected]) + else: + self.assertEqual(res, expected) + foreach_bin_op_(tensors, scalar) + self.assertEqual(tensors, expected) + + # TODO[Fix scalar list]: + # We need to update codegen to correctly handle function overloads with float[] and int[]. + # As optimizers work with float tensors, the result will always be torch.float32 for now. + # Current schema is using 'float[]' as scalar list type. + @dtypes(*torch.testing.get_all_dtypes()) + def test_int_scalarlist(self, device, dtype): + for N in [30, 300]: + for foreach_bin_op, foreach_bin_op_, torch_bin_op in zip(self.foreach_bin_ops_sl, + self.foreach_bin_ops_sl_, + self.torch_bin_ops): + tensors = self._get_test_data(device, dtype, N) + scalars = [1 for _ in range(N)] + expected = [torch_bin_op(t, s) for t, s in zip(tensors, scalars)] + + # we dont support bool and complex types on CUDA for now + if (dtype in torch.testing.get_all_complex_dtypes() or dtype == torch.bool) and self.device_type == 'cuda': + with self.assertRaisesRegex(RuntimeError, "not implemented for"): + foreach_bin_op_(tensors, scalars) + + with self.assertRaisesRegex(RuntimeError, "not implemented for"): + foreach_bin_op(tensors, scalars) + return + + res = foreach_bin_op(tensors, scalars) + + if dtype == torch.bool: + self.assertEqual(res, [torch_bin_op(t.to(torch.float32), s) for t, s in zip(tensors, scalars)]) + + with self.assertRaisesRegex(RuntimeError, "result type Float can't be cast to the desired output type"): + foreach_bin_op_(tensors, scalars) + return + + if dtype in torch.testing.integral_types(): + if self.device_type == 'cpu': + self.assertEqual(res, [e.to(torch.float32) for e in expected]) + else: + # TODO[type promotion]: Fix once type promotion is enabled. + self.assertEqual(res, [e.to(dtype) for e in expected]) + else: + self.assertEqual(res, expected) + + if dtype in torch.testing.integral_types() and self.device_type == 'cpu': + with self.assertRaisesRegex(RuntimeError, "result type Float can't be cast to the desired output type"): + foreach_bin_op_(tensors, scalars) + return + else: + foreach_bin_op_(tensors, scalars) + self.assertEqual(res, tensors) @dtypes(*torch.testing.get_all_dtypes()) def test_float_scalar(self, device, dtype): - tensors = [torch.zeros(10, 10, device=device, dtype=dtype) for _ in range(10)] - float_scalar = 1. 
- - # float scalar + integral tensor will result in float tensor - if dtype in [torch.uint8, torch.int8, torch.int16, - torch.int32, torch.int64, torch.bool]: - expected = [torch.ones(10, 10, device=device, dtype=torch.float32) for _ in range(10)] - else: - expected = [torch.ones(10, 10, device=device, dtype=dtype) for _ in range(10)] - - res = torch._foreach_add(tensors, float_scalar) - self.assertEqual(res, expected) + for N in [30, 300]: + for foreach_bin_op, foreach_bin_op_, torch_bin_op in zip(self.foreach_bin_ops, + self.foreach_bin_ops_, + self.torch_bin_ops): + tensors = self._get_test_data(device, dtype, N) + scalar = 3.3 + + # Mimics cuda kernel dtype flow. With fp16/bf16 input, runs in fp32 and casts output back to fp16/bf16. + control_dtype = torch.float32 if (self.device_type == 'cuda' and + (dtype is torch.float16 or dtype is torch.bfloat16)) else dtype + expected = [torch_bin_op(t.to(dtype=control_dtype), + scalar) for t in tensors] + if (dtype is torch.float16 or dtype is torch.bfloat16): + expected = [e.to(dtype=dtype) for e in expected] + + if dtype == torch.bool: + if foreach_bin_op == torch._foreach_sub: + with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator, with two bool"): + foreach_bin_op_(tensors, scalar) + + with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator, with two bool"): + foreach_bin_op(tensors, scalar) + return + + res = foreach_bin_op(tensors, scalar) + if (dtype is torch.float16 or dtype is torch.bfloat16) and TEST_WITH_ROCM: + self.assertEqual(res, expected, atol=1.e-3, rtol=self.dtype_precisions[dtype][0]) + else: + self.assertEqual(res, expected) + + if dtype in torch.testing.integral_types(): + with self.assertRaisesRegex(RuntimeError, "result type Float can't be cast to the desired output type"): + foreach_bin_op_(tensors, scalar) + return + + foreach_bin_op_(tensors, scalar) + if (dtype is torch.float16 or dtype is torch.bfloat16) and TEST_WITH_ROCM: + self.assertEqual(tensors, expected, atol=1.e-3, rtol=self.dtype_precisions[dtype][0]) + else: + self.assertEqual(tensors, expected) - if dtype in [torch.uint8, torch.int8, torch.int16, - torch.int32, torch.int64, torch.bool]: - self.assertRaises(RuntimeError, lambda: torch._foreach_add_(tensors, float_scalar)) - else: - torch._foreach_add_(tensors, float_scalar) - self.assertEqual(res, tensors) + @dtypes(*torch.testing.get_all_dtypes()) + def test_float_scalarlist(self, device, dtype): + for N in [30, 300]: + for foreach_bin_op, foreach_bin_op_, torch_bin_op in zip(self.foreach_bin_ops_sl, + self.foreach_bin_ops_sl_, + self.torch_bin_ops): + tensors = self._get_test_data(device, dtype, N) + scalars = [1.1 for _ in range(N)] + + # If incoming dtype is float16 or bfloat16, runs in float32 and casts output back to dtype. 
+ control_dtype = torch.float32 if (self.device_type == 'cuda' and + (dtype is torch.float16 or dtype is torch.bfloat16)) else dtype + expected = [torch_bin_op(t.to(dtype=control_dtype), + s) for t, s in zip(tensors, scalars)] + if (dtype is torch.float16 or dtype is torch.bfloat16): + expected = [e.to(dtype=dtype) for e in expected] + + # we dont support bool and complex types on CUDA for now + if (dtype in torch.testing.get_all_complex_dtypes() or dtype == torch.bool) and self.device_type == 'cuda': + with self.assertRaisesRegex(RuntimeError, "not implemented for"): + foreach_bin_op_(tensors, scalars) + + with self.assertRaisesRegex(RuntimeError, "not implemented for"): + foreach_bin_op(tensors, scalars) + return + + res = foreach_bin_op(tensors, scalars) + + if dtype == torch.bool: + # see TODO[Fix scalar list] + self.assertEqual(res, [torch_bin_op(t.to(torch.float32), s) for t, s in zip(tensors, scalars)]) + + with self.assertRaisesRegex(RuntimeError, "result type Float can't be cast to the desired output type"): + foreach_bin_op_(tensors, scalars) + return + + if dtype in torch.testing.integral_types() and self.device_type == 'cuda': + # see TODO[Fix scalar list] + self.assertEqual(res, [e.to(dtype) for e in expected]) + + foreach_bin_op_(tensors, scalars) + self.assertEqual(tensors, res) + return + else: + if (dtype is torch.float16 or dtype is torch.bfloat16) and TEST_WITH_ROCM: + self.assertEqual(res, expected, atol=1.e-3, rtol=self.dtype_precisions[dtype][0]) + else: + self.assertEqual(res, expected) + + if dtype in torch.testing.integral_types() and self.device_type == "cpu": + with self.assertRaisesRegex(RuntimeError, "result type Float can't be cast to the desired output type"): + foreach_bin_op_(tensors, scalars) + return + + foreach_bin_op_(tensors, scalars) + if (dtype is torch.float16 or dtype is torch.bfloat16) and TEST_WITH_ROCM: + self.assertEqual(tensors, expected, atol=1.e-3, rtol=self.dtype_precisions[dtype][0]) + else: + self.assertEqual(tensors, expected) @dtypes(*torch.testing.get_all_dtypes()) def test_complex_scalar(self, device, dtype): - tensors = [torch.zeros(10, 10, device=device, dtype=dtype) for _ in range(10)] - complex_scalar = 3 + 5j + for N in [30, 300]: + for foreach_bin_op, foreach_bin_op_, torch_bin_op in zip(self.foreach_bin_ops, + self.foreach_bin_ops_, + self.torch_bin_ops): + tensors = self._get_test_data(device, dtype, N) + scalar = 3 + 5j + expected = [torch_bin_op(t, scalar) for t in tensors] + + if dtype == torch.bool: + if foreach_bin_op == torch._foreach_sub: + with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator, with two bool"): + foreach_bin_op_(tensors, scalar) + + with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator, with two bool"): + foreach_bin_op(tensors, scalar) + return + + if dtype in torch.testing.get_all_fp_dtypes(include_half=True, include_bfloat16=True) and \ + self.device_type == 'cuda': + with self.assertRaisesRegex(RuntimeError, "value cannot be converted to type"): + foreach_bin_op_(tensors, scalar) + + with self.assertRaisesRegex(RuntimeError, "value cannot be converted to type"): + foreach_bin_op(tensors, scalar) + return + + res = foreach_bin_op(tensors, scalar) + self.assertEqual(res, expected) + + if dtype not in [torch.complex64, torch.complex128]: + with self.assertRaisesRegex(RuntimeError, "can't be cast to the desired output type"): + foreach_bin_op_(tensors, scalar) + else: + foreach_bin_op_(tensors, scalar) + self.assertEqual(res, tensors) - # bool tensor + 1 will result 
in int64 tensor - expected = [torch.add(complex_scalar, torch.zeros(10, 10, device=device, dtype=dtype)) for _ in range(10)] - - if dtype in [torch.float16, torch.float32, torch.float64, torch.bfloat16] and device == 'cuda:0': - # value cannot be converted to dtype without overflow: - self.assertRaises(RuntimeError, lambda: torch._foreach_add_(tensors, complex_scalar)) - self.assertRaises(RuntimeError, lambda: torch._foreach_add(tensors, complex_scalar)) - return - - res = torch._foreach_add(tensors, complex_scalar) - self.assertEqual(res, expected) - - if dtype not in [torch.complex64, torch.complex128]: - self.assertRaises(RuntimeError, lambda: torch._foreach_add_(tensors, complex_scalar)) - else: - torch._foreach_add_(tensors, complex_scalar) - self.assertEqual(res, tensors) + @dtypes(*torch.testing.get_all_dtypes()) + def test_complex_scalarlist(self, device, dtype): + for N in [30, 300]: + for foreach_bin_op, foreach_bin_op_, torch_bin_op in zip(self.foreach_bin_ops_sl, + self.foreach_bin_ops_sl_, + self.torch_bin_ops): + tensors = self._get_test_data(device, dtype, N) + scalars = [3 + 5j for _ in range(N)] + expected = [torch_bin_op(t, s) for t, s in zip(tensors, scalars)] + + if dtype == torch.bool: + if foreach_bin_op == torch._foreach_sub: + with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator, with two bool"): + foreach_bin_op_(tensors, scalar) + + with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator, with two bool"): + foreach_bin_op(tensors, scalar) + return + + with self.assertRaisesRegex(TypeError, "argument 'scalars' must be tuple of floats"): + res = foreach_bin_op(tensors, scalars) + + with self.assertRaisesRegex(TypeError, "argument 'scalars' must be tuple of floats"): + foreach_bin_op_(tensors, scalars) @dtypes(*torch.testing.get_all_dtypes()) def test_bool_scalar(self, device, dtype): - tensors = [torch.zeros(10, 10, device=device, dtype=dtype) for _ in range(10)] - bool_scalar = True - - expected = [torch.ones(10, 10, device=device, dtype=dtype) for _ in range(10)] - - res = torch._foreach_add(tensors, bool_scalar) - self.assertEqual(res, expected) + for N in [30, 300]: + for foreach_bin_op, foreach_bin_op_, torch_bin_op in zip(self.foreach_bin_ops, + self.foreach_bin_ops_, + self.torch_bin_ops): + tensors = self._get_test_data(device, dtype, N) + scalar = True + + if dtype == torch.bool: + expected = [torch_bin_op(t, scalar) for t in tensors] + res = foreach_bin_op(tensors, scalar) + + foreach_bin_op_(tensors, scalar) + self.assertEqual(tensors, res) + return + + if foreach_bin_op == torch._foreach_sub and self.device_type == "cpu": + with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator"): + res = foreach_bin_op(tensors, scalar) + + with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator"): + foreach_bin_op_(tensors, scalar) + elif foreach_bin_op == torch._foreach_sub and self.device_type == 'cuda': + res = foreach_bin_op(tensors, scalar) + self.assertEqual(res, foreach_bin_op(tensors, 1)) + + foreach_bin_op_(tensors, scalar) + self.assertEqual(tensors, res) + else: + expected = [torch_bin_op(t, scalar) for t in tensors] + res = foreach_bin_op(tensors, scalar) + + # TODO[type promotion]: Fix once type promotion is enabled. 
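# The branches that follow hinge on standard type-promotion behavior: an out-of-place op on
# an integer tensor with a float scalar promotes to a floating result, while the in-place
# variant must keep the original dtype and therefore raises. A condensed illustration:
import torch

t = torch.ones(3, dtype=torch.int64)
print(torch.add(t, 3.3).dtype)  # torch.float32: promoted out-of-place result
try:
    t.add_(3.3)                 # cannot store a Float result in an int64 tensor in place
except RuntimeError as e:
    print("raised:", e)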
+ if dtype in torch.testing.integral_types() and self.device_type == 'cuda': + self.assertEqual(res, [e.to(dtype) for e in expected]) + else: + self.assertEqual(res, expected) + + if dtype in torch.testing.integral_types(): + if foreach_bin_op == torch._foreach_div and self.device_type == "cpu": + with self.assertRaisesRegex(RuntimeError, "result type Float can't be cast to the desired "): + foreach_bin_op_(tensors, scalar) + else: + foreach_bin_op_(tensors, scalar) + self.assertEqual(tensors, res) + else: + foreach_bin_op_(tensors, scalar) + self.assertEqual(tensors, expected) - torch._foreach_add_(tensors, bool_scalar) - self.assertEqual(res, tensors) + @dtypes(*torch.testing.get_all_dtypes()) + def test_bool_scalarlist(self, device, dtype): + for N in [30, 300]: + for foreach_bin_op, foreach_bin_op_, torch_bin_op in zip(self.foreach_bin_ops_sl, + self.foreach_bin_ops_sl_, + self.torch_bin_ops): + tensors = self._get_test_data(device, dtype, N) + scalars = [True for _ in range(N)] + + if dtype == torch.bool: + if self.device_type == 'cuda': + with self.assertRaisesRegex(RuntimeError, "not implemented for"): + foreach_bin_op(tensors, scalars) + + with self.assertRaisesRegex(RuntimeError, "not implemented for"): + foreach_bin_op_(tensors, scalars) + return + else: + if foreach_bin_op == torch._foreach_sub_scalar_list: + with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator, with a bool tensor"): + foreach_bin_op_(tensors, scalars) + + with self.assertRaisesRegex(RuntimeError, "Subtraction, the `-` operator, with a bool tensor"): + foreach_bin_op(tensors, scalars) + else: + with self.assertRaisesRegex(RuntimeError, "result type Float can't be cast to the desired"): + foreach_bin_op_(tensors, scalars) + + res = foreach_bin_op(tensors, scalars) + for r in res: + self.assertTrue(r.dtype == torch.float32) + else: + # we dont support bool and complex types on CUDA for now + if (dtype in torch.testing.get_all_complex_dtypes()) and self.device_type == 'cuda': + with self.assertRaisesRegex(RuntimeError, "not implemented for"): + foreach_bin_op_(tensors, scalars) + + with self.assertRaisesRegex(RuntimeError, "not implemented for"): + foreach_bin_op(tensors, scalars) + return + + if foreach_bin_op == torch._foreach_sub_scalar_list: + if self.device_type == "cpu": + # see TODO[Fix scalar list] + res = foreach_bin_op(tensors, scalars) + if dtype in torch.testing.integral_types(): + self.assertEqual(res, [r.to(torch.float32) for r in [torch_bin_op(t, 1) for t in tensors]]) + + with self.assertRaisesRegex(RuntimeError, "result type Float can't be cast to the "): + foreach_bin_op_(tensors, scalars) + else: + self.assertEqual(res, [torch_bin_op(t, 1) for t in tensors]) + foreach_bin_op_(tensors, scalars) + self.assertEqual(res, tensors) + else: + # see TODO[Fix scalar list] + res = foreach_bin_op(tensors, scalars) + if dtype in torch.testing.integral_types(): + self.assertEqual(res, [r.to(dtype) for r in [torch_bin_op(t, 1) for t in tensors]]) + else: + self.assertEqual(res, [torch_bin_op(t, 1) for t in tensors]) + + foreach_bin_op_(tensors, scalars) + self.assertEqual(res, tensors) + else: + if self.device_type == "cpu": + expected = [torch_bin_op(t, s) for t, s in zip(tensors, scalars)] + res = foreach_bin_op(tensors, scalars) + + # see TODO[Fix scalar list] + if dtype in torch.testing.integral_types(): + self.assertEqual(res, [e.to(torch.float32) for e in expected]) + else: + self.assertEqual(res, expected) + + if dtype in torch.testing.integral_types(): + with 
self.assertRaisesRegex(RuntimeError, "result type Float can't be cast to the desired "): + foreach_bin_op_(tensors, scalars) + else: + foreach_bin_op_(tensors, scalars) + self.assertEqual(tensors, expected) + else: + expected = [torch_bin_op(t, s) for t, s in zip(tensors, scalars)] + res = foreach_bin_op(tensors, scalars) + + if dtype in torch.testing.integral_types(): + self.assertEqual(res, [e.to(dtype) for e in expected]) + else: + self.assertEqual(res, expected) + + foreach_bin_op_(tensors, scalars) + self.assertEqual(res, tensors) @dtypes(*torch.testing.get_all_dtypes()) def test_add_with_different_size_tensors(self, device, dtype): @@ -318,13 +689,25 @@ def test_div_list(self, device, dtype): self.skipTest("Skipped! See https://github.com/pytorch/pytorch/issues/44489") return - self._test_bin_op_list(device, dtype, torch._foreach_div, torch._foreach_div_, torch.div) + for N in [30, 300]: + tensors1 = self._get_test_data(device, dtype, N) + + if dtype in [torch.bfloat16, torch.bool, torch.float16]: + tensors2 = [torch.zeros(N, N, device=device, dtype=dtype).add(2) for _ in range(N)] + else: + tensors2 = self._get_test_data(device, dtype, N) + + expected = [torch.div(tensors1[i], tensors2[i]) for i in range(N)] + res = torch._foreach_div(tensors1, tensors2) + torch._foreach_div_(tensors1, tensors2) + self.assertEqual(res, tensors1) + self.assertEqual(tensors1, res) def test_bin_op_list_error_cases(self, device): tensors1 = [] tensors2 = [] - for bin_op in self.bin_ops: + for bin_op in self.foreach_bin_ops + self.foreach_bin_ops_: # Empty lists with self.assertRaises(RuntimeError): bin_op(tensors1, tensors2) diff --git a/test/test_function_schema.py b/test/test_function_schema.py index f2ad2290d326..5a1527373478 100644 --- a/test/test_function_schema.py +++ b/test/test_function_schema.py @@ -14,90 +14,77 @@ def test_serialize_and_deserialize(self): self.assertEqual(parsed_schema, schema) self.assertTrue(parsed_schema.is_backward_compatible_with(schema)) - def test_backward_compatible_args(self): - old_schema = parse_schema('any(Tensor self, int dim) -> Tensor') - new_schema = parse_schema('any(Tensor self, int? dim) -> Tensor') + def test_backward_compatible_structure(self): + old_schema = parse_schema('any.over(Tensor self, *, Tensor b) -> Tensor') + # BC: A new schema without changes. 
+ new_schema = parse_schema('any.over(Tensor self, *, Tensor b) -> Tensor') self.assertTrue(new_schema.is_backward_compatible_with(old_schema)) - self.assertFalse(old_schema.is_backward_compatible_with(new_schema)) - new_schema = parse_schema('any(Tensor self, int dim=5) -> Tensor') - self.assertTrue(new_schema.is_backward_compatible_with(old_schema)) - self.assertFalse(old_schema.is_backward_compatible_with(new_schema)) - new_schema = parse_schema('any(Tensor self, int dim, bool keepdim=False) -> Tensor') - self.assertTrue(new_schema.is_backward_compatible_with(old_schema)) - self.assertFalse(old_schema.is_backward_compatible_with(new_schema)) - - def test_backward_compatible_kwargs(self): - old_schema = parse_schema('any(Tensor self, *, Tensor out) -> Tensor') - new_schema = parse_schema('any(Tensor self, *, bool extra1=True, Tensor out, bool extra2=False) -> Tensor') - self.assertTrue(new_schema.is_backward_compatible_with(old_schema)) - self.assertFalse(old_schema.is_backward_compatible_with(new_schema)) - new_schema = parse_schema('any(Tensor self, Tensor out) -> Tensor') - self.assertTrue(new_schema.is_backward_compatible_with(old_schema)) - self.assertFalse(old_schema.is_backward_compatible_with(new_schema)) - - def test_backward_compatible_ret(self): - old_schema = parse_schema('any(Tensor self) -> Tensor?') - new_schema = parse_schema('any(Tensor self) -> Tensor') - self.assertTrue(new_schema.is_backward_compatible_with(old_schema)) - self.assertFalse(old_schema.is_backward_compatible_with(new_schema)) - - def test_backward_incompatible_name(self): - old_schema = parse_schema('any(Tensor self, int dim, bool keepdim=False) -> Tensor') - new_schema = parse_schema('any_(Tensor self, int dim, bool keepdim=False) -> Tensor') + # No-BC: A new schema with different name. + new_schema = parse_schema('any_.over(Tensor self, *, Tensor b) -> Tensor') self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) - self.assertFalse(old_schema.is_backward_compatible_with(new_schema)) - - def test_backward_incompatible_vararg(self): - old_schema = parse_schema('any(Tensor self, int dim, bool keepdim=False) -> Tensor') - new_schema = parse_schema('any(Tensor self, int dim, bool keepdim=False, ...) -> Tensor') + # No-BC: A new schema with different overload name. + new_schema = parse_schema('any.other(Tensor self, *, Tensor b) -> Tensor') self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) - self.assertFalse(old_schema.is_backward_compatible_with(new_schema)) - - def test_backward_incompatible_returns(self): - old_schema = parse_schema('any(Tensor self, int dim, bool keepdim=False) -> Tensor') - new_schema = parse_schema('any(Tensor self, int dim, bool keepdim=False) -> (Tensor, ...)') + # No-BC: A new schema that adds vararg. + new_schema = parse_schema('any.over(Tensor self, *, Tensor b, ...) -> Tensor') self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) - self.assertFalse(old_schema.is_backward_compatible_with(new_schema)) - new_schema = parse_schema('any(Tensor self, int dim, bool keepdim=False) -> int') + # No-BC: A new schema with different number of outputs. 
+ new_schema = parse_schema('any.over(Tensor self, *, Tensor b) -> (Tensor, Tensor)')
self.assertFalse(new_schema.is_backward_compatible_with(old_schema))
- self.assertFalse(old_schema.is_backward_compatible_with(new_schema))
- new_schema = parse_schema('any(Tensor self, int dim, bool keepdim=False) -> Tensor?')
+
+ def test_backward_compatible_outputs(self):
+ old_schema = parse_schema('any.over(Tensor self, *, Tensor b) -> Tensor')
+ # No-BC: A new schema where the output becomes optional.
+ new_schema = parse_schema('any.over(Tensor self, *, Tensor b) -> Tensor?')
self.assertFalse(new_schema.is_backward_compatible_with(old_schema))
+ # BC: (the opposite case) A schema where the output is no longer of optional type.
self.assertTrue(old_schema.is_backward_compatible_with(new_schema))
- new_schema = parse_schema('any(Tensor self, int dim, bool keepdim=False) -> (Tensor, Tensor)')
+ # No-BC: A new schema with a different output type.
+ new_schema = parse_schema('any.over(Tensor self, *, Tensor b) -> int')
self.assertFalse(new_schema.is_backward_compatible_with(old_schema))
- self.assertFalse(old_schema.is_backward_compatible_with(new_schema))
- new_schema = parse_schema('any(Tensor self, int dim, bool keepdim=False) -> Tensor out')
+ # No-BC: A new schema where the output gains a name.
+ new_schema = parse_schema('any.over(Tensor self, *, Tensor b) -> Tensor out')
self.assertFalse(new_schema.is_backward_compatible_with(old_schema))
- self.assertFalse(old_schema.is_backward_compatible_with(new_schema))
- def test_backward_incompatible_args(self):
- old_schema = parse_schema('any(Tensor self, int[] dims, bool keepdim=False) -> Tensor')
- new_schema = parse_schema('any(Tensor s, int[] dims, bool keepdim=False) -> Tensor')
+ def test_backward_compatible_arguments(self):
+ old_schema = parse_schema('any(Tensor self, *, Tensor b, int c) -> Tensor')
+ # No-BC: A new schema with fewer arguments.
+ new_schema = parse_schema('any(Tensor self, *, Tensor b) -> Tensor')
self.assertFalse(new_schema.is_backward_compatible_with(old_schema))
- self.assertFalse(old_schema.is_backward_compatible_with(new_schema))
- new_schema = parse_schema('any(Tensor self, int[3] dims, bool keepdim=False) -> Tensor')
+ # No-BC: A new schema with more arguments, appended, but no default value.
+ new_schema = parse_schema('any(Tensor self, *, Tensor b, int c, int d) -> Tensor')
self.assertFalse(new_schema.is_backward_compatible_with(old_schema))
- self.assertFalse(old_schema.is_backward_compatible_with(new_schema))
- new_schema = parse_schema('any(Tensor self, int[](a) dims, bool keepdim=False) -> Tensor')
+ # BC: A new schema with more arguments, appended, that have a default value.
+ new_schema = parse_schema('any(Tensor self, *, Tensor b, int c, int d=1) -> Tensor')
+ self.assertTrue(new_schema.is_backward_compatible_with(old_schema))
+ # No-BC: A new schema with more arguments, not appended, that have a default value.
+ new_schema = parse_schema('any(Tensor self, int d=1, *, Tensor b, int c) -> Tensor')
self.assertFalse(new_schema.is_backward_compatible_with(old_schema))
+ # BC: A new schema where an old kwarg becomes positional.
+ new_schema = parse_schema('any(Tensor self, Tensor b, *, int c) -> Tensor')
+ self.assertTrue(new_schema.is_backward_compatible_with(old_schema))
+ # BC: (the opposite case) A new schema where an old positional argument becomes a kwarg.
self.assertFalse(old_schema.is_backward_compatible_with(new_schema))
- new_schema = parse_schema('any(Tensor self, int dims, bool keepdim=False) -> Tensor')
- self.assertFalse(new_schema.is_backward_compatible_with(old_schema))
+ # BC: A new schema where all old kwargs become positional.
+ new_schema = parse_schema('any(Tensor self, Tensor b, int c) -> Tensor')
+ self.assertTrue(new_schema.is_backward_compatible_with(old_schema))
+ # BC: (the opposite case) A new schema where all old positional arguments become kwargs.
self.assertFalse(old_schema.is_backward_compatible_with(new_schema))
- new_schema = parse_schema('any(Tensor self, int[] dim, bool keepdim=False, bool? extra=None) -> Tensor')
+ # No-BC: A new schema where old kwargs appear in a different order.
+ new_schema = parse_schema('any(Tensor self, *, int c, Tensor b) -> Tensor')
self.assertFalse(new_schema.is_backward_compatible_with(old_schema))
- self.assertFalse(old_schema.is_backward_compatible_with(new_schema))
-
- def test_backward_incompatible_kwargs(self):
- old_schema = parse_schema('any(Tensor self, int[] dims, *, bool keepdim=False) -> Tensor')
- new_schema = parse_schema('any(Tensor self, int[] dims, *, bool keepdim) -> Tensor')
+ # BC: A new schema where an argument becomes optional.
+ new_schema = parse_schema('any(Tensor self, *, Tensor b, int? c) -> Tensor')
+ self.assertTrue(new_schema.is_backward_compatible_with(old_schema))
+ # BC: A new schema where an argument gains a default value.
+ new_schema = parse_schema('any(Tensor self, *, Tensor b, int c=1) -> Tensor')
+ self.assertTrue(new_schema.is_backward_compatible_with(old_schema))
+ # No-BC: A new schema where an argument is "renamed".
+ new_schema = parse_schema('any(Tensor self, *, Tensor b, int renamed) -> Tensor')
self.assertFalse(new_schema.is_backward_compatible_with(old_schema))
- self.assertTrue(old_schema.is_backward_compatible_with(new_schema))
- new_schema = parse_schema('any(Tensor self, int[] dims, *, bool keepdim=False, bool extra) -> Tensor')
+ # No-BC: A new schema where an argument's type changes to an incompatible type.
+ new_schema = parse_schema('any(Tensor self, *, Tensor b, int[] c) -> Tensor') self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) - self.assertFalse(old_schema.is_backward_compatible_with(new_schema)) - if __name__ == '__main__': run_tests() diff --git a/test/test_fx.py b/test/test_fx.py index 41607d64cbcc..1451c5efe5cb 100644 --- a/test/test_fx.py +++ b/test/test_fx.py @@ -6,6 +6,10 @@ import copy from pathlib import Path from torch.fx import symbolic_trace, Proxy, Node, GraphModule, Tracer, Graph +from torch.fx.experimental import GraphManipulation +from torch.fx.experimental import shape_prop +from torch.fx.experimental.Partitioner import DAG, Partitioner +from torch.fx.experimental.subgraph_creation_example import split_module from torch.fx.proxy import TraceError @@ -26,6 +30,9 @@ class SimpleTest(torch.nn.Module): def forward(self, x): return torch.relu(x + 3.0) +def a_non_torch_leaf(a, b): + return a + b + class TestFX(JitTestCase): def checkGraphModule(self, m: torch.nn.Module, args, kwargs=None): """Check that an nn.Module's results match the GraphModule version @@ -34,6 +41,7 @@ def checkGraphModule(self, m: torch.nn.Module, args, kwargs=None): kwargs = kwargs if kwargs else {} ref_outs = m(*args, **kwargs) gm = symbolic_trace(m) + gm.graph.lint(gm) test_outs = gm(*args, **kwargs) self.assertEqual(ref_outs, test_outs) @@ -79,6 +87,17 @@ def forward(self, A, b=4, *args, c=5, **kwargs): t = T() symbolic_trace(t) + def test_custom_import(self): + graph = torch.fx.Graph() + a = graph.placeholder('x') + b = graph.placeholder('y') + c = graph.call_function(a_non_torch_leaf, (a, b)) + d = graph.call_function(torch.sin, (c,)) + graph.output(d) + gm = GraphModule(torch.nn.Module(), graph) + x, y = torch.rand(1), torch.rand(1) + self.assertEqual(torch.sin(x + y), gm(x, y)) + def test_args_kwargs(self): class T(torch.nn.Module): def forward(self, *args, **kwargs): @@ -165,8 +184,9 @@ def forward(self, x): mrm = MyReluMod() sym = NoLeafModulesTracer().trace(mrm) - for node in sym.graph.nodes: + for node in sym.nodes: self.assertNotEqual(node.op, 'call_module') + sym.lint(sym) def test_graph_edit_with_proxy(self): class M(torch.nn.Module): @@ -174,12 +194,49 @@ def forward(self, a, b): return a + b m = M() g = symbolic_trace(m).graph - t = Proxy(g.result) + new_g = torch.fx.Graph() + val_map : Dict[Node, Node] = {} + output_val = new_g.graph_copy(g, val_map) + t = Proxy(output_val) # test that we can use proxy objects to generate more graph code later for things that do not need to work with modules. - g.output((t + t).node) - gm = GraphModule(m, g) + new_g.output((t + t).node) + gm = GraphModule(m, new_g) + gm.graph.lint(gm) self.assertEqual(gm(3, 4), 14) + def test_graph_unique_names(self): + class M(torch.nn.Module): + def forward(self, a, b): + return a + b + m = M() + g = symbolic_trace(m).graph + new_g = torch.fx.Graph() + val_map : Dict[Node, Node] = {} + output_val = new_g.graph_copy(g, val_map) + t = Proxy(output_val) + # test that we can use proxy objects to generate more graph code later for things that do not need to work with modules. 
+ new_g.output((t + t).node) + gm = GraphModule(m, new_g) + seen_names : Set[str] = set() + for node in gm.graph.nodes: + assert node.name not in seen_names + seen_names.add(node.name) + + def test_graph_unique_names_manual(self): + graph : torch.fx.Graph = torch.fx.Graph() + a : torch.fx.Node = graph.create_node('placeholder', 'x') + b : torch.fx.Node = graph.create_node('call_module', 'linear_mod', args=(a,), name='foo_1_1') + c : torch.fx.Node = graph.create_node('get_attr', 'y_attr', name='foo_1') + d : torch.fx.Node = graph.create_node('call_function', operator.add, args=(b, c)) + graph.output(d) + graph2 = torch.fx.Graph() + val_map : Dict[Node, Node] = {} + graph2.graph_copy(graph, val_map) + seen_names : Set[str] = set() + for node in graph2.nodes: + assert node.name not in seen_names + seen_names.add(node.name) + @skipIfNoTorchVision def test_resnet(self): resnet = resnet18() @@ -202,6 +259,7 @@ def test_resnet(self): quantizer.observe((torch.rand(1, 3, 224, 224),)) qgraph = quantizer.quantize() + qgraph.graph.lint(qgraph) qgraph_script = torch.jit.script(qgraph) d = qgraph(ip) @@ -271,6 +329,7 @@ def lower_to_elementwise_interpreter(orig_mod : torch.nn.Module) -> torch.nn.Mod operator.mul : "mul" } + output_node : Optional[Node] = None # For each instruction, create a triple # (instruction_name : str, inputs : List[str], output : str) # to feed into the C++ interpreter @@ -297,9 +356,12 @@ def lower_to_elementwise_interpreter(orig_mod : torch.nn.Module) -> torch.nn.Mod else: arg_names.append(arg.name) instructions.append((target_to_name[target], arg_names, out_name)) - + elif n.op == 'output': + if output_node is not None: + raise RuntimeError('Multiple output nodes!') + output_node = n else: - raise RuntimeError('Unsupported opcode' + n.op) + raise RuntimeError('Unsupported opcode ' + n.op) interpreter = torch.classes._TorchScriptTesting._ElementwiseInterpreter() # Load constants @@ -310,7 +372,8 @@ def lower_to_elementwise_interpreter(orig_mod : torch.nn.Module) -> torch.nn.Mod # Load instructions interpreter.set_instructions(instructions) # Specify name for single output - interpreter.set_output_name(mod.graph.result.name) + assert isinstance(output_node.args[0], torch.fx.Node) + interpreter.set_output_name(output_node.args[0].name) # ===== Stage 3: Create a wrapper GraphModule around the interpreter ===== class WrapperModule(torch.nn.Module): @@ -345,6 +408,8 @@ def __init__(self, interpreter): # Register output graph.output(output_node) + graph.lint(wrapper) + # Return final GraphModule!!! 
return GraphModule(wrapper, graph) @@ -376,6 +441,7 @@ def forward(self, a): m = M() m_g = symbolic_trace(m) + m_g.graph.lint(m_g) for node in m_g.graph.nodes: self.assertTrue(node.name != "getattr") @@ -392,7 +458,8 @@ def forward(self, a, b): return a + b m = M() - g = TaggingTracer().trace(m).graph + g = TaggingTracer().trace(m) + g.lint(m) for n in g.nodes: self.assertTrue(hasattr(n, 'tag')) self.assertEqual(n.tag, 'foo') @@ -420,6 +487,7 @@ def forward(self, x): wfq = WrapperForQualname() traced2 = symbolic_trace(wfq) + traced2.graph.lint(traced2) traced2(torch.rand(4, 4)) def test_symbolic_trace_sequential(self): @@ -433,6 +501,7 @@ def forward(self, x): Simple() ) traced = symbolic_trace(seq) + traced.graph.lint(traced) x = torch.rand(3, 4) self.assertEqual(traced(x), seq(x)) @@ -443,6 +512,7 @@ def forward(self, x): ct = ConstTensor() traced = symbolic_trace(ct) + traced.graph.lint(traced) traced(torch.rand(4, 4)) def test_pickle_graphmodule(self): @@ -456,22 +526,28 @@ def forward(self, x): n = Nested() traced = symbolic_trace(n) + traced.graph.lint(traced) pickled = pickle.dumps(traced) loaded = pickle.loads(pickled) + loaded.graph.lint(loaded) x = torch.rand(3, 4) self.assertEqual(loaded(x), traced(x)) def test_deepcopy_graphmodule_with_transform(self): st = SimpleTest() traced = symbolic_trace(st) + traced.graph.lint(traced) def transform(traced): - new_graph = copy.deepcopy(traced.graph) + new_graph = torch.fx.Graph() + val_map : Dict[Node, Node] = {} + output_value = new_graph.graph_copy(traced.graph, val_map) relu_out = new_graph.create_node( - op='call_method', target='neg', args=(new_graph.result,), kwargs={}) + op='call_method', target='neg', args=(output_value,), kwargs={}) new_graph.output(relu_out) return GraphModule(traced, new_graph) transformed = transform(traced) + transformed.graph.lint(transformed) copied = copy.deepcopy(transformed) self.assertNotEqual(id(type(transformed)), id(type(copied))) x = torch.randn(3, 4) @@ -497,7 +573,9 @@ def forward(self, x): baz = Baz() traced = symbolic_trace(baz) + traced.graph.lint(traced) copied = copy.deepcopy(traced) + copied.graph.lint(copied) def test_unpack_list_better_error(self): class SomeArgs(torch.nn.Module): @@ -543,12 +621,32 @@ def forward(self, a): input = torch.randn(3) ref_out = m(input) gm = symbolic_trace(m) + gm.graph.lint(gm) out = gm(input) self.assertEqual(out, ref_out) + def test_replace_target_nodes_with(self): + class testModule(torch.nn.Module): + def forward(self, a, b): + return a + b + m = testModule() + traced = symbolic_trace(m) + input1 = torch.randn(1) + input2 = torch.randn(1) + assert (input1 + input2) == traced(input1, input2) + GraphManipulation.replace_target_nodes_with( + fx_module=traced, + old_op="call_function", + old_target=operator.add, + new_op="call_function", + new_target=operator.mul, + ) + assert (input1 * input2) == traced(input1, input2) + def test_pretty_print(self): st = SimpleTest() traced = symbolic_trace(st) + traced.graph.lint(traced) printed = str(traced) assert 'GraphModuleImpl()' in printed assert 'torch.relu' in printed @@ -559,6 +657,7 @@ def forward(self, x): return torch.squeeze(x + 3.0, dim=2) st = KwargPrintTest() traced = symbolic_trace(st) + traced.graph.lint(traced) stringed = str(traced.graph) for s in ['args', 'kwargs', 'uses']: assert s in stringed @@ -575,6 +674,7 @@ def test_graph_fns(self): mod.linear = torch.nn.Linear(3, 4) mod.bias = torch.rand(4) gm = GraphModule(mod, g) + gm.graph.lint(gm) input = torch.rand(3) r = gm(input) ref = 
torch.sin(mod.linear(input) + mod.bias) @@ -592,6 +692,7 @@ def test_construct_root_dict(self): add_param : torch.Tensor = torch.rand(3, 4) gm : torch.fx.GraphModule = torch.fx.GraphModule( {'foo.bar.baz': linear_mod, 'zip.zap.zam' : add_param}, graph) + gm.graph.lint(gm) assert 'self.foo.bar.baz' in gm.code @@ -600,6 +701,252 @@ def test_construct_root_dict(self): ref_out : torch.Tensor = linear_mod(x) + add_param self.assertEqual(out, ref_out) + def test_symbolic_trace_assert(self): + message = "assert_foobar" + + class AssertsTensorShape(torch.nn.Module): + def forward(self, x): + torch.Assert(x.shape[1] > 4, message) + return x + + m = AssertsTensorShape() + # verify traceability + traced = symbolic_trace(m) + # verify assertion on traced model works correctly at runtime + traced(torch.rand(4, 5)) + with self.assertRaisesRegex(AssertionError, message): + traced(torch.rand(4, 3)) + + def test_get_all_users_of(self): + graph : torch.fx.Graph = torch.fx.Graph() + a : torch.fx.Node = graph.create_node('placeholder', 'x') + b : torch.fx.Node = graph.create_node('call_module', 'linear_mod', args=(a,)) + c : torch.fx.Node = graph.create_node('get_attr', 'y_attr') + d : torch.fx.Node = graph.create_node('call_function', operator.add, args=(b, c)) + graph.output(d) + linear_mod : torch.nn.Module = torch.nn.Linear(3, 4) + add_param : torch.Tensor = torch.rand(3, 4) + gm : torch.fx.GraphModule = torch.fx.GraphModule( + {'linear_mod': linear_mod, 'y_attr' : add_param}, graph) + expected_uses: Dict[int, List[int]] = { + 0: [1], + 1: [3], + 2: [3], + 3: [4], + 4: [], + } + for i, node in enumerate(graph.nodes): + user_indexes = GraphManipulation.get_all_users_of(gm, i) + assert user_indexes == expected_uses[i] + + def test_copy_no_remap(self): + traced = symbolic_trace(SimpleTest()) + g = traced.graph + copied = torch.fx.Graph() + for node in g.nodes: + copied.node_copy(node) + with self.assertRaisesRegex(RuntimeError, 'does not belong to this Graph'): + copied.lint() + + def test_wrong_topo(self): + graph : torch.fx.Graph = torch.fx.Graph() + a : torch.fx.Node = graph.create_node('placeholder', 'x') + b : torch.fx.Node = graph.create_node('call_module', 'foo.bar.baz', args=(a,)) + c : torch.fx.Node = graph.create_node('get_attr', 'zip.zap.zam') + d : torch.fx.Node = graph.create_node('call_function', operator.add, args=(b, c)) + graph.output(d) + nodes = graph._nodes + nodes[2], nodes[3] = nodes[3], nodes[2] + with self.assertRaisesRegex(RuntimeError, 'was used before it has been defined'): + graph.lint() + + def test_example_shape_prop(self): + class TestCase(torch.nn.Module): + def __init__(self): + super().__init__() + self.attr = torch.randn(3, 4) + self.submod = torch.nn.Linear(4, 4) + + def forward(self, x): + return torch.neg(self.submod(x.relu() + self.attr)) + tc = TestCase() + tc_traced = symbolic_trace(tc) + ref_out = tc_traced(torch.rand(3, 4)) + shape_prop.ShapeProp(tc_traced).propagate(torch.rand(3, 4)) + + # Make sure we're testing all opcodes + opcodes = set() + output_shape : Optional[torch.Shape] = None + for node in tc_traced.graph.nodes: + opcodes.add(node.op) + if node.op == 'output': + output_shape = node.args[0].shape + self.assertEqual(opcodes, set(['placeholder', 'get_attr', 'call_function', 'call_method', + 'call_module', 'output'])) + + # Test shape propogation and make sure results match actual + self.assertEqual(output_shape, ref_out.shape) + + def test_find_single_partition(self): + class testModule(torch.nn.Module): + def forward(self, a, b): + return a + b + m = 
testModule() + traced = symbolic_trace(m) + partitioner = Partitioner() + devices = [{"name": "dev_0", "available_mem": float('inf')}] + dag = partitioner.partition_graph(traced, devices) + for node in traced.graph.nodes: + assert node.op == 'output' or node.partition_ids == [1] + nodes = traced.graph.nodes + res_dag = DAG() + res_dag.create_node(0, [], [1], [], []) + res_dag.create_node(1, [0], [], [nodes[0], nodes[1]], [nodes[2]]) + for r, d in zip(res_dag.nodes, dag.nodes): + assert(r.partition_id == d.partition_id) + assert(r.parents == d.parents) + assert(r.children == d.children) + assert(r.input_nodes == d.input_nodes) + assert(r.output_nodes == d.output_nodes) + + def test_subgraph_creation(self): + class MyModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.param = torch.nn.Parameter(torch.rand(3, 4)) + self.linear = torch.nn.Linear(4, 5) + + def forward(self, x, y): + z = self.linear(x + self.param).clamp(min=0.0, max=1.0) + w = self.linear(y).clamp(min=0.0, max=1.0) + return z + w + + # symbolically trace model + my_module = MyModule() + my_module_traced = symbolic_trace(my_module) + + # random mod partitioning + partition_counter = 0 + NPARTITIONS = 3 + + def mod_partition(node: Node): + nonlocal partition_counter + partition = partition_counter % NPARTITIONS + partition_counter = (partition_counter + 1) % NPARTITIONS + return partition + + # split module in module with submodules + module_with_submodules = split_module(my_module_traced, my_module, mod_partition) + + x = torch.rand(3, 4) + y = torch.rand(3, 4) + + orig_out = my_module_traced(x, y) + submodules_out = module_with_submodules(x, y) + + self.assertEqual(orig_out, submodules_out) + + @skipIfNoTorchVision + def test_replace_uses(self): + rn18 = resnet18() + + class LowerReluTracer(torch.fx.Tracer): + def is_leaf_module(self, m : torch.nn.Module, qualname : str): + if isinstance(m, torch.nn.ReLU): + return False + return super().is_leaf_module(m, qualname) + + rn18_traced = GraphModule(rn18, LowerReluTracer().trace(rn18)) + + to_erase = [] + for node in rn18_traced.graph.nodes: + if node.op == 'call_function' and node.target in [torch.relu, torch.nn.functional.relu]: + kwargs = node.kwargs + # Neg doesn't have in-place + kwargs.pop('inplace') + with torch.fx.graph.insert_before(node): + new_node = rn18_traced.graph.call_function( + the_function=torch.neg, args=node.args, kwargs=node.kwargs) + node.replace_all_uses_with(replace_with=new_node) + to_erase.append(node) + + for node in to_erase: + rn18_traced.graph.erase_node(node) + + def test_insertion_point(self): + graph : torch.fx.Graph = torch.fx.Graph() + x : torch.fx.Node = graph.create_node('placeholder', 'x') + b : torch.fx.Node = graph.create_node('call_function', target=torch.relu, args=(x,)) + output : torch.fx.Node = graph.output(b) + + with torch.fx.graph.insert_before(b): + neg : torch.fx.Node = graph.call_function(the_function=torch.neg, args=(x,)) + _, *relu_args = b.args + b.args = (neg, *relu_args) + + gm = torch.fx.GraphModule(torch.nn.Module(), graph) + + input = torch.randn(33, 44) + self.assertEqual(gm(input), torch.relu(torch.neg(input))) + + + def test_move_before(self): + graph : torch.fx.Graph = torch.fx.Graph() + x : torch.fx.Node = graph.create_node('placeholder', 'x') + b : torch.fx.Node = graph.create_node('call_function', target=torch.relu, args=(x,)) + output : torch.fx.Node = graph.output(b) + + neg : torch.fx.Node = graph.call_function(the_function=torch.neg, args=(x,)) + _, *relu_args = b.args + b.args = (neg, 
*relu_args) + graph.move_node_before(to_move=neg, before=b) + + gm = torch.fx.GraphModule(torch.nn.Module(), graph) + + input = torch.randn(33, 44) + self.assertEqual(gm(input), torch.relu(torch.neg(input))) + + def test_erase_node_error(self): + st = SimpleTest() + traced = symbolic_trace(st) + + for node in traced.graph.nodes: + # Test deleting with uses both in another Node and at the output + if node.target in [operator.add, torch.relu]: + with self.assertRaisesRegex(RuntimeError, 'but it still had .* uses in the graph!'): + traced.graph.erase_node(node) + + def test_find_uses(self): + graph = torch.fx.Graph() + x = torch.fx.Proxy(graph.placeholder('x')) + + y = torch.relu(x) + z = x + x + u = torch.neg(x) + graph.output((y + z + u).node) + graph.lint() + + uses_of_x = x.node.find_uses() + self.assertEqual(len(uses_of_x), 3) + expected_ops = ['relu', 'add', 'neg'] + for node, expected in zip(uses_of_x, expected_ops): + assert expected in node.name + + def test_multi_insert_point(self): + graph = torch.fx.Graph() + x = torch.fx.Proxy(graph.placeholder('x')) + relu = torch.relu(x) + + with torch.fx.graph.insert_before(relu.node): + y = torch.neg(x) + z = torch.tanh(y) + + graph.output((relu.node, z.node)) + graph.lint() + + expected_ops = ['x', 'neg', 'tanh', 'relu'] + for node, expected in zip(graph.nodes, expected_ops): + assert expected in node.name if __name__ == '__main__': run_tests() diff --git a/test/test_jit.py b/test/test_jit.py index b689f76681f7..d093a4b8826e 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -32,6 +32,7 @@ from jit.test_enum import TestEnum # noqa: F401 from jit.test_profiler import TestProfiler # noqa: F401 from jit.test_slice import TestSlice # noqa: F401 +from jit.test_warn import TestWarn # noqa: F401 # Torch from torch import Tensor @@ -1425,7 +1426,7 @@ def test_dropout(self): self.assertEqual(outputs, m(*inputs)) @slowTest - @unittest.skipIf(GRAPH_EXECUTOR == ProfilingMode.SIMPLE, 'Testing differentiable graph') + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.LEGACY, 'Testing differentiable graph') def test_dropout_module_requires_grad(self): with enable_profiling_mode_for_profiling_tests(): class MyModule(torch.nn.Module): @@ -7205,6 +7206,20 @@ def f(x): x = torch.rand(3, 4) self.assertEqual(scripted_f(x), f(x)) + def test_multiline_string_dedents(self): + def foo() -> None: + multiline_string_dedent_1 = """ +This is a string dedent """ + multiline_string_dedent_2 = """ This is a + string dedent """ + multiline_string_dedent_3 = """ + This is a string +dedent """ + multiline_string_dedent_4 = """ This is a string dedent """ + + scripted_foo = torch.jit.script(foo) + self.assertEqual(scripted_foo(), foo()) + # adapted from test in test_torch def test_tensor_to(self): template = dedent(''' @@ -9995,6 +10010,21 @@ def method(self, x): with self.assertRaisesRegex(RuntimeError, "Argument y not provided."): ModuleDefault() + def test_type_inferred_from_empty_annotation(self): + """ + Test that the type inferred from an empty or missing annotation is Torch.Tensor wtih `inferred=true` + """ + @torch.jit.script + def fn(x): + return x + + graph = fn.graph + n = next(graph.inputs()) + self.assertTrue(n.type() == torch._C.TensorType.getInferred()) + + with self.assertRaisesRegex(RuntimeError, "Inferred \'x\' to be of type \'Tensor"): + fn(1) + def test_script_define_order(self): class M(torch.jit.ScriptModule): diff --git a/test/test_jit_cuda_fuser.py b/test/test_jit_cuda_fuser.py index 9d61cd5dd157..ac9f054d38c8 100644 --- 
a/test/test_jit_cuda_fuser.py +++ b/test/test_jit_cuda_fuser.py @@ -3,7 +3,7 @@ import torch -from torch.testing._internal.common_utils import run_tests, ProfilingMode, GRAPH_EXECUTOR, skipIfRocm +from torch.testing._internal.common_utils import run_tests, ProfilingMode, GRAPH_EXECUTOR, skipIfRocm, TEST_WITH_ROCM from torch.testing._internal.codegen.random_topo_test import runDefaultTestWithSeed from test_jit import JitTestCase, RUN_CUDA @@ -550,9 +550,8 @@ def t(x: torch.Tensor, y: torch.Tensor): jit_o = t_jit(x, y) jit_o = t_jit(x, y) o = t(x, y) - for oo, jit_oo in zip(o, jit_o): - self.assertEqual(oo.dtype, jit_oo.dtype) - self.assertEqual(oo, jit_oo) + self.assertEqual(o.dtype, jit_o.dtype) + self.assertEqual(o, jit_o) self.assertGraphContains(t_jit.graph_for(x, y), FUSION_GROUP) # end-2-end test of permutation & contiguity handling in integration. @@ -595,11 +594,10 @@ def forward(self, x: torch.Tensor, y: torch.Tensor): jit_o = t_jit(x, y) jit_o = t_jit(x, y) o = t(x, y) - for oo, jit_oo in zip(o, jit_o): - self.assertEqual(oo.dtype, jit_oo.dtype) - # numerical issues here due to our scheduling. - # can't use `self.assertEqual(oo, jit_oo)` - self.assertTrue(self._compare("comparing output failed", oo, jit_oo, 1e-4)) + self.assertEqual(o.dtype, jit_o.dtype) + # numerical issues here due to our scheduling. + # can't use `self.assertEqual(o, jit_o)` + self.assertTrue(self._compare("comparing output failed", o, jit_o, 1e-4)) self.assertGraphContains(t_jit.graph_for(x, y), FUSION_GROUP) @unittest.skipIf(not RUN_CUDA, "requires CUDA") @@ -630,6 +628,81 @@ def test_reduction_permutation(self): for perm1 in itertools.permutations(range(len(x))): self._reduction_helper(x, axes, torch.float32, "cuda", perm0, perm1) + @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING and GRAPH_EXECUTOR != + ProfilingMode.LEGACY, "Requires fusion optimization pass to be effective") + def test_reduction_multiple_output(self): + torch._C._jit_set_bailout_depth(2) + + def t(x: torch.Tensor, y: torch.Tensor, scale: float, z: torch.Tensor): + o = torch.mul(x, y) + o = torch.mul(o, scale) + out1 = torch.mul(o, z) + out2 = torch.sum(out1, dim=[2]) + return out1, out2 + + t_jit = torch.jit.script(t) + x = torch.randn(8, 4, 10, 16, dtype=torch.float, device="cuda") + y = torch.randn(8, 4, 10, 16, dtype=torch.float, device="cuda") + z = torch.randn(8, 4, 10, 16, dtype=torch.float, device="cuda") + scale = 0.5 + jit_o = t_jit(x, y, scale, z) + jit_o = t_jit(x, y, scale, z) + o = t(x, y, scale, z) + for oo, jit_oo in zip(o, jit_o): + self.assertEqual(oo.dtype, jit_oo.dtype) + self.assertEqual(oo, jit_oo) + self.assertGraphContains(t_jit.graph_for(x, y, scale, z), FUSION_GROUP) + + x = x.to(memory_format=torch.channels_last) + y = y.to(memory_format=torch.channels_last) + z = z.to(memory_format=torch.channels_last) + jit_o = t_jit(x, y, scale, z) + jit_o = t_jit(x, y, scale, z) + o = t(x, y, scale, z) + for oo, jit_oo in zip(o, jit_o): + self.assertEqual(oo.dtype, jit_oo.dtype) + self.assertEqual(oo, jit_oo) + self.assertGraphContains(t_jit.graph_for(x, y, scale, z), FUSION_GROUP) + + @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING and GRAPH_EXECUTOR != + ProfilingMode.LEGACY, "Requires fusion optimization pass to be effective") + @skipIfRocm + def test_reduction_dtype(self): + def t(x: torch.Tensor): + o = torch.mul(x, 1.0) + o = torch.sum(o, dim=[2], dtype=torch.float32) + return o + t_jit = 
torch.jit.script(t) + + x = torch.randn(8, 4, 16, dtype=torch.float, device="cuda") + jit_o = t_jit(x) + jit_o = t_jit(x) + o = t(x) + self.assertEqual(o.dtype, jit_o.dtype) + self.assertTrue(self._compare("comparing output failed", o, jit_o, 1e-4)) + self.assertGraphContains(t_jit.graph_for(x), FUSION_GROUP) + + @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING and GRAPH_EXECUTOR != + ProfilingMode.LEGACY, "Requires fusion optimization pass to be effective") + @skipIfRocm + def test_reduction_half(self): + def t(x: torch.Tensor): + o = torch.mul(x, 1.0) + o = torch.sum(o, dim=[2]) + return o + + t_jit = torch.jit.script(t) + x = torch.randn(8, 4, 16, dtype=torch.float16, device="cuda") + jit_o = t_jit(x) + jit_o = t_jit(x) + o = t(x) + self.assertEqual(o.dtype, jit_o.dtype) + self.assertTrue(self._compare("comparing output failed", o, jit_o, 1e-4)) + self.assertGraphContains(t_jit.graph_for(x), FUSION_GROUP) + @unittest.skipIf(not RUN_CUDA, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING and GRAPH_EXECUTOR != ProfilingMode.LEGACY, "Requires fusion optimization pass to be effective") @@ -651,9 +724,8 @@ def t(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor): jit_o = t_jit(x, y, z) jit_o = t_jit(x, y, z) o = t(x, y, z) - for oo, jit_oo in zip(o, jit_o): - self.assertEqual(oo.dtype, jit_oo.dtype) - self.assertEqual(oo, jit_oo) + self.assertEqual(o.dtype, jit_o.dtype) + self.assertEqual(o, jit_o) self.assertGraphContains(t_jit.graph_for(x, y, z), FUSION_GROUP) @unittest.skipIf(not RUN_CUDA, "requires CUDA") @@ -676,9 +748,8 @@ def t(x: torch.Tensor, y: torch.Tensor, z: torch.Tensor): jit_o = t_jit(x, y, z) jit_o = t_jit(x, y, z) o = t(x, y, z) - for oo, jit_oo in zip(o, jit_o): - self.assertEqual(oo.dtype, jit_oo.dtype) - self.assertEqual(oo, jit_oo) + self.assertEqual(o.dtype, jit_o.dtype) + self.assertEqual(o, jit_o) self.assertGraphContains(t_jit.graph_for(x, y, z), FUSION_GROUP) @@ -731,4 +802,5 @@ def test_register_fuser(self): if __name__ == '__main__': - run_tests() + if not TEST_WITH_ROCM and GRAPH_EXECUTOR != ProfilingMode.PROFILING: + run_tests() diff --git a/test/test_jit_cuda_fuser_legacy.py b/test/test_jit_cuda_fuser_legacy.py index 4b9959c1231e..28ab78370637 100644 --- a/test/test_jit_cuda_fuser_legacy.py +++ b/test/test_jit_cuda_fuser_legacy.py @@ -1,5 +1,11 @@ import sys -sys.argv.append("--ge_config=legacy") +sys.argv.append("--jit_executor=legacy") + +import os +os.environ['PYTORCH_CUDA_FUSER_DISABLE_FALLBACK'] = '1' +os.environ['PYTORCH_CUDA_FUSER_DISABLE_FMA'] = '1' +os.environ['PYTORCH_CUDA_FUSER_JIT_OPT_LEVEL'] = '0' + from test_jit_cuda_fuser import * if __name__ == '__main__': diff --git a/test/test_jit_cuda_fuser_profiling.py b/test/test_jit_cuda_fuser_profiling.py index e2869eca7b5f..5114ab190457 100644 --- a/test/test_jit_cuda_fuser_profiling.py +++ b/test/test_jit_cuda_fuser_profiling.py @@ -1,5 +1,11 @@ import sys -sys.argv.append("--ge_config=profiling") +sys.argv.append("--jit_executor=profiling") + +import os +os.environ['PYTORCH_CUDA_FUSER_DISABLE_FALLBACK'] = '1' +os.environ['PYTORCH_CUDA_FUSER_DISABLE_FMA'] = '1' +os.environ['PYTORCH_CUDA_FUSER_JIT_OPT_LEVEL'] = '0' + from test_jit_cuda_fuser import * if __name__ == '__main__': diff --git a/test/test_jit_fuser.py b/test/test_jit_fuser.py index a75da03a6d21..b4efbf12c358 100644 --- a/test/test_jit_fuser.py +++ b/test/test_jit_fuser.py @@ -10,6 +10,7 @@ RUN_CUDA, RUN_CUDA_HALF, RUN_CUDA_MULTI_GPU, warmup_backward from 
textwrap import dedent from itertools import product, permutations +from torch.testing._internal.common_cuda import with_tf32_off from test_jit import backward_graph, all_backward_graphs, get_lstm_inputs, get_milstm_inputs, \ LSTMCellC, LSTMCellF, LSTMCellS, MiLSTMCell @@ -710,6 +711,9 @@ def test_lstm_cuda(self): "aten::_grad_sum_to_size")) @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") + # By default, on Ampere or later GPUs, LSTM computes float tensors at TF32 precision. + # We want float tensors to be computed at full precision in order to use the default precision + @with_tf32_off def test_lstm_concat_cuda(self): inputs = get_lstm_inputs('cuda') ge = self.checkTrace(LSTMCellC, inputs) @@ -740,6 +744,9 @@ def cell(x, hx, cx, w_ih, w_hh, b_ih, b_hh): # TODO: Fuser doesn't work at all when inputs require grad. Fix that @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") + # By default, on Ampere or later GPUs, LSTM computes float tensors at TF32 precision. + # We want float tensors to be computed at full precision in order to use the default precision + @with_tf32_off def test_lstm_traced_cuda(self): inputs = get_lstm_inputs('cuda') ge = self.checkTrace(LSTMCellF, inputs) diff --git a/test/test_jit_fuser_legacy.py b/test/test_jit_fuser_legacy.py index c33983e45e79..420075f6e611 100644 --- a/test/test_jit_fuser_legacy.py +++ b/test/test_jit_fuser_legacy.py @@ -1,5 +1,5 @@ import sys -sys.argv.append("--ge_config=legacy") +sys.argv.append("--jit_executor=legacy") from test_jit_fuser import * if __name__ == '__main__': diff --git a/test/test_jit_fuser_te.py b/test/test_jit_fuser_te.py index 453047eca8be..5cb43cbe8079 100644 --- a/test/test_jit_fuser_te.py +++ b/test/test_jit_fuser_te.py @@ -1,5 +1,6 @@ from collections import defaultdict +import operator import unittest import contextlib import torch @@ -459,6 +460,121 @@ def func(x): graph = backward_graph(s, skip_check=True) self.assertAllFused(graph, except_for={'aten::div', 'prim::Constant'}) + @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") + def test_add_bool(self): + def f(x, y, z): + return x + y + z + + x = torch.randint(0, 2, (4, 4), dtype=torch.bool, device='cuda') + y = torch.randint(0, 2, (4, 4), dtype=torch.bool, device='cuda') + z = torch.randint(0, 2, (4, 4), dtype=torch.bool, device='cuda') + + ge = self.checkTrace(f, (x, y, z), inputs_require_grads=False) + self.assertAllFused(ge.graph_for(x, y, z)) + + @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") + def test_mul_bool(self): + def f(x, y, z): + return x * y * z + + x = torch.randint(0, 2, (4, 4), dtype=torch.bool, device='cuda') + y = torch.randint(0, 2, (4, 4), dtype=torch.bool, device='cuda') + z = torch.randint(0, 2, (4, 4), dtype=torch.bool, device='cuda') + + ge = self.checkTrace(f, (x, y, z), inputs_require_grads=False) + self.assertAllFused(ge.graph_for(x, y, z)) + + @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") + def test_div_bool(self): + def f(x, y, z): + return (x + y) / z + + x = torch.randint(0, 2, (4, 4), dtype=torch.bool, device='cuda') + y = torch.randint(0, 2, (4, 4), dtype=torch.bool, device='cuda') + z = torch.ones_like(x, dtype=torch.bool, device='cuda') + + ge = self.checkTrace(f, (x, y, z), inputs_require_grads=False) + self.assertAllFused(ge.graph_for(x, y, z)) + + @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") + def test_bitwise_ops(self): + def apply(fn): + return lambda x, y, z: fn(fn(x, y), z) + + dtypes = [ + torch.int8, + torch.uint8, + torch.int16, + torch.int32, + torch.int64, + torch.bool, + ] + 
binary_ops = [ + operator.__and__, + operator.__or__, + operator.__xor__ + ] + devices = ["cuda"] + for dtype, op, device in product(dtypes, binary_ops, devices): + try: + x = self.data_for(dtype, device) + y = self.data_for(dtype, device) + z = self.data_for(dtype, device) + fn = apply(op) + ref = fn(x, y, z) + except Exception: + # If eager mode doesn't support a dtype/op/device combo, + # neither does the fuser. Catch everything to avoid needing to + # guess what errors might be thrown by eager. + continue + try: + t = torch.jit.trace(fn, (x, y, z)) + self.assertEqual(ref, t(x, y, z)) + self.assertAllFused(t.graph_for(x, y, z)) + except Exception as e: + raise RuntimeError( + " ".join(["Failed:", str(dtype), op.__name__, device]) + ) + + @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") + def test_minmax_int_ops(self): + def apply(fn): + return lambda x, y, z: fn(fn(x, y), z) + + dtypes = [ + torch.int8, + torch.uint8, + torch.int16, + torch.int32, + torch.int64, + torch.bool, + ] + binary_ops = [ + torch.min, + torch.max + ] + devices = ["cuda"] + for dtype, op, device in product(dtypes, binary_ops, devices): + try: + x = self.data_for(dtype, device) + y = self.data_for(dtype, device) + z = self.data_for(dtype, device) + fn = apply(op) + ref = fn(x, y, z) + except Exception: + # If eager mode doesn't support a dtype/op/device combo, + # neither does the fuser. Catch everything to avoid needing to + # guess what errors might be thrown by eager. + continue + try: + t = torch.jit.trace(fn, (x, y, z)) + self.assertEqual(ref, t(x, y, z)) + self.assertAllFused(t.graph_for(x, y, z)) + except Exception as e: + raise RuntimeError( + " ".join(["Failed:", str(dtype), op.__name__, device]) + ) + @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") def test_comparison_eq_ne(self): def f(x, y): @@ -567,6 +683,26 @@ def foo(hx, cx): # XXX: TE fuser can handle concats in a fusion group. # FileCheck().check("FusedConcat").check_next("return").run(str(graph)) + @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") + def test_remove_output_used_only_in_size(self): + def test_fuse(a, b): + c = a + b + d = c + b + return d + + scripted_f = torch.jit.script(test_fuse) + x = torch.ones(1, requires_grad=True, device='cuda') + y = torch.ones(1, requires_grad=True, device='cuda') + warmup_forward(scripted_f, x, y) + g = torch.jit.last_executed_optimized_graph() + diff_nodes = [n for n in g.nodes() if n.kind() == 'prim::DifferentiableGraph'] + self.assertEqual(len(diff_nodes), 1) + g = diff_nodes[0].g('Subgraph') + if_nodes = [n for n in g.nodes() if n.kind() == 'prim::If'] + self.assertEqual(len(if_nodes), 1) + # the if node and the fusion group inside it should only have one output + self.assertEqual(len(list(if_nodes[0].outputs())), 1) + @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") def test_concat_invariant_cuda(self): # Invariant: the output of prim::FusedConcat may @@ -1152,7 +1288,7 @@ def apply(fn): torch.int16, torch.int32, torch.int64, - # torch.float16, + torch.float16, torch.float32, torch.float64, torch.bool, @@ -1234,6 +1370,36 @@ def fn(x): self.assertEqual(ref, t(x)) self.assertEqual(len(self.findFusionGroups(t.graph_for(x))), 0) + @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") + def test_superslomo(self): + # Test extracted from Super-SloMo: https://github.com/avinashpaliwal/Super-SloMo + # A few interesting things happen here: strided inputs of mixed size, + # plus outputs of mixed shapes. 
The latter characteristic happened to + # expose a memory corruption bug due to not properly guarding the + # outputs. + def eager(t0, t1, t2, t3, t4): + t5 = torch.mul(t0, t4) + t6 = torch.mul(t2, t3) + t7 = torch.mul(t6, t1) + t9 = torch.add(t5, t7) + t11 = torch.add(t0, t6) + ft_p = torch.div(t9, t11) + return (ft_p, t11, t9, t6) + + t0 = torch.rand(1, 6, 352, 352, device="cuda").transpose(0, 1) + t1 = torch.rand(6, 3, 352, 352, device="cuda") + t2 = torch.rand(6, device="cuda")[None, None, None, :].permute(3, 0, 1, 2) + t3 = torch.rand(6, 1, 352, 352, device="cuda") + t4 = torch.rand(6, 3, 352, 352, device="cuda") + inputs = [t0, t1, t2, t3, t4] + + script = torch.jit.script(eager) + for _ in range(4): + for pair in zip(script(*inputs), eager(*inputs)): + test, ref = pair + torch.testing.assert_allclose(test, ref) + self.assertAllFused(script.graph_for(*inputs)) + if __name__ == '__main__': run_tests() diff --git a/test/test_jit_legacy.py b/test/test_jit_legacy.py index 2422e518a7f9..b17908e910bb 100644 --- a/test/test_jit_legacy.py +++ b/test/test_jit_legacy.py @@ -1,5 +1,5 @@ import sys -sys.argv.append("--ge_config=legacy") +sys.argv.append("--jit_executor=legacy") from test_jit import * if __name__ == '__main__': diff --git a/test/test_jit_profiling.py b/test/test_jit_profiling.py index be02985e69a8..dc6bb2fbf878 100644 --- a/test/test_jit_profiling.py +++ b/test/test_jit_profiling.py @@ -1,5 +1,5 @@ import sys -sys.argv.append("--ge_config=profiling") +sys.argv.append("--jit_executor=profiling") from test_jit import * if __name__ == '__main__': diff --git a/test/test_jit_py3.py b/test/test_jit_py3.py index 4de5db884035..212b03d9658b 100644 --- a/test/test_jit_py3.py +++ b/test/test_jit_py3.py @@ -621,7 +621,7 @@ def if_function(inp: torch.Tensor) -> Any: def test_module_properties(self): class ModuleWithProperties(torch.nn.Module): - __ignored_properties__ = ["ignored_attr"] + __jit_unused_properties__ = ["ignored_attr"] def __init__(self, a: int): super().__init__() @@ -639,6 +639,15 @@ def attr(self): def ignored_attr(self): return sum([self.a]) + @torch.jit.unused + @property + def ignored_attr_2(self): + return sum([self.a]) + + @ignored_attr_2.setter + def ignored_attr_2(self, value): + self.a = sum([self.a]) + @attr.setter def attr(self, a: int): if a > 0: diff --git a/test/test_jit_simple.py b/test/test_jit_simple.py index 910e4a17713d..23da6602c572 100644 --- a/test/test_jit_simple.py +++ b/test/test_jit_simple.py @@ -1,5 +1,5 @@ import sys -sys.argv.append("--ge_config=simple") +sys.argv.append("--jit_executor=simple") from test_jit import * if __name__ == '__main__': diff --git a/test/test_linalg.py b/test/test_linalg.py index c81b4dc37582..97c7b926faf4 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -1,13 +1,12 @@ import torch import unittest import itertools -import warnings from math import inf, nan, isnan from torch.testing._internal.common_utils import \ - (TestCase, run_tests, TEST_NUMPY) + (TestCase, run_tests, TEST_NUMPY, IS_MACOS, IS_WINDOWS, TEST_WITH_ASAN) from torch.testing._internal.common_device_type import \ - (instantiate_device_type_tests, dtypes, skipCUDAIfNoMagma, skipCPUIfNoLapack) + (instantiate_device_type_tests, dtypes, skipCUDAIfNoMagma, skipCPUIfNoLapack, precisionOverride) from torch.testing._internal.jit_metaprogramming_utils import gen_script_fn_and_args from torch.autograd import gradcheck @@ -17,21 +16,143 @@ class TestLinalg(TestCase): exact_dtype = True - # TODO: test out variant - # Tests torch.ger, and its alias, 
torch.outer, vs. NumPy + # Tests torch.outer, and its alias, torch.ger, vs. NumPy @unittest.skipIf(not TEST_NUMPY, "NumPy not found") - @dtypes(torch.float) + @precisionOverride({torch.bfloat16: 1e-1}) + @dtypes(*(torch.testing.get_all_dtypes())) def test_outer(self, device, dtype): - a = torch.randn(50, device=device, dtype=dtype) - b = torch.randn(50, device=device, dtype=dtype) + def run_test_case(a, b): + if dtype == torch.bfloat16: + a_np = a.to(torch.double).cpu().numpy() + b_np = b.to(torch.double).cpu().numpy() + else: + a_np = a.cpu().numpy() + b_np = b.cpu().numpy() + expected = np.outer(a_np, b_np) + + self.assertEqual(torch.outer(a, b), expected) + self.assertEqual(torch.Tensor.outer(a, b), expected) + + self.assertEqual(torch.ger(a, b), expected) + self.assertEqual(torch.Tensor.ger(a, b), expected) + + # test out variant + out = torch.empty(a.size(0), b.size(0), device=device, dtype=dtype) + torch.outer(a, b, out=out) + self.assertEqual(out, expected) - ops = (torch.ger, torch.Tensor.ger, - torch.outer, torch.Tensor.outer) + out = torch.empty(a.size(0), b.size(0), device=device, dtype=dtype) + torch.ger(a, b, out=out) + self.assertEqual(out, expected) - expected = np.outer(a.cpu().numpy(), b.cpu().numpy()) - for op in ops: - actual = op(a, b) - self.assertEqual(actual, expected) + a = torch.randn(50).to(device=device, dtype=dtype) + b = torch.randn(50).to(device=device, dtype=dtype) + run_test_case(a, b) + + # test 0 strided tensor + zero_strided = torch.randn(1).to(device=device, dtype=dtype).expand(50) + run_test_case(zero_strided, b) + run_test_case(a, zero_strided) + + @unittest.skipIf(not TEST_NUMPY, "NumPy not found") + @precisionOverride({torch.bfloat16: 1e-1}) + @dtypes(*(torch.testing.get_all_dtypes())) + def test_addr(self, device, dtype): + def run_test_case(m, a, b, beta=1, alpha=1): + if dtype == torch.bfloat16: + a_np = a.to(torch.double).cpu().numpy() + b_np = b.to(torch.double).cpu().numpy() + m_np = m.to(torch.double).cpu().numpy() + else: + a_np = a.cpu().numpy() + b_np = b.cpu().numpy() + m_np = m.cpu().numpy() + + if beta == 0: + expected = alpha * np.outer(a_np, b_np) + else: + expected = beta * m_np + alpha * np.outer(a_np, b_np) + + self.assertEqual(torch.addr(m, a, b, beta=beta, alpha=alpha), expected) + self.assertEqual(torch.Tensor.addr(m, a, b, beta=beta, alpha=alpha), expected) + + result_dtype = torch.addr(m, a, b, beta=beta, alpha=alpha).dtype + out = torch.empty_like(m, dtype=result_dtype) + torch.addr(m, a, b, beta=beta, alpha=alpha, out=out) + self.assertEqual(out, expected) + + a = torch.randn(50).to(device=device, dtype=dtype) + b = torch.randn(50).to(device=device, dtype=dtype) + m = torch.randn(50, 50).to(device=device, dtype=dtype) + + # when beta is zero + run_test_case(m, a, b, beta=0., alpha=2) + + # when beta is not zero + run_test_case(m, a, b, beta=0.5, alpha=2) + + # test transpose + m_transpose = torch.transpose(m, 0, 1) + run_test_case(m_transpose, a, b, beta=0.5, alpha=2) + + # test 0 strided tensor + zero_strided = torch.randn(1).to(device=device, dtype=dtype).expand(50) + run_test_case(m, zero_strided, b, beta=0.5, alpha=2) + + # test scalar + m_scalar = torch.tensor(1, device=device, dtype=dtype) + run_test_case(m_scalar, a, b) + + @dtypes(*itertools.product(torch.testing.get_all_dtypes(), + torch.testing.get_all_dtypes())) + def test_outer_type_promotion(self, device, dtypes): + a = torch.randn(5).to(device=device, dtype=dtypes[0]) + b = torch.randn(5).to(device=device, dtype=dtypes[1]) + for op in (torch.outer, 
torch.Tensor.outer, torch.ger, torch.Tensor.ger): + result = op(a, b) + self.assertEqual(result.dtype, torch.result_type(a, b)) + + @dtypes(*itertools.product(torch.testing.get_all_dtypes(), + torch.testing.get_all_dtypes())) + def test_addr_type_promotion(self, device, dtypes): + a = torch.randn(5).to(device=device, dtype=dtypes[0]) + b = torch.randn(5).to(device=device, dtype=dtypes[1]) + m = torch.randn(5, 5).to(device=device, + dtype=torch.result_type(a, b)) + for op in (torch.addr, torch.Tensor.addr): + # pass the integer 1 to the torch.result_type as both + # the default values of alpha and beta are integers (alpha=1, beta=1) + desired_dtype = torch.result_type(m, 1) + result = op(m, a, b) + self.assertEqual(result.dtype, desired_dtype) + + desired_dtype = torch.result_type(m, 2.) + result = op(m, a, b, beta=0, alpha=2.) + self.assertEqual(result.dtype, desired_dtype) + + # Tests migrated from test_torch.py + # 1) test the shape of the result tensor when there is empty input tensor + # 2) test the Runtime Exception when there is scalar input tensor + def test_outer_ger_addr_legacy_tests(self, device): + for size in ((0, 0), (0, 5), (5, 0)): + a = torch.rand(size[0], device=device) + b = torch.rand(size[1], device=device) + + self.assertEqual(torch.outer(a, b).shape, size) + self.assertEqual(torch.ger(a, b).shape, size) + + m = torch.empty(size, device=device) + self.assertEqual(torch.addr(m, a, b).shape, size) + + m = torch.randn(5, 6, device=device) + a = torch.randn(5, device=device) + b = torch.tensor(6, device=device) + self.assertRaises(RuntimeError, lambda: torch.outer(a, b)) + self.assertRaises(RuntimeError, lambda: torch.outer(b, a)) + self.assertRaises(RuntimeError, lambda: torch.ger(a, b)) + self.assertRaises(RuntimeError, lambda: torch.ger(b, a)) + self.assertRaises(RuntimeError, lambda: torch.addr(m, a, b)) + self.assertRaises(RuntimeError, lambda: torch.addr(m, b, a)) # Tests torch.det and its alias, torch.linalg.det, vs. 
NumPy @skipCUDAIfNoMagma @@ -56,11 +177,12 @@ def test_det(self, device, dtype): # NOTE: det requires a 2D+ tensor t = torch.randn(1, device=device, dtype=dtype) - with self.assertRaises(IndexError): + with self.assertRaises(RuntimeError): op(t) # This test confirms that torch.linalg.norm's dtype argument works # as expected, according to the function's documentation + @skipCUDAIfNoMagma def test_norm_dtype(self, device): def run_test_case(input_size, ord, keepdim, from_dtype, to_dtype, compare_dtype): msg = ( @@ -154,6 +276,7 @@ def run_test_case(input, p, dim, keepdim): # This test compares torch.linalg.norm and numpy.linalg.norm to ensure that # their matrix norm results match + @skipCUDAIfNoMagma @unittest.skipIf(not TEST_NUMPY, "NumPy not found") @dtypes(torch.float, torch.double) def test_norm_matrix(self, device, dtype): @@ -354,9 +477,6 @@ def gen_error_message(input_size, ord, keepdim, dim=None): unsupported_matrix_ords = [ (None, r'norm with p=2 not supported for complex tensors'), ('fro', r'frobenius norm not supported for complex tensors'), - (2, r'"svd_cuda" not implemented for \'Complex'), - (-2, r'"svd_cuda" not implemented for \'Complex'), - ('nuc', r'"svd_cuda" not implemented for \'Complex'), ] # Test supported ords @@ -400,6 +520,8 @@ def gen_error_message(input_size, ord, keepdim, dim=None): # Test that linal.norm gives the same result as numpy when inputs # contain extreme values (inf, -inf, nan) + @unittest.skipIf(IS_WINDOWS, "Skipped on Windows!") + @unittest.skipIf(IS_MACOS, "Skipped on MacOS!") @skipCUDAIfNoMagma @skipCPUIfNoLapack @unittest.skipIf(not TEST_NUMPY, "Numpy not found") @@ -440,14 +562,14 @@ def is_broken_matrix_norm_case(ord, x): result_n = np.linalg.norm(x_n, ord=ord) if is_broken_matrix_norm_case(ord, x): - self.assertNotEqual(result, result_n, msg=msg) + continue else: self.assertEqual(result, result_n, msg=msg) # Test degenerate shape results match numpy for linalg.norm vector norms @skipCUDAIfNoMagma @skipCPUIfNoLapack - @unittest.skipIf(not TEST_NUMPY, "Numpy not found") + @unittest.skipIf(TEST_WITH_ASAN, "Skipped on ASAN since it checks for undefined behavior.") @dtypes(torch.float, torch.double, torch.cfloat, torch.cdouble) def test_norm_vector_degenerate_shapes(self, device, dtype): def run_test_case(input, ord, dim, keepdim, should_error): @@ -528,18 +650,6 @@ def run_test_case(input, ord, dim, keepdim, should_error): for ord in ord_matrix: run_test_case(input, ord, dim, keepdim, ord in error_ords) - def test_norm_deprecated(self, device): - expected_message = ( - r'torch.norm is deprecated and may be removed in a future PyTorch release. 
' - r'Use torch.linalg.norm instead.') - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter("always") - for func in [torch.norm, torch.functional.norm]: - func(torch.rand(10, device=device)) - self.assertEqual(len(w), 2) - for wi in w: - self.assertEqual(str(wi.message), expected_message) - def test_norm_fastpaths(self, device): x = torch.randn(3, 5, device=device) diff --git a/test/test_mobile_optimizer.py b/test/test_mobile_optimizer.py index eae6175fb024..11235edac7c0 100644 --- a/test/test_mobile_optimizer.py +++ b/test/test_mobile_optimizer.py @@ -100,8 +100,8 @@ def forward(self, x): torch.testing.assert_allclose(initial_result, optimized_result, rtol=1e-2, atol=1e-3) - optimization_blacklist_no_prepack = {MobileOptimizerType.INSERT_FOLD_PREPACK_OPS} - optimized_scripted_model_no_prepack = optimize_for_mobile(scripted_model, optimization_blacklist_no_prepack) + optimization_blocklist_no_prepack = {MobileOptimizerType.INSERT_FOLD_PREPACK_OPS} + optimized_scripted_model_no_prepack = optimize_for_mobile(scripted_model, optimization_blocklist_no_prepack) optimized_result_no_prepack = optimized_scripted_model_no_prepack(input_data) FileCheck().check_count("Tensor = aten::conv2d", 1, exactly=True) \ @@ -118,19 +118,36 @@ def forward(self, x): FileCheck().check_count("prim::CallMethod[name=\"forward\"]", 2, exactly=True) \ .run(str(get_forward(bn_scripted_module._c).graph)) - optimization_blacklist_no_prepack = {MobileOptimizerType.INSERT_FOLD_PREPACK_OPS} - bn_fold_scripted_module = optimize_for_mobile(bn_scripted_module, optimization_blacklist_no_prepack) + optimization_blocklist_no_prepack = {MobileOptimizerType.INSERT_FOLD_PREPACK_OPS} + bn_fold_scripted_module = optimize_for_mobile(bn_scripted_module, optimization_blocklist_no_prepack) self.assertEqual(len(torch.jit.export_opnames(bn_fold_scripted_module)), 1) bn_input = torch.rand(1, 1, 6, 6) torch.testing.assert_allclose(bn_scripted_module(bn_input), bn_fold_scripted_module(bn_input), rtol=1e-2, atol=1e-3) - optimization_blacklist_no_fold_bn = {MobileOptimizerType.CONV_BN_FUSION} - no_bn_fold_scripted_module = optimize_for_mobile(bn_scripted_module, optimization_blacklist_no_fold_bn) + optimization_blocklist_no_fold_bn = {MobileOptimizerType.CONV_BN_FUSION} + no_bn_fold_scripted_module = optimize_for_mobile(bn_scripted_module, optimization_blocklist_no_fold_bn) FileCheck().check_count("aten::batch_norm", 1, exactly=True) \ .run(str(get_forward_graph(no_bn_fold_scripted_module._c))) bn_input = torch.rand(1, 1, 6, 6) torch.testing.assert_allclose(bn_scripted_module(bn_input), no_bn_fold_scripted_module(bn_input), rtol=1e-2, atol=1e-3) + class MyMobileOptimizedTagTest(torch.nn.Module): + def __init__(self): + super(MyMobileOptimizedTagTest, self).__init__() + self.linear_weight = torch.nn.Parameter(torch.Tensor(torch.rand(linear_weight_shape))) + self.linear_bias = torch.nn.Parameter(torch.Tensor(torch.rand((weight_output_dim)))) + + def forward(self, x): + o = F.linear(x, self.linear_weight, self.linear_bias) + return F.relu(o) + + mobile_optimized_tag_module = MyMobileOptimizedTagTest() + m = torch.jit.script(mobile_optimized_tag_module) + m.eval() + opt_m = optimize_for_mobile(m) + tag = getattr(opt_m, "mobile_optimized", None) + self.assertTrue(tag) + class MyPreserveMethodsTest(torch.nn.Module): def __init__(self): super(MyPreserveMethodsTest, self).__init__() diff --git a/test/test_nn.py b/test/test_nn.py index 07070d0e550b..7e74d0719eb4 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -33,6 +33,7 @@ from 
torch.autograd.gradcheck import gradgradcheck from torch.nn import Parameter from torch.nn.parallel._functions import Broadcast +from torch.testing import get_all_fp_dtypes from torch.testing._internal.common_utils import freeze_rng_state, run_tests, TestCase, skipIfNoLapack, skipIfRocm, \ TEST_NUMPY, TEST_SCIPY, TEST_WITH_ROCM, download_file, \ get_function_arglist, load_tests, repeat_test_for_types, ALL_TENSORTYPES, \ @@ -52,6 +53,10 @@ from torch.testing._internal.common_utils import _assertGradAndGradgradChecks from torch.testing._internal.common_utils import dtype2prec_DONTUSE from torch.testing._internal.common_cuda import tf32_on_and_off, tf32_is_not_fp32, tf32_off, tf32_on +from torch.types import _TensorOrTensors + + +AMPERE_OR_ROCM = TEST_WITH_ROCM or tf32_is_not_fp32() # load_tests from common_utils is used to automatically filter tests for # sharding on sandcastle. This line silences flake warnings @@ -312,15 +317,19 @@ class TestNN(NNTestCase): _do_cuda_memory_leak_check = True _do_cuda_non_default_stream = True - def _forward(self, module, input): + def _forward(self, module, input: _TensorOrTensors): with freeze_rng_state(): - return module(input) + if isinstance(input, tuple): + return module(*input) + else: + return module(input) - def _backward(self, module, input, output, grad_output, create_graph=False): + def _backward(self, module, input: _TensorOrTensors, output, grad_output, create_graph=False): output.backward(grad_output, retain_graph=True, create_graph=create_graph) - if input.grad is None: - return None - return input.grad.data + if isinstance(input, tuple): + return tuple(map(lambda i: i.grad.data if i.grad is not None else None, input)) + else: + return input.grad.data if input.grad is not None else None def _forward_criterion(self, criterion, input, target, extra_args=None): if extra_args is None: @@ -2646,6 +2655,19 @@ def test_weight_norm(self): m = torch.nn.utils.weight_norm(m) m = torch.nn.utils.weight_norm(m) + def test_parameterlistdict_setting_attributes(self): + mod = nn.ParameterList(map(nn.Parameter, [torch.rand(2), torch.rand(2)])) + + with self.assertWarnsRegex(UserWarning, + r"Setting attributes on ParameterList is not supported"): + torch.nn.utils.weight_norm(mod, "0") + + mod = nn.ParameterDict({"a": nn.Parameter(torch.rand(2)), "b": nn.Parameter(torch.rand(2))}) + + with self.assertWarnsRegex(UserWarning, + r"Setting attributes on ParameterDict is not supported"): + torch.nn.utils.weight_norm(mod, "b") + def test_weight_norm_pickle(self): m = torch.nn.utils.weight_norm(nn.Linear(5, 7)) m = pickle.loads(pickle.dumps(m)) @@ -3910,6 +3932,15 @@ def test_Conv2d_inconsistent_types_on_GPU_without_cudnn(self): # but it should work with the same type nn.functional.conv2d(inputs.float(), weights.float(), bias.float()) + def test_Conv2d_1x1(self): + in_channels = 2 + out_channels = 2 + mod = torch.nn.Conv2d(2, 2, 1, bias=False).to(dtype=torch.double) + input = torch.randn(1, in_channels, 5, 5, requires_grad=True, dtype=torch.double) + for enabled in (False, True): + with torch.backends.mkldnn.flags(enabled=enabled): + gradcheck(F.conv2d, (input, mod.weight)) + @unittest.skipIf(not TEST_CUDA, 'CUDA not available') @unittest.skipIf(not TEST_CUDNN, 'CUDNN not available') def test_cudnn_non_contiguous(self): @@ -3939,7 +3970,7 @@ def test_Conv2d_inconsistent_types_on_GPU_with_cudnn(self): @unittest.skipIf(not TEST_CUDA, 'CUDA not available') @unittest.skipIf(not TEST_CUDNN, 'CUDNN not available') - @repeat_test_for_types(ALL_TENSORTYPES2) + 
@repeat_test_for_types(get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM)) def test_Conv2d_deterministic_cudnn(self, dtype=torch.float): inputs = torch.randn(2, 3, 5, 5, device="cuda", dtype=dtype, requires_grad=True) with cudnn.flags(enabled=True, benchmark=True, deterministic=True): @@ -3969,7 +4000,7 @@ def test_Conv2d_backward_twice(self): lambda: o1.sum().backward()) @unittest.skipIf(not TEST_CUDA, 'CUDA not available') - @repeat_test_for_types(ALL_TENSORTYPES2) + @repeat_test_for_types(get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM)) def test_Conv2d_large_workspace(self, dtype=torch.float): # These sizes require huge cuDNN workspaces. Make sure we choose a # reasonable algorithm that does not run out of memory @@ -4096,7 +4127,7 @@ def test_Conv2d_groups_nobias(self): dev_dtypes = [("cpu", torch.float)] if TEST_CUDA: dev_dtypes += [("cuda", torch.float), ("cuda", torch.half)] - if TEST_WITH_ROCM: + if AMPERE_OR_ROCM: dev_dtypes += [("cuda", torch.bfloat16)] for device, dtype in dev_dtypes: m = nn.Conv2d(4, 4, kernel_size=3, groups=2, bias=False).to(device, dtype) @@ -4134,7 +4165,7 @@ def test_Conv2d_groups_nobias_v2(self): dev_dtypes = [("cpu", torch.float)] if TEST_CUDA: dev_dtypes += [("cuda", torch.float), ("cuda", torch.half)] - if TEST_WITH_ROCM: + if AMPERE_OR_ROCM: dev_dtypes += [("cuda", torch.bfloat16)] for device, dtype in dev_dtypes: m = nn.Conv2d(4, 16, kernel_size=3, groups=2, bias=False).to(device, dtype) @@ -5147,6 +5178,493 @@ def test_transformerdecoderlayer_gelu(self): self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) torch.testing.assert_allclose(result, ref_output) + def test_transformerencoder(self): + def get_a_test_layer(use_cuda, activation): + d_model = 4 + nhead = 2 + dim_feedforward = 16 + dropout = 0.0 + device = torch.device("cuda" if use_cuda else "cpu") + + layer = nn.TransformerEncoderLayer( + d_model, + nhead, + dim_feedforward=dim_feedforward, + dropout=dropout, + activation=activation).to(device) + + with torch.no_grad(): + # set constant weights of the model + for idx, p in enumerate(layer.parameters()): + x = p.data + sz = x.view(-1).size(0) + shape = x.shape + x = torch.cos(torch.arange(0, sz).float().view(shape)) + p.data.copy_(x) + + return layer + + # this is a deterministic test for TransformerEncoder + activation = "relu" + use_cuda = torch.cuda.is_available() + device = torch.device("cuda" if use_cuda else "cpu") + + encoder_layer = get_a_test_layer(use_cuda=use_cuda, activation=activation) + + model = nn.TransformerEncoder(encoder_layer, 1).to(device) + + # deterministic input + encoder_input = torch.Tensor([[[0.7462, 0.6653, 0.5679, 0.4891], + [0.5387, 0.1655, 0.3565, 0.0471]], + [[0.8335, 0.2799, 0.5031, 0.2947], + [0.1402, 0.0318, 0.7636, 0.1346]], + [[0.6333, 0.9344, 0.1376, 0.9938], + [0.8924, 0.2872, 0.6692, 0.2944]], + [[0.9897, 0.6915, 0.3154, 0.1733], + [0.8645, 0.3513, 0.3064, 0.0767]], + [[0.8117, 0.2366, 0.4838, 0.7881], + [0.3718, 0.4945, 0.9511, 0.0864]]] + ).to(device) + result = model(encoder_input) + ref_output = torch.Tensor([[[2.428589, 0.020835, -0.602055, -0.085249], + [2.427987, 0.021213, -0.602496, -0.084103]], + [[2.424689, 0.019155, -0.604793, -0.085672], + [2.413863, 0.022211, -0.612486, -0.072490]], + [[2.433774, 0.021598, -0.598343, -0.087548], + [2.425104, 0.019748, -0.604515, -0.084839]], + [[2.436185, 0.022682, -0.596625, -0.087261], + [2.433556, 0.021891, -0.598509, -0.086832]], + [[2.416246, 0.017512, -0.610712, -0.082961], + [2.422901, 0.024187, -0.606178, -0.074929]]] + ).to(device) + 
self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) + torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5) + + # all 0 + mask = torch.zeros([2, 5]).to(device) == 1 + result = model(encoder_input, src_key_padding_mask=mask) + self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) + torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5) + mask[0, 1] = 1 + mask[1, 3] = 1 + mask[1, 4] = 1 + result = model(encoder_input, src_key_padding_mask=mask) + ref_output = torch.Tensor([[[2.429026, 0.020793, -0.601741, -0.085642], + [2.428811, 0.021445, -0.601912, -0.084252]], + [[2.425009, 0.019155, -0.604566, -0.085899], + [2.415408, 0.02249, -0.611415, -0.073]], + [[2.434199, 0.021682, -0.598039, -0.087699], + [2.42598, 0.019941, -0.603896, -0.085091]], + [[2.436457, 0.022736, -0.59643, -0.08736], + [2.434021, 0.022093, -0.598179, -0.08679]], + [[2.416531, 0.017498, -0.610513, -0.083181], + [2.4242, 0.024653, -0.605266, -0.074959]]] + ).to(device) + self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) + torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5) + + # test case 2, multiple layers no norm + model = nn.TransformerEncoder(encoder_layer, 2).to(device) + result = model(encoder_input, src_key_padding_mask=mask) + ref_output = torch.Tensor( + [[[2.419051, 0.017446, -0.608738, -0.085003], + [2.419102, 0.017452, -0.608703, -0.085026]], + [[2.419043, 0.017445, -0.608744, -0.084999], + [2.419052, 0.017446, -0.608738, -0.085004]], + [[2.419067, 0.017448, -0.608727, -0.085010], + [2.419098, 0.017452, -0.608706, -0.085024]], + [[2.419072, 0.017449, -0.608724, -0.085012], + [2.419119, 0.017455, -0.608691, -0.085034]], + [[2.419019, 0.017442, -0.608761, -0.084989], + [2.419075, 0.017449, -0.608722, -0.085014]]] + ).to(device) + self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) + torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5) + + model = nn.TransformerEncoder(encoder_layer, 6).to(device) + result = model(encoder_input, src_key_padding_mask=mask) + ref_output = torch.Tensor( + [[[2.419101, 0.017453, -0.608703, -0.085025], + [2.419101, 0.017453, -0.608704, -0.085025]], + [[2.419101, 0.017453, -0.608703, -0.085025], + [2.419101, 0.017453, -0.608704, -0.085025]], + [[2.419101, 0.017453, -0.608703, -0.085025], + [2.419101, 0.017453, -0.608704, -0.085025]], + [[2.419101, 0.017453, -0.608703, -0.085025], + [2.419101, 0.017453, -0.608704, -0.085025]], + [[2.419101, 0.017453, -0.608703, -0.085025], + [2.419101, 0.017453, -0.608704, -0.085025]]] + ).to(device) + self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) + torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5) + + # test case 3, multiple layers with norm + # d_model = 4 + norm = nn.LayerNorm(4) + model = nn.TransformerEncoder(encoder_layer, 2, norm=norm).to(device) + result = model(encoder_input, src_key_padding_mask=mask) + ref_output = torch.Tensor( + [[[1.695949, -0.357635, -0.893077, -0.445238], + [1.695955, -0.357639, -0.893050, -0.445266]], + [[1.695948, -0.357634, -0.893082, -0.445233], + [1.695950, -0.357635, -0.893077, -0.445238]], + [[1.695951, -0.357636, -0.893069, -0.445246], + [1.695955, -0.357639, -0.893052, -0.445264]], + [[1.695952, -0.357636, -0.893066, -0.445249], + [1.695957, -0.357641, -0.893041, -0.445276]], + [[1.695946, -0.357632, -0.893095, -0.445220], + [1.695952, -0.357637, -0.893065, -0.445251]]] + ).to(device) + self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) + 
torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5) + + model = nn.TransformerEncoder(encoder_layer, 6, norm=norm).to(device) + result = model(encoder_input, src_key_padding_mask=mask) + ref_output = torch.Tensor( + [[[1.695955, -0.357639, -0.893051, -0.445265], + [1.695955, -0.357639, -0.893051, -0.445265]], + [[1.695955, -0.357639, -0.893051, -0.445265], + [1.695955, -0.357639, -0.893051, -0.445265]], + [[1.695955, -0.357639, -0.893051, -0.445265], + [1.695955, -0.357639, -0.893051, -0.445265]], + [[1.695955, -0.357639, -0.893051, -0.445265], + [1.695955, -0.357639, -0.893051, -0.445265]], + [[1.695955, -0.357639, -0.893051, -0.445265], + [1.695955, -0.357639, -0.893051, -0.445265]]] + ).to(device) + self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) + torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5) + + + def test_transformerdecoder(self): + def get_a_test_layer(use_cuda, activation): + d_model = 4 + nhead = 2 + dim_feedforward = 16 + dropout = 0.0 + device = torch.device("cuda" if use_cuda else "cpu") + + layer = nn.TransformerDecoderLayer( + d_model, + nhead, + dim_feedforward=dim_feedforward, + dropout=dropout, + activation=activation).to(device) + + with torch.no_grad(): + # set constant weights of the model + for idx, p in enumerate(layer.parameters()): + x = p.data + sz = x.view(-1).size(0) + shape = x.shape + x = torch.cos(torch.arange(0, sz).float().view(shape)) + p.data.copy_(x) + + return layer + + # this is a deterministic test for TransformerDecoder + activation = "relu" + use_cuda = torch.cuda.is_available() + device = torch.device("cuda" if use_cuda else "cpu") + + decoder_layer = get_a_test_layer(use_cuda=use_cuda, activation=activation) + + model = nn.TransformerDecoder(decoder_layer, 1).to(device) + + # deterministic input + decoder_input = torch.Tensor([[[20, 30, 40, 50]]]).to(device) + memory_input = torch.Tensor([[[60, 70, 80, 90]]]).to(device) + result = model(decoder_input, memory_input) + ref_output = torch.Tensor( + [[[2.314351, 0.094805, -0.671322, 0.101977]]]).to(device) + self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) + torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5) + + # deterministic input + decoder_input = torch.Tensor([[[9, 10, 11, 12]], + [[11, 12, 13, 14]]]).to(device) + memory_input = torch.Tensor([[[1, 2, 3, 4]]]).to(device) + result = model(decoder_input, memory_input) + ref_output = torch.Tensor( + [[[2.422245, 0.051716, -0.606338, -0.024756]], + [[2.422245, 0.051716, -0.606338, -0.024756]]] + ).to(device) + self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) + torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5) + + # deterministic input + decoder_input = torch.Tensor([[[1, 2, 3, 4]], + [[5, 6, 7, 8]]]).to(device) + memory_input = torch.Tensor([[[9, 10, 11, 12]], + [[11, 12, 13, 14]]]).to(device) + result = model(decoder_input, memory_input) + ref_output = torch.Tensor( + [[[2.343536, 0.085561, -0.654954, 0.074991]], + [[2.343536, 0.085561, -0.654954, 0.074991]]] + ).to(device) + self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) + torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5) + + # deterministic input + decoder_input = torch.Tensor([[[0.4517, 0.6793, 0.5313, 0.0034], + [0.2678, 0.3677, 0.4459, 0.7166]], + [[0.8100, 0.3716, 0.4096, 0.1976], + [0.6958, 0.8844, 0.6081, 0.8315]], + [[0.0494, 0.9343, 0.5955, 0.3830], + [0.5404, 0.3464, 0.9378, 0.6200]]] + ).to(device) + memory_input = 
torch.Tensor([[[0.7462, 0.6653, 0.5679, 0.4891], + [0.5387, 0.1655, 0.3565, 0.0471]], + [[0.8335, 0.2799, 0.5031, 0.2947], + [0.1402, 0.0318, 0.7636, 0.1346]], + [[0.6333, 0.9344, 0.1376, 0.9938], + [0.8924, 0.2872, 0.6692, 0.2944]], + [[0.9897, 0.6915, 0.3154, 0.1733], + [0.8645, 0.3513, 0.3064, 0.0767]], + [[0.8117, 0.2366, 0.4838, 0.7881], + [0.3718, 0.4945, 0.9511, 0.0864]]] + ).to(device) + result = model(decoder_input, memory_input) + ref_output = torch.Tensor([[[2.430065, 0.027862, -0.601136, -0.073096], + [2.431935, 0.028907, -0.599809, -0.072488]], + [[2.428457, 0.027053, -0.602275, -0.073462], + [2.431970, 0.029387, -0.599789, -0.071621]], + [[2.431934, 0.028196, -0.599802, -0.073809], + [2.432306, 0.028858, -0.599542, -0.072846]]] + ).to(device) + self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) + torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5) + + # key_padding_mask + key_padding_mask = torch.zeros(2, 3).to(device) == 1 + result = model(decoder_input, + memory_input, + tgt_key_padding_mask=key_padding_mask) + ref_output = torch.Tensor([[[2.430065, 0.027862, -0.601136, -0.073096], + [2.431935, 0.028907, -0.599809, -0.072488]], + [[2.428457, 0.027053, -0.602275, -0.073462], + [2.431970, 0.029387, -0.599789, -0.071621]], + [[2.431934, 0.028196, -0.599802, -0.073809], + [2.432306, 0.028858, -0.599542, -0.072846]]] + ).to(device) + self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) + torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5) + + # key_padding_mask + key_padding_mask[0, 2] = 1 + key_padding_mask[1, 1] = 1 + key_padding_mask[1, 2] = 1 + result = model(decoder_input, + memory_input, + tgt_key_padding_mask=key_padding_mask) + ref_output = torch.Tensor([[[2.430025, 0.027643, -0.601164, -0.073476], + [2.4323, 0.029375, -0.599553, -0.071881]], + [[2.428523, 0.026838, -0.602226, -0.07391], + [2.432634, 0.029842, -0.599318, -0.071253]], + [[2.432278, 0.028152, -0.599555, -0.074139], + [2.432659, 0.029244, -0.599294, -0.072382]]] + ).to(device) + self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) + torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5) + + # memory_key_padding_mask + key_padding_mask = torch.zeros(2, 5).to(device) == 1 + result = model(decoder_input, + memory_input, + memory_key_padding_mask=key_padding_mask) + ref_output = torch.Tensor([[[2.430065, 0.027862, -0.601136, -0.073096], + [2.431935, 0.028907, -0.599809, -0.072488]], + [[2.428457, 0.027053, -0.602275, -0.073462], + [2.431970, 0.029387, -0.599789, -0.071621]], + [[2.431934, 0.028196, -0.599802, -0.073809], + [2.432306, 0.028858, -0.599542, -0.072846]]] + ).to(device) + self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) + torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5) + + # memory_key_padding_mask + key_padding_mask[0, 4] = 1 + key_padding_mask[1, 3] = 1 + key_padding_mask[1, 4] = 1 + result = model(decoder_input, + memory_input, + memory_key_padding_mask=key_padding_mask) + ref_output = torch.Tensor([[[2.429757, 0.027358, -0.601351, -0.073816], + [2.432692, 0.028583, -0.599263, -0.073634]], + [[2.428247, 0.02662, -0.602419, -0.074123], + [2.432657, 0.029055, -0.599293, -0.072732]], + [[2.431515, 0.027687, -0.600096, -0.074459], + [2.433075, 0.028543, -0.598987, -0.073985]]] + ).to(device) + self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) + torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5) + + # multiple layers no norm + model = 
nn.TransformerDecoder(decoder_layer, 2).to(device) + + # deterministic input + decoder_input = torch.Tensor([[[20, 30, 40, 50]]]).to(device) + memory_input = torch.Tensor([[[60, 70, 80, 90]]]).to(device) + result = model(decoder_input, memory_input) + ref_output = torch.Tensor( + [[[2.31316, 0.0950293, -0.671995, 0.102802]]]).to(device) + self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) + torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5) + + # multiple layers no norm + model = nn.TransformerDecoder(decoder_layer, 6).to(device) + + # deterministic input + decoder_input = torch.Tensor([[[0.4517, 0.6793, 0.5313, 0.0034], + [0.2678, 0.3677, 0.4459, 0.7166]], + [[0.8100, 0.3716, 0.4096, 0.1976], + [0.6958, 0.8844, 0.6081, 0.8315]], + [[0.0494, 0.9343, 0.5955, 0.3830], + [0.5404, 0.3464, 0.9378, 0.6200]]] + ).to(device) + memory_input = torch.Tensor([[[0.7462, 0.6653, 0.5679, 0.4891], + [0.5387, 0.1655, 0.3565, 0.0471]], + [[0.8335, 0.2799, 0.5031, 0.2947], + [0.1402, 0.0318, 0.7636, 0.1346]], + [[0.6333, 0.9344, 0.1376, 0.9938], + [0.8924, 0.2872, 0.6692, 0.2944]], + [[0.9897, 0.6915, 0.3154, 0.1733], + [0.8645, 0.3513, 0.3064, 0.0767]], + [[0.8117, 0.2366, 0.4838, 0.7881], + [0.3718, 0.4945, 0.9511, 0.0864]]] + ).to(device) + result = model(decoder_input, memory_input) + ref_output = torch.Tensor( + [[[2.42794, 0.026164, -0.60263, -0.0747591], + [2.43113, 0.0279516, -0.600376, -0.0736896]], + [[2.42794, 0.026164, -0.60263, -0.0747591], + [2.43113, 0.0279516, -0.600376, -0.0736896]], + [[2.42794, 0.026164, -0.60263, -0.0747591], + [2.43113, 0.0279516, -0.600376, -0.0736896]]] + ).to(device) + self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) + torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5) + + # multiple layers with norm + # d_model = 4 + norm = nn.LayerNorm(4) + model = nn.TransformerDecoder(decoder_layer, 2, norm=norm).to(device) + + # deterministic input + decoder_input = torch.Tensor([[[20, 30, 40, 50]]]).to(device) + memory_input = torch.Tensor([[[60, 70, 80, 90]]]).to(device) + result = model(decoder_input, memory_input) + ref_output = torch.Tensor( + [[[1.66166, -0.326986, -1.01466, -0.320017]]]).to(device) + self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) + torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5) + + # multiple layers with norm + model = nn.TransformerDecoder(decoder_layer, 6, norm=norm).to(device) + + # deterministic input + decoder_input = torch.Tensor([[[0.4517, 0.6793, 0.5313, 0.0034], + [0.2678, 0.3677, 0.4459, 0.7166]], + [[0.8100, 0.3716, 0.4096, 0.1976], + [0.6958, 0.8844, 0.6081, 0.8315]], + [[0.0494, 0.9343, 0.5955, 0.3830], + [0.5404, 0.3464, 0.9378, 0.6200]]] + ).to(device) + memory_input = torch.Tensor([[[0.7462, 0.6653, 0.5679, 0.4891], + [0.5387, 0.1655, 0.3565, 0.0471]], + [[0.8335, 0.2799, 0.5031, 0.2947], + [0.1402, 0.0318, 0.7636, 0.1346]], + [[0.6333, 0.9344, 0.1376, 0.9938], + [0.8924, 0.2872, 0.6692, 0.2944]], + [[0.9897, 0.6915, 0.3154, 0.1733], + [0.8645, 0.3513, 0.3064, 0.0767]], + [[0.8117, 0.2366, 0.4838, 0.7881], + [0.3718, 0.4945, 0.9511, 0.0864]]] + ).to(device) + result = model(decoder_input, memory_input) + ref_output = torch.Tensor( + [[[1.69559, -0.357291, -0.894741, -0.443553], + [1.69571, -0.357363, -0.894154, -0.444196]], + [[1.69559, -0.357291, -0.894741, -0.443553], + [1.69571, -0.357363, -0.894154, -0.444196]], + [[1.69559, -0.357291, -0.894741, -0.443553], + [1.69571, -0.357363, -0.894154, -0.444196]]] + ).to(device) + 
self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) + torch.testing.assert_allclose(result, ref_output, rtol=1e-7, atol=1e-5) + + # gelu activation test cases + activation = "gelu" + use_cuda = torch.cuda.is_available() + device = torch.device("cuda" if use_cuda else "cpu") + + decoder_layer = get_a_test_layer(use_cuda=use_cuda, activation=activation) + + model = nn.TransformerDecoder(decoder_layer, 1).to(device) + + # deterministic input + decoder_input = torch.Tensor([[[20, 30, 40, 50]]]).to(device) + memory_input = torch.Tensor([[[60, 70, 80, 90]]]).to(device) + result = model(decoder_input, memory_input) + ref_output = torch.Tensor([[[2.306435, 0.095946, -0.675796, 0.10687]]] + ).to(device) + self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) + torch.testing.assert_allclose(result, ref_output) + + # deterministic input + decoder_input = torch.Tensor([[[9, 10, 11, 12]], + [[11, 12, 13, 14]]]).to(device) + memory_input = torch.Tensor([[[1, 2, 3, 4]]]).to(device) + result = model(decoder_input, memory_input) + ref_output = torch.Tensor( + [[[2.415448, 0.054389, -0.610932, -0.0156613]], + [[2.415448, 0.054389, -0.610932, -0.0156613]]]).to(device) + self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) + torch.testing.assert_allclose(result, ref_output) + + # deterministic input + decoder_input = torch.Tensor([[[1, 2, 3, 4]], + [[5, 6, 7, 8]]]).to(device) + memory_input = torch.Tensor([[[9, 10, 11, 12]], + [[11, 12, 13, 14]]]).to(device) + result = model(decoder_input, memory_input) + ref_output = torch.Tensor( + [[[2.338531, 0.087709, -0.65776, 0.080646]], + [[2.338531, 0.087709, -0.65776, 0.080646]]]).to(device) + self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) + torch.testing.assert_allclose(result, ref_output) + + # deterministic input + decoder_input = torch.Tensor([[[0.4517, 0.6793, 0.5313, 0.0034], + [0.2678, 0.3677, 0.4459, 0.7166]], + [[0.8100, 0.3716, 0.4096, 0.1976], + [0.6958, 0.8844, 0.6081, 0.8315]], + [[0.0494, 0.9343, 0.5955, 0.3830], + [0.5404, 0.3464, 0.9378, 0.6200]]] + ).to(device) + memory_input = torch.Tensor([[[0.7462, 0.6653, 0.5679, 0.4891], + [0.5387, 0.1655, 0.3565, 0.0471]], + [[0.8335, 0.2799, 0.5031, 0.2947], + [0.1402, 0.0318, 0.7636, 0.1346]], + [[0.6333, 0.9344, 0.1376, 0.9938], + [0.8924, 0.2872, 0.6692, 0.2944]], + [[0.9897, 0.6915, 0.3154, 0.1733], + [0.8645, 0.3513, 0.3064, 0.0767]], + [[0.8117, 0.2366, 0.4838, 0.7881], + [0.3718, 0.4945, 0.9511, 0.0864]]] + ).to(device) + result = model(decoder_input, memory_input) + ref_output = torch.Tensor( + [[[2.42049104, 0.03443088, -0.60793706, -0.05436271], + [2.42210631, 0.03546578, -0.60679895, -0.05357488]], + [[2.41907674, 0.0336104, -0.60892977, -0.05490462], + [2.42216881, 0.03586554, -0.6067524, -0.05289126]], + [[2.42205716, 0.03488046, -0.60683681, -0.05460596], + [2.42240309, 0.0354595, -0.60659063, -0.05378816]]]).to(device) + self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) + torch.testing.assert_allclose(result, ref_output) + + @unittest.skipIf(not (TEST_CUDNN and TEST_MULTIGPU), 'CUDNN or multi-gpu not available') def test_cudnn_rnn_dropout_states_device(self): rnn = nn.RNN(10, 20, num_layers=2, dropout=.5) @@ -5880,7 +6398,7 @@ def test_inplace_thnn(self): self.assertEqual(grad_output, grad_output_clone) @unittest.skipIf(not TEST_CUDA, 'CUDA not available') - @repeat_test_for_types(ALL_TENSORTYPES2) + @repeat_test_for_types(get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM)) def test_noncontig_conv_grad_cuda(self, dtype=torch.float): # 
FIXME: remove after adding non-contiguous grad tests for all modules module = nn.Conv2d(3, 5, kernel_size=3, padding=1).to("cuda", dtype) @@ -6523,6 +7041,10 @@ def test_l1_loss_correct(self): torch.nn.L1Loss()(input, torch.zeros_like(input)), input.abs().mean()) + def test_smoothl1loss_negative_beta_not_supported(self): + with self.assertRaises(RuntimeError): + F.smooth_l1_loss(torch.randn(2, 2), torch.randn(2, 2), beta=-1.0) + def test_cosine_similarity(self): input1 = torch.randn(4, 4, requires_grad=True) input2 = torch.randn(4, 4, requires_grad=True) @@ -8628,6 +9150,18 @@ def test_fuse_module_eval_numerics(self, X, running_mean, running_var): self.assertEqual(Y_ref, Y_hat, msg="Conv+BN fusion results are off") + na_bn_ref = torch.nn.BatchNorm2d(oC, affine=False) + na_bn_ref.running_mean = torch.from_numpy(running_mean[0]).to(torch.double) + na_bn_ref.running_var = torch.from_numpy(running_var[0]).to(torch.double) + na_bn_ref.eval() + + Y_ref = na_bn_ref(conv_ref(inputs)) + conv_na_bn_fused = torch.nn.utils.fusion.fuse_conv_bn_eval(conv_ref, + na_bn_ref) + Y_hat = conv_na_bn_fused(inputs) + + self.assertEqual(Y_ref, Y_hat, msg="Conv+BN(non-affine) fusion results are off") + class TestAddRelu(TestCase): def test_add_relu(self): @@ -9853,6 +10387,7 @@ def v(fn): v(lambda: F.multilabel_margin_loss(input, zeros, reduction=reduction)) v(lambda: F.triplet_margin_loss(input, input, input, reduction=reduction)) + v(lambda: F.triplet_margin_with_distance_loss(input, input, input, reduction=reduction)) v(lambda: F.margin_ranking_loss(input, input, input.sign(), reduction=reduction)) v(lambda: F.cosine_embedding_loss(input, input, input[:, 0].sign(), reduction=reduction)) @@ -10683,6 +11218,63 @@ def test_contig_wrong_stride_cudnn(self, device): F.conv_transpose2d(x, torch.randn(16, 1, 1, 1, device=device)) F.conv2d(x, torch.randn(1, 16, 1, 1, device=device)) + @onlyCUDA + def test_Conv2d_size_1_kernel(self, device): + x_cpu = torch.randn(2, 3, 5, 5) + conv_cpu = torch.nn.Conv2d(3, 3, kernel_size=1) + y_cpu = conv_cpu(x_cpu) + y = torch.rand_like(y_cpu) + y_cpu.backward(y) + + with cudnn.flags(enabled=False): + conv_cuda = torch.nn.Conv2d(3, 3, kernel_size=1).to(device) + conv_cuda.bias.data.copy_(conv_cpu.bias.data) + conv_cuda.weight.data.copy_(conv_cpu.weight.data) + y_cuda = conv_cuda(x_cpu.to(device)) + y_cuda.backward(y.to(device)) + + self.assertEqual(y_cpu, y_cuda, atol=1e-5, rtol=0, exact_device=False) + self.assertEqual(conv_cpu.bias.grad.data, conv_cuda.bias.grad.data, atol=1e-5, rtol=0, exact_device=False) + self.assertEqual(conv_cpu.weight.grad.data, conv_cuda.weight.grad.data, atol=1e-5, rtol=0, exact_device=False) + + @onlyCUDA + def test_ConvTranspose2d_size_1_kernel(self, device): + x_cpu = torch.randn(2, 3, 5, 5) + conv_cpu = torch.nn.ConvTranspose2d(3, 3, kernel_size=1) + y_cpu = conv_cpu(x_cpu) + y = torch.rand_like(y_cpu) + y_cpu.backward(y) + + with cudnn.flags(enabled=False): + conv_cuda = torch.nn.ConvTranspose2d(3, 3, kernel_size=1).to(device) + conv_cuda.bias.data.copy_(conv_cpu.bias.data) + conv_cuda.weight.data.copy_(conv_cpu.weight.data) + y_cuda = conv_cuda(x_cpu.to(device)) + y_cuda.backward(y.to(device)) + + self.assertEqual(y_cpu, y_cuda, atol=1e-5, rtol=0, exact_device=False) + self.assertEqual(conv_cpu.bias.grad.data, conv_cuda.bias.grad.data, atol=1e-5, rtol=0, exact_device=False) + self.assertEqual(conv_cpu.weight.grad.data, conv_cuda.weight.grad.data, atol=1e-5, rtol=0, exact_device=False) + + @onlyCUDA + def test_ConvTranspose3d_size_1_kernel(self, device): 
+ x_cpu = torch.randn(2, 3, 3, 5, 5) + conv_cpu = torch.nn.ConvTranspose3d(3, 3, kernel_size=1) + y_cpu = conv_cpu(x_cpu) + y = torch.rand_like(y_cpu) + y_cpu.backward(y) + + with cudnn.flags(enabled=False): + conv_cuda = torch.nn.ConvTranspose3d(3, 3, kernel_size=1).to(device) + conv_cuda.bias.data.copy_(conv_cpu.bias.data) + conv_cuda.weight.data.copy_(conv_cpu.weight.data) + y_cuda = conv_cuda(x_cpu.to(device)) + y_cuda.backward(y.to(device)) + + self.assertEqual(y_cpu, y_cuda, atol=1e-5, rtol=0, exact_device=False) + self.assertEqual(conv_cpu.bias.grad.data, conv_cuda.bias.grad.data, atol=1e-5, rtol=0, exact_device=False) + self.assertEqual(conv_cpu.weight.grad.data, conv_cuda.weight.grad.data, atol=1e-5, rtol=0, exact_device=False) + def _ordered_sequence(self, device, dtype): """Create ordered list of random sequences""" seqs = [torch.empty(random.randint(1, 6), device=device, dtype=dtype) @@ -11287,7 +11879,7 @@ def test_multihead_attention_dtype(self, device, dtype): self.assertEqual(q.size(), out[0].size()) self.assertEqual(dtype, out[0].dtype) - @dtypesIfCUDA(*ALL_TENSORTYPES2) + @dtypesIfCUDA(*get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM)) @dtypes(torch.float) def test_Conv2d_naive_groups(self, device, dtype): # Check that grouped convolutions matches two half convolutions @@ -11537,32 +12129,32 @@ def expected_output(dim): self.assertEqual(output[0, 0, 0, 0], float("-inf")) self.assertEqual(indices[0, 0, 0, 0], 0) - @dtypesIfCUDA(*ALL_TENSORTYPES2) + @dtypesIfCUDA(*get_all_fp_dtypes()) @dtypes(torch.float) def test_MaxPool1d_indices(self, device, dtype): self._test_maxpool_indices(1, device=device, dtype=dtype) - @dtypesIfCUDA(*ALL_TENSORTYPES2) + @dtypesIfCUDA(*get_all_fp_dtypes()) @dtypes(torch.float) def test_MaxPool2d_indices(self, device, dtype): self._test_maxpool_indices(2, device=device, dtype=dtype) - @dtypesIfCUDA(*ALL_TENSORTYPES2) + @dtypesIfCUDA(*get_all_fp_dtypes()) @dtypes(torch.float) def test_MaxPool3d_indices(self, device, dtype): self._test_maxpool_indices(3, device=device, dtype=dtype) - @dtypesIfCUDA(*ALL_TENSORTYPES2) + @dtypesIfCUDA(*get_all_fp_dtypes()) @dtypes(torch.float) def test_AdaptiveMaxPool1d_indices(self, device, dtype): self._test_maxpool_indices(1, adaptive=True, device=device, dtype=dtype) - @dtypesIfCUDA(*ALL_TENSORTYPES2) + @dtypesIfCUDA(*get_all_fp_dtypes()) @dtypes(torch.float) def test_AdaptiveMaxPool2d_indices(self, device, dtype): self._test_maxpool_indices(2, adaptive=True, device=device, dtype=dtype) - @dtypesIfCUDA(*ALL_TENSORTYPES2) + @dtypesIfCUDA(*get_all_fp_dtypes()) @dtypes(torch.float) def test_AdaptiveMaxPool3d_indices(self, device, dtype): self._test_maxpool_indices(3, adaptive=True, device=device, dtype=dtype) @@ -11635,7 +12227,7 @@ def test_pooling_zero_stride(self, device): self.assertRaisesRegex(RuntimeError, r"stride should not be zero|stride must be greater than zero", lambda: fn_module(x)) - @dtypesIfCUDA(*ALL_TENSORTYPES2) + @dtypesIfCUDA(*get_all_fp_dtypes()) @dtypes(torch.float) def test_pool_large_size(self, device, dtype): for op in ('max', 'avg'): @@ -11649,7 +12241,7 @@ def test_pool_large_size(self, device, dtype): # check if the output shape was still computed correctly self.assertEqual(x.shape[2], res.shape[2]) - @dtypesIfCUDA(*ALL_TENSORTYPES2) + @dtypesIfCUDA(*get_all_fp_dtypes()) @dtypes(torch.float) def test_pool_invalid_size(self, device, dtype): for op in ('max', 'avg'): @@ -11945,6 +12537,7 @@ def _test_conv_cudnn_nhwc_nchw(self, layer, n, c, h, w, k, filter_size, device): @onlyCUDA 
@skipCUDAIfRocm @skipCUDAIfCudnnVersionLessThan(7603) + @tf32_on_and_off(0.05) def test_conv_cudnn_mismatch_memory_format(self, device): configs = [ [4, 2, 8, 8, 4, 2], @@ -12171,6 +12764,85 @@ def test_threshold_inplace_overlap(self, device): F.threshold(x, 0.5, 0.5, inplace=True) F.threshold_(x, 0.5, 0.5) + @onlyOnCPUAndCUDA + def test_triplet_margin_with_distance_loss_default_parity(self, device): + # Test for `nn.TripletMarginWithDistanceLoss` and + # `F.triplet_margin_with_distance_loss`. Checks + # for parity against the respective non-distance-agnostic + # implementations of triplet margin loss (``nn.TripletMarginLoss` + # and `F.triplet_margin_loss`) under *default args*. + + for extra_args in \ + itertools.product((0.5, 1, 1.5), (True, False), ('none', 'mean', 'sum')): + kwargs = {'margin': extra_args[0], 'swap': extra_args[1], 'reduction': extra_args[2]} + + anchor = torch.randn(5, 10, device=device, requires_grad=True) + positive = torch.randn(5, 10, device=device, requires_grad=True) + negative = torch.randn(5, 10, device=device, requires_grad=True) + + # Test forward, functional + expected = F.triplet_margin_loss(anchor, positive, negative, **kwargs) + actual = F.triplet_margin_with_distance_loss(anchor, positive, negative, **kwargs) + self.assertEqual(actual, expected, rtol=1e-6, atol=1e-6) + + # Test forward, module + loss_ref = nn.TripletMarginLoss(**kwargs) + loss_op = nn.TripletMarginWithDistanceLoss(**kwargs) + self.assertEqual(loss_op(anchor, positive, negative), + loss_ref(anchor, positive, negative), + rtol=1e-6, atol=1e-6) + + # Test backward + self.assertTrue(gradcheck(lambda a, p, n: F.triplet_margin_with_distance_loss( + a, p, n, **kwargs), (anchor, positive, negative))) + self.assertTrue(gradcheck(lambda a, p, n: loss_op(a, p, n), + (anchor, positive, negative))) + + @onlyOnCPUAndCUDA + def test_triplet_margin_with_distance_loss(self, device): + # Test for parity between `nn.TripletMarginWithDistanceLoss` and + # `F.triplet_margin_with_distance_loss`. 
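        # Illustrative sketch (an assumption, not part of the test): both APIs compute the
        # standard triplet margin loss, loss_i = max(d(a_i, p_i) - d(a_i, n_i) + margin, 0),
        # where swap=True substitutes min(d(a_i, n_i), d(p_i, n_i)) for the negative distance.
        # `reference_triplet_loss` is a hypothetical helper using the default p=2 distance
        # and 'mean' reduction.
        def reference_triplet_loss(anchor, positive, negative, margin=1.0):
            d = nn.PairwiseDistance()
            return torch.clamp(d(anchor, positive) - d(anchor, negative) + margin, min=0).mean()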
+ + pairwise_distance = nn.PairwiseDistance() + + def cosine_distance(x, y): + return 1.0 - F.cosine_similarity(x, y) + + distance_functions = (pairwise_distance, cosine_distance, + lambda x, y: 1.0 - F.cosine_similarity(x, y)) + + reductions = ('mean', 'none', 'sum') + margins = (1.0, 1.5, 0.5) + swaps = (True, False) + + for distance_fn, reduction, margin, swap \ + in itertools.product(distance_functions, reductions, margins, swaps): + anchor = torch.randn(5, 10, device=device, requires_grad=True) + positive = torch.randn(5, 10, device=device, requires_grad=True) + negative = torch.randn(5, 10, device=device, requires_grad=True) + + # Test backward + self.assertTrue(gradcheck(lambda a, p, n: F.triplet_margin_with_distance_loss( + a, p, n, distance_function=distance_fn, reduction=reduction, margin=margin, swap=swap), + (anchor, positive, negative))) + loss_op = nn.TripletMarginWithDistanceLoss(distance_function=distance_fn, + reduction=reduction, margin=margin, swap=swap) + self.assertTrue(gradcheck(lambda a, p, n: loss_op( + a, p, n), (anchor, positive, negative))) + traced_loss_op = torch.jit.trace(loss_op, (anchor, positive, negative)) + self.assertTrue(gradcheck(lambda a, p, n: traced_loss_op( + a, p, n), (anchor, positive, negative))) + + # Test forward parity + functional = F.triplet_margin_with_distance_loss(anchor, positive, negative, + distance_function=distance_fn, + reduction=reduction, margin=margin, swap=swap) + modular = loss_op(anchor, positive, negative) + traced = traced_loss_op(anchor, positive, negative) + self.assertEqual(functional, modular, atol=1e-6, rtol=1e-6) + self.assertEqual(traced, modular, atol=1e-6, rtol=1e-6) + + class TestModuleGlobalHooks(TestCase): def tearDown(self): diff --git a/test/test_op_aliases.py b/test/test_op_aliases.py index 7ad691328c4b..8a106d7860d1 100644 --- a/test/test_op_aliases.py +++ b/test/test_op_aliases.py @@ -45,7 +45,7 @@ def __init__(self, decorators=(skipCPUIfNoLapack, skipCUDAIfNoMagma)), # NOTE: only runs on CPU because it leaks CUDA memory # (see https://github.com/pytorch/pytorch/issues/43119) - AliasInfo('outer', torch.outer, 'ger', torch.ger, + AliasInfo('ger', torch.ger, 'outer', torch.outer, lambda d: torch.randn(20, device=d), get_args=lambda d: (torch.randn(20, device=d),), decorators=(onlyCPU,)), AliasInfo('arccosh', torch.arccosh, 'acosh', torch.acosh, diff --git a/test/test_ops.py b/test/test_ops.py index 28570d9892ab..5be450d4d41f 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -131,8 +131,31 @@ def test_inplace_gradgrad(self, device, dtype, op): self._gradgrad_test_helper(device, dtype, op, self._get_safe_inplace(op.get_inplace())) +class TestOut(TestCase): + exact_dtype = True + + @ops(op_db) + def test_out(self, device, dtype, op): + if not op.supports_tensor_out: + self.skipTest("Skipped! Operator %s does not support out=..." % op.name) + samples = op.sample_inputs(device, dtype) + if len(samples) == 0: + self.skipTest("Skipped! No sample inputs!") + + # NOTE: only tests on first sample + sample = samples[0] + # call it normally to get the expected result + expected = op(sample.input, *sample.args, **sample.kwargs) + # call it with out=... 
and check we get the expected result + out_kwargs = sample.kwargs.copy() + out_kwargs['out'] = out = torch.empty_like(expected) + op(sample.input, *sample.args, **out_kwargs) + self.assertEqual(expected, out) + + instantiate_device_type_tests(TestOpInfo, globals()) instantiate_device_type_tests(TestGradients, globals()) +instantiate_device_type_tests(TestOut, globals()) if __name__ == '__main__': run_tests() diff --git a/test/test_optim.py b/test/test_optim.py index b00184cc9343..3e3e6610fa01 100644 --- a/test/test_optim.py +++ b/test/test_optim.py @@ -6,6 +6,7 @@ import torch from torch._six import inf import torch.optim as optim +import torch.optim._multi_tensor as optim_mt import torch.nn.functional as F from torch.optim import SGD from torch.autograd import Variable @@ -249,105 +250,199 @@ def _build_params_dict_single(self, weight, bias, **kwargs): return [dict(params=bias, **kwargs)] def test_sgd(self): - self._test_basic_cases( - lambda weight, bias: optim.SGD([weight, bias], lr=1e-3) - ) - self._test_basic_cases( - lambda weight, bias: optim.SGD( - self._build_params_dict(weight, bias, lr=1e-2), - lr=1e-3) - ) - self._test_basic_cases( - lambda weight, bias: optim.SGD( - self._build_params_dict_single(weight, bias, lr=1e-2), - lr=1e-3) - ) - self._test_basic_cases( - lambda weight, bias: optim.SGD( - self._build_params_dict_single(weight, bias, lr=1e-2)) - ) - self._test_basic_cases( - lambda weight, bias: optim.SGD([weight, bias], lr=1e-3), - [lambda opt: StepLR(opt, gamma=0.9, step_size=10)] - ) - self._test_basic_cases( - lambda weight, bias: optim.SGD([weight, bias], lr=1e-3), - [lambda opt: StepLR(opt, gamma=0.9, step_size=10), - lambda opt: ReduceLROnPlateau(opt)] - ) - self._test_basic_cases( - lambda weight, bias: optim.SGD([weight, bias], lr=1e-3), - [lambda opt: StepLR(opt, gamma=0.99, step_size=10), - lambda opt: ExponentialLR(opt, gamma=0.99), - lambda opt: ReduceLROnPlateau(opt)] - ) - with self.assertRaisesRegex(ValueError, "Invalid momentum value: -0.5"): - optim.SGD(None, lr=1e-2, momentum=-0.5) + for optimizer in [optim.SGD, optim_mt.SGD]: + self._test_basic_cases( + lambda weight, bias: optimizer([weight, bias], lr=1e-3) + ) + self._test_basic_cases( + lambda weight, bias: optimizer( + self._build_params_dict(weight, bias, lr=1e-2), + lr=1e-3) + ) + self._test_basic_cases( + lambda weight, bias: optimizer( + self._build_params_dict_single(weight, bias, lr=1e-2), + lr=1e-3) + ) + self._test_basic_cases( + lambda weight, bias: optimizer( + self._build_params_dict_single(weight, bias, lr=1e-2)) + ) + self._test_basic_cases( + lambda weight, bias: optimizer([weight, bias], lr=1e-3), + [lambda opt: StepLR(opt, gamma=0.9, step_size=10)] + ) + self._test_basic_cases( + lambda weight, bias: optimizer([weight, bias], lr=1e-3), + [lambda opt: StepLR(opt, gamma=0.9, step_size=10), + lambda opt: ReduceLROnPlateau(opt)] + ) + self._test_basic_cases( + lambda weight, bias: optimizer([weight, bias], lr=1e-3), + [lambda opt: StepLR(opt, gamma=0.99, step_size=10), + lambda opt: ExponentialLR(opt, gamma=0.99), + lambda opt: ReduceLROnPlateau(opt)] + ) + self._test_basic_cases( + lambda weight, bias: optimizer([weight, bias], lr=1e-3, momentum=1) + ) + self._test_basic_cases( + lambda weight, bias: optimizer([weight, bias], lr=1e-3, momentum=1, weight_decay=1) + ) + self._test_basic_cases( + lambda weight, bias: optimizer([weight, bias], nesterov=True, lr=1e-3, momentum=1, weight_decay=1) + ) + with self.assertRaisesRegex(ValueError, "Invalid momentum value: -0.5"): + 
optimizer(None, lr=1e-2, momentum=-0.5) def test_sgd_sparse(self): - self._test_rosenbrock_sparse( - lambda params: optim.SGD(params, lr=5e-3) - ) - self._test_rosenbrock_sparse( - lambda params: optim.SGD(params, lr=0.005), - [lambda opt: StepLR(opt, gamma=0.99999, step_size=300)] - ) + for optimizer in [optim.SGD, optim_mt.SGD]: + self._test_rosenbrock_sparse( + lambda params: optimizer(params, lr=5e-3) + ) + self._test_rosenbrock_sparse( + lambda params: optimizer(params, lr=0.005), + [lambda opt: StepLR(opt, gamma=0.99999, step_size=300)] + ) - def test_adam(self): - self._test_basic_cases( - lambda weight, bias: optim.Adam([weight, bias], lr=1e-3) - ) - self._test_basic_cases( - lambda weight, bias: optim.Adam( - self._build_params_dict(weight, bias, lr=1e-2), - lr=1e-3) - ) - self._test_basic_cases( - lambda weight, bias: optim.Adam([weight, bias], lr=1e-3, - amsgrad=True) - ) - self._test_basic_cases( - lambda weight, bias: optim.Adam( - self._build_params_dict(weight, bias, lr=1e-2), - lr=1e-3, amsgrad=True) - ) - self._test_basic_cases( - lambda weight, bias: optim.Adam( - self._build_params_dict(weight, bias, lr=1e-2), - lr=1e-3), - [lambda opt: ExponentialLR(opt, gamma=0.9)] - ) - self._test_basic_cases( - lambda weight, bias: optim.Adam([weight, bias], lr=1e-3, - amsgrad=True), - [lambda opt: ExponentialLR(opt, gamma=0.9), - lambda opt: ReduceLROnPlateau(opt)] - ) - self._test_basic_cases( - lambda weight, bias: optim.Adam( - self._build_params_dict(weight, bias, lr=1e-2), - lr=1e-3, amsgrad=True), - [lambda opt: StepLR(opt, gamma=0.9, step_size=10), - lambda opt: ReduceLROnPlateau(opt)] - ) - with self.assertRaisesRegex(ValueError, "Invalid beta parameter at index 0: 1.0"): - optim.Adam(None, lr=1e-2, betas=(1.0, 0.0)) + @skipIfRocm + def test_multi_tensor_optimizers(self): + if not torch.cuda.is_available(): + return - with self.assertRaisesRegex(ValueError, "Invalid weight_decay value: -1"): - optim.Adam(None, lr=1e-2, weight_decay=-1) + optimizer_pairs_with_flags = [ + ((optim.Adam, optim._multi_tensor.Adam), dict(weight_decay=1., amsgrad=True)), + ((optim.Adam, optim._multi_tensor.Adam), dict(weight_decay=1., amsgrad=False)), + ((optim.Adam, optim._multi_tensor.Adam), dict(weight_decay=0., amsgrad=True)), + ((optim.Adam, optim._multi_tensor.Adam), dict(weight_decay=0., amsgrad=False)), + ((optim.AdamW, optim._multi_tensor.AdamW), dict(weight_decay=1., amsgrad=True)), + ((optim.AdamW, optim._multi_tensor.AdamW), dict(weight_decay=1., amsgrad=False)), + ((optim.AdamW, optim._multi_tensor.AdamW), dict(weight_decay=0., amsgrad=True)), + ((optim.AdamW, optim._multi_tensor.AdamW), dict(weight_decay=0., amsgrad=False)), + ((optim.SGD, optim._multi_tensor.SGD), dict(lr=0.2, momentum=1, dampening=0, weight_decay=1, nesterov=True)), + ((optim.SGD, optim._multi_tensor.SGD), dict(lr=0.2, momentum=1, dampening=0.5, weight_decay=1, nesterov=False)), + ((optim.RMSprop, optim._multi_tensor.RMSprop), dict(weight_decay=1, momentum=1, centered=True)), + ((optim.RMSprop, optim._multi_tensor.RMSprop), dict(weight_decay=1, momentum=0, centered=True)), + ((optim.RMSprop, optim._multi_tensor.RMSprop), dict(weight_decay=1, momentum=1, centered=False)), + ((optim.RMSprop, optim._multi_tensor.RMSprop), dict(weight_decay=0, momentum=1, centered=False)), + ((optim.Rprop, optim._multi_tensor.Rprop), dict(lr=1e-2, etas=(0.5, 1.2), step_sizes=(1e-6, 50))), + ((optim.ASGD, optim._multi_tensor.ASGD), dict(weight_decay=0)), + ((optim.ASGD, optim._multi_tensor.ASGD), dict(weight_decay=1)), + ((optim.Adamax, 
optim._multi_tensor.Adamax), dict(weight_decay=0)), + ((optim.Adamax, optim._multi_tensor.Adamax), dict(weight_decay=1)), + ((optim.Adadelta, optim._multi_tensor.Adadelta), dict(weight_decay=0)), + ((optim.Adadelta, optim._multi_tensor.Adadelta), dict(weight_decay=1)), + ] + + kIterations = 1001 + device = 'cuda' + + for optimizers, params in optimizer_pairs_with_flags: + res = [] + for opt in optimizers: + weight = torch.tensor([[-0.2109, -0.4976], [-0.1413, -0.3420], [-0.2524, 0.6976]], + dtype=torch.float64, device=device, requires_grad=True) + bias = torch.tensor([-0.1085, -0.2979, 0.6892], dtype=torch.float64, device=device, requires_grad=True) + weight2 = torch.tensor([[-0.0508, -0.3941, -0.2843]], + dtype=torch.float64, device=device, requires_grad=True) + bias2 = torch.tensor([-0.0711], dtype=torch.float64, device=device, requires_grad=True) + input = torch.tensor([0.1, 0.2, 0.3, 0.4, 0.5, 0.6], dtype=torch.float64, device=device).reshape(3, 2) + + model = torch.nn.Sequential(torch.nn.Linear(2, 3), + torch.nn.Sigmoid(), + torch.nn.Linear(3, 1), + torch.nn.Sigmoid()) + model.to(torch.float64).to(device) + + pretrained_dict = model.state_dict() + pretrained_dict['0.weight'] = weight + pretrained_dict['0.bias'] = bias + pretrained_dict['2.weight'] = weight2 + pretrained_dict['2.bias'] = bias2 + model.load_state_dict(pretrained_dict) + + optimizer = opt(model.parameters(), **params) + + for _ in range(kIterations): + optimizer.zero_grad() + output = model(input) + loss = output.sum() + loss.backward() + + if iter == 0: + model.parameters().__next__().grad = None + + optimizer.step() + + res.append(model.parameters()) + + for p1, p2 in zip(res[0], res[1]): + self.assertEqual(p1, p2) - def test_adamw(self): - self._test_basic_cases( - lambda weight, bias: optim.AdamW([weight, bias], lr=1e-3) - ) - self._test_basic_cases( - lambda weight, bias: optim.AdamW( - self._build_params_dict(weight, bias, lr=1e-2), - lr=1e-3) - ) - with self.assertRaisesRegex(ValueError, "Invalid weight_decay value: -1"): - optim.AdamW(None, lr=1e-2, weight_decay=-1) + def test_adam(self): + for optimizer in [optim.Adam, optim_mt.Adam]: + self._test_basic_cases( + lambda weight, bias: optimizer([weight, bias], lr=1e-3) + ) + self._test_basic_cases( + lambda weight, bias: optimizer( + self._build_params_dict(weight, bias, lr=1e-2), + lr=1e-3) + ) + self._test_basic_cases( + lambda weight, bias: optimizer([weight, bias], lr=1e-3, amsgrad=True) + ) + self._test_basic_cases( + lambda weight, bias: optimizer([weight, bias], lr=1e-3, weight_decay=0.1) + ) + self._test_basic_cases( + lambda weight, bias: optimizer( + self._build_params_dict(weight, bias, lr=1e-2), + lr=1e-3, amsgrad=True) + ) + self._test_basic_cases( + lambda weight, bias: optimizer( + self._build_params_dict(weight, bias, lr=1e-2), + lr=1e-3), + [lambda opt: ExponentialLR(opt, gamma=0.9)] + ) + self._test_basic_cases( + lambda weight, bias: optimizer([weight, bias], lr=1e-3, amsgrad=True), + [lambda opt: ExponentialLR(opt, gamma=0.9), + lambda opt: ReduceLROnPlateau(opt)] + ) + self._test_basic_cases( + lambda weight, bias: optimizer( + self._build_params_dict(weight, bias, lr=1e-2), + lr=1e-3, amsgrad=True), + [lambda opt: StepLR(opt, gamma=0.9, step_size=10), + lambda opt: ReduceLROnPlateau(opt)] + ) + with self.assertRaisesRegex(ValueError, "Invalid beta parameter at index 0: 1.0"): + optimizer(None, lr=1e-2, betas=(1.0, 0.0)) + + with self.assertRaisesRegex(ValueError, "Invalid weight_decay value: -1"): + optimizer(None, lr=1e-2, weight_decay=-1) + 
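For context, a minimal sketch of the numeric parity idea behind these paired single-tensor / _multi_tensor optimizer tests; the tensor shape, learning rate, and iteration count below are arbitrary illustrations, not values taken from the test:

    params_ref = [torch.randn(3, 2, dtype=torch.float64, requires_grad=True)]
    params_mt = [p.clone().detach().requires_grad_(True) for p in params_ref]
    opt_ref = optim.Adam(params_ref, lr=1e-3)
    opt_mt = optim_mt.Adam(params_mt, lr=1e-3)
    for _ in range(10):
        for ps, opt in ((params_ref, opt_ref), (params_mt, opt_mt)):
            opt.zero_grad()
            loss = sum((p * p).sum() for p in ps)
            loss.backward()
            opt.step()
    # After identical updates, the two implementations are expected to match numerically.
    assert torch.allclose(params_ref[0], params_mt[0])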
+ def test_adamw(self): + for optimizer in [optim.AdamW, optim_mt.AdamW]: + self._test_basic_cases( + lambda weight, bias: optimizer([weight, bias], lr=1e-3) + ) + self._test_basic_cases( + lambda weight, bias: optimizer( + self._build_params_dict(weight, bias, lr=1e-2), + lr=1e-3) + ) + self._test_basic_cases( + lambda weight, bias: optimizer([weight, bias], lr=1e-3, weight_decay=1) + ) + self._test_basic_cases( + lambda weight, bias: optimizer([weight, bias], lr=1e-3, weight_decay=1, amsgrad=True) + ) + with self.assertRaisesRegex(ValueError, "Invalid weight_decay value: -1"): + optimizer(None, lr=1e-2, weight_decay=-1) def test_sparse_adam(self): self._test_rosenbrock_sparse( @@ -365,21 +460,25 @@ def test_sparse_adam(self): # ROCm precision is too low to pass this test @skipIfRocm def test_adadelta(self): - self._test_basic_cases( - lambda weight, bias: optim.Adadelta([weight, bias]) - ) - self._test_basic_cases( - lambda weight, bias: optim.Adadelta( - self._build_params_dict(weight, bias, rho=0.95)) - ) - self._test_basic_cases( - lambda weight, bias: optim.Adadelta( - self._build_params_dict(weight, bias, rho=0.95)), - [lambda opt: StepLR(opt, gamma=0.9, step_size=10), - lambda opt: ReduceLROnPlateau(opt)] - ) - with self.assertRaisesRegex(ValueError, "Invalid rho value: 1.1"): - optim.Adadelta(None, lr=1e-2, rho=1.1) + for optimizer in [optim.Adadelta, optim_mt.Adadelta]: + self._test_basic_cases( + lambda weight, bias: optimizer([weight, bias]) + ) + self._test_basic_cases( + lambda weight, bias: optimizer( + self._build_params_dict(weight, bias, rho=0.95)) + ) + self._test_basic_cases( + lambda weight, bias: optimizer( + self._build_params_dict(weight, bias, rho=0.95)), + [lambda opt: StepLR(opt, gamma=0.9, step_size=10), + lambda opt: ReduceLROnPlateau(opt)] + ) + self._test_basic_cases( + lambda weight, bias: optimizer([weight, bias], weight_decay=1) + ) + with self.assertRaisesRegex(ValueError, "Invalid rho value: 1.1"): + optimizer(None, lr=1e-2, rho=1.1) def test_adagrad(self): self._test_basic_cases( @@ -421,52 +520,84 @@ def test_adagrad_sparse(self): ) def test_adamax(self): - self._test_basic_cases( - lambda weight, bias: optim.Adamax([weight, bias], lr=1e-1) - ) - self._test_basic_cases( - lambda weight, bias: optim.Adamax( - self._build_params_dict(weight, bias, lr=1e-2), - lr=1e-1) - ) - with self.assertRaisesRegex(ValueError, "Invalid beta parameter at index 1: 1.0"): - optim.Adamax(None, lr=1e-2, betas=(0.0, 1.0)) + for optimizer in [optim.Adamax, optim_mt.Adamax]: + self._test_basic_cases( + lambda weight, bias: optimizer([weight, bias], lr=1e-1) + ) + self._test_basic_cases( + lambda weight, bias: optimizer( + self._build_params_dict(weight, bias, lr=1e-2), + lr=1e-1) + ) + self._test_basic_cases( + lambda weight, bias: optimizer([weight, bias], lr=1e-1, weight_decay=1) + ) + with self.assertRaisesRegex(ValueError, "Invalid beta parameter at index 1: 1.0"): + optimizer(None, lr=1e-2, betas=(0.0, 1.0)) def test_rmsprop(self): - self._test_basic_cases( - lambda weight, bias: optim.RMSprop([weight, bias], lr=1e-2) - ) - self._test_basic_cases( - lambda weight, bias: optim.RMSprop( - self._build_params_dict(weight, bias, lr=1e-3), - lr=1e-2) - ) - with self.assertRaisesRegex(ValueError, "Invalid momentum value: -1.0"): - optim.RMSprop(None, lr=1e-2, momentum=-1.0) + for optimizer in [optim.RMSprop, optim_mt.RMSprop]: + self._test_basic_cases( + lambda weight, bias: optimizer([weight, bias], lr=1e-2) + ) + self._test_basic_cases( + lambda weight, bias: optimizer( + 
self._build_params_dict(weight, bias, lr=1e-3), + lr=1e-2) + ) + self._test_basic_cases( + lambda weight, bias: optimizer( + self._build_params_dict(weight, bias, lr=1e-3), + lr=1e-2, centered=True) + ) + self._test_basic_cases( + lambda weight, bias: optimizer( + self._build_params_dict(weight, bias, lr=1e-3), + lr=1e-2, centered=True, momentum=0.1) + ) + self._test_basic_cases( + lambda weight, bias: optimizer( + self._build_params_dict(weight, bias, lr=1e-3), + lr=1e-2, momentum=0.1) + ) + self._test_basic_cases( + lambda weight, bias: optimizer( + self._build_params_dict(weight, bias, lr=1e-3), + lr=1e-2, momentum=0.1, weight_decay=1) + ) + with self.assertRaisesRegex(ValueError, "Invalid momentum value: -1.0"): + optimizer(None, lr=1e-2, momentum=-1.0) def test_asgd(self): - self._test_basic_cases( - lambda weight, bias: optim.ASGD([weight, bias], lr=1e-3, t0=100) - ) - self._test_basic_cases( - lambda weight, bias: optim.ASGD( - self._build_params_dict(weight, bias, lr=1e-2), - lr=1e-3, t0=100) - ) - with self.assertRaisesRegex(ValueError, "Invalid weight_decay value: -0.5"): - optim.ASGD(None, lr=1e-2, weight_decay=-0.5) + for optimizer in [optim.ASGD, optim_mt.ASGD]: + self._test_basic_cases( + lambda weight, bias: optimizer([weight, bias], lr=1e-3, t0=100) + ) + self._test_basic_cases( + lambda weight, bias: optimizer( + self._build_params_dict(weight, bias, lr=1e-2), + lr=1e-3, t0=100) + ) + self._test_basic_cases( + lambda weight, bias: optimizer( + self._build_params_dict(weight, bias, lr=1e-3), + lr=1e-2, weight_decay=1) + ) + with self.assertRaisesRegex(ValueError, "Invalid weight_decay value: -0.5"): + optimizer(None, lr=1e-2, weight_decay=-0.5) def test_rprop(self): - self._test_basic_cases( - lambda weight, bias: optim.Rprop([weight, bias], lr=1e-3) - ) - self._test_basic_cases( - lambda weight, bias: optim.Rprop( - self._build_params_dict(weight, bias, lr=1e-2), - lr=1e-3) - ) - with self.assertRaisesRegex(ValueError, "Invalid eta values: 1.0, 0.5"): - optim.Rprop(None, lr=1e-2, etas=(1.0, 0.5)) + for optimizer in [optim.Rprop, optim_mt.Rprop]: + self._test_basic_cases( + lambda weight, bias: optimizer([weight, bias], lr=1e-3) + ) + self._test_basic_cases( + lambda weight, bias: optimizer( + self._build_params_dict(weight, bias, lr=1e-2), + lr=1e-3) + ) + with self.assertRaisesRegex(ValueError, "Invalid eta values: 1.0, 0.5"): + optimizer(None, lr=1e-2, etas=(1.0, 0.5)) def test_lbfgs(self): self._test_basic_cases( diff --git a/test/test_package.py b/test/test_package.py new file mode 100644 index 000000000000..37d7b0f385a2 --- /dev/null +++ b/test/test_package.py @@ -0,0 +1,315 @@ +from unittest import main, skipIf +from torch.testing._internal.common_utils import TestCase, IS_WINDOWS +from tempfile import NamedTemporaryFile +from torch.package import PackageExporter, PackageImporter +from pathlib import Path +from tempfile import TemporaryDirectory +import torch +from sys import version_info +from io import StringIO + +try: + from torchvision.models import resnet18 + HAS_TORCHVISION = True +except ImportError: + HAS_TORCHVISION = False +skipIfNoTorchVision = skipIf(not HAS_TORCHVISION, "no torchvision") + + + +packaging_directory = Path(__file__).parent + +class PackagingTest(TestCase): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._temporary_files = [] + + def temp(self): + t = NamedTemporaryFile() + name = t.name + if IS_WINDOWS: + t.close() # can't read an open file in windows + else: + self._temporary_files.append(t) + return name 
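For orientation, a minimal sketch of the export/import round trip this new test module exercises; the archive path is a hypothetical placeholder, and the calls mirror those used in the tests below:

    with PackageExporter('/tmp/example_package.zip', verbose=False) as exporter:
        exporter.save_source_string('my_mod', 'result = "hello"\n')
    importer = PackageImporter('/tmp/example_package.zip')
    assert importer.import_module('my_mod').result == 'hello'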
+ + def tearDown(self): + for t in self._temporary_files: + t.close() + self._temporary_files = [] + + def test_saving_source(self): + filename = self.temp() + with PackageExporter(filename, verbose=False) as he: + he.save_source_file('foo', str(packaging_directory / 'module_a.py')) + he.save_source_file('foodir', str(packaging_directory / 'package_a')) + hi = PackageImporter(filename) + foo = hi.import_module('foo') + s = hi.import_module('foodir.subpackage') + self.assertEqual(foo.result, 'module_a') + self.assertEqual(s.result, 'package_a.subpackage') + + def test_saving_string(self): + filename = self.temp() + with PackageExporter(filename, verbose=False) as he: + src = """\ +import math +the_math = math +""" + he.save_source_string('my_mod', src) + hi = PackageImporter(filename) + m = hi.import_module('math') + import math + self.assertIs(m, math) + my_mod = hi.import_module('my_mod') + self.assertIs(my_mod.math, math) + + def test_save_module(self): + filename = self.temp() + with PackageExporter(filename, verbose=False) as he: + import module_a + import package_a + he.save_module(module_a.__name__) + he.save_module(package_a.__name__) + hi = PackageImporter(filename) + module_a_i = hi.import_module('module_a') + self.assertEqual(module_a_i.result, 'module_a') + self.assertIsNot(module_a, module_a_i) + package_a_i = hi.import_module('package_a') + self.assertEqual(package_a_i.result, 'package_a') + self.assertIsNot(package_a_i, package_a) + + def test_pickle(self): + import package_a.subpackage + obj = package_a.subpackage.PackageASubpackageObject() + obj2 = package_a.PackageAObject(obj) + + filename = self.temp() + with PackageExporter(filename, verbose=False) as he: + he.save_pickle('obj', 'obj.pkl', obj2) + hi = PackageImporter(filename) + + # check we got dependencies + sp = hi.import_module('package_a.subpackage') + # check we didn't get other stuff + with self.assertRaises(ImportError): + hi.import_module('module_a') + + obj_loaded = hi.load_pickle('obj', 'obj.pkl') + self.assertIsNot(obj2, obj_loaded) + self.assertIsInstance(obj_loaded.obj, sp.PackageASubpackageObject) + self.assertIsNot(package_a.subpackage.PackageASubpackageObject, sp.PackageASubpackageObject) + + def test_resources(self): + filename = self.temp() + with PackageExporter(filename, verbose=False) as he: + he.save_text('main', 'main', "my string") + he.save_binary('main', 'main_binary', "my string".encode('utf-8')) + src = """\ +import resources +t = resources.load_text('main', 'main') +b = resources.load_binary('main', 'main_binary') +""" + he.save_source_string('main', src, is_package=True) + hi = PackageImporter(filename) + m = hi.import_module('main') + self.assertEqual(m.t, "my string") + self.assertEqual(m.b, "my string".encode('utf-8')) + + def test_extern(self): + filename = self.temp() + with PackageExporter(filename, verbose=False) as he: + he.extern_modules(['package_a.subpackage', 'module_a']) + he.save_module('package_a') + hi = PackageImporter(filename) + import package_a.subpackage + import module_a + + module_a_im = hi.import_module('module_a') + hi.import_module('package_a.subpackage') + package_a_im = hi.import_module('package_a') + + self.assertIs(module_a, module_a_im) + self.assertIsNot(package_a, package_a_im) + self.assertIs(package_a.subpackage, package_a_im.subpackage) + + @skipIf(version_info.major < 3 or version_info.minor < 7, 'mock uses __getattr__ a 3.7 feature') + def test_mock(self): + filename = self.temp() + with PackageExporter(filename, verbose=False) as he: + 
he.mock_modules(['package_a.subpackage', 'module_a']) + he.save_module('package_a') + hi = PackageImporter(filename) + import package_a.subpackage + _ = package_a.subpackage + import module_a + _ = module_a + + m = hi.import_module('package_a.subpackage') + r = m.result + with self.assertRaisesRegex(NotImplementedError, 'was mocked out'): + r() + + @skipIf(version_info.major < 3 or version_info.minor < 7, 'mock uses __getattr__ a 3.7 feature') + def test_custom_requires(self): + filename = self.temp() + + class Custom(PackageExporter): + def require_module(self, name, dependencies): + if name == 'module_a': + self.mock_module('module_a') + elif name == 'package_a': + self.save_source_string('package_a', 'import module_a\nresult = 5\n') + else: + raise NotImplementedError('wat') + + with Custom(filename, verbose=False) as he: + he.save_source_string('main', 'import package_a\n') + + hi = PackageImporter(filename) + hi.import_module('module_a').should_be_mocked + bar = hi.import_module('package_a') + self.assertEqual(bar.result, 5) + + @skipIfNoTorchVision + def test_resnet(self): + resnet = resnet18() + + f1 = self.temp() + + # create a package that will save it along with its code + with PackageExporter(f1, verbose=False) as e: + # put the pickled resnet in the package, by default + # this will also save all the code files references by + # the objects in the pickle + e.save_pickle('model', 'model.pkl', resnet) + + # check th debug graph has something reasonable: + buf = StringIO() + e._write_dep_graph(failing_module='torch', output_file=buf) + self.assertIn('torchvision.models.resnet', buf.getvalue()) + + # we can now load the saved model + i = PackageImporter(f1) + r2 = i.load_pickle('model', 'model.pkl') + + # test that it works + input = torch.rand(1, 3, 224, 224) + ref = resnet(input) + self.assertTrue(torch.allclose(r2(input), ref)) + + # functions exist also to get at the private modules in each package + torchvision = i.import_module('torchvision') + + f2 = self.temp() + # if we are doing transfer learning we might want to re-save + # things that were loaded from a package + with PackageExporter(f2, verbose=False) as e: + # We need to tell the exporter about any modules that + # came from imported packages so that it can resolve + # class names like torchvision.models.resnet.ResNet + # to their source code. + + e.importers.insert(0, i.import_module) + + # e.importers is a list of module importing functions + # that by default contains importlib.import_module. + # it is searched in order until the first success and + # that module is taken to be what torchvision.models.resnet + # should be in this code package. In the case of name collisions, + # such as trying to save a ResNet from two different packages, + # we take the first thing found in the path, so only ResNet objects from + # one importer will work. This avoids a bunch of name mangling in + # the source code. If you need to actually mix ResNet objects, + # we suggest reconstructing the model objects using code from a single package + # using functions like save_state_dict and load_state_dict to transfer state + # to the correct code objects. 
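            # A sketch of the state_dict-based alternative mentioned above (assumption: the
            # weights are rehosted onto a ResNet built from one code package; `fresh` is a
            # hypothetical local name and is unused by the rest of this test).
            fresh = resnet18()
            fresh.load_state_dict(r2.state_dict())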
+            e.save_pickle('model', 'model.pkl', r2)
+
+        i2 = PackageImporter(f2)
+        r3 = i2.load_pickle('model', 'model.pkl')
+        self.assertTrue(torch.allclose(r3(input), ref))
+
+        # test we can load from a directory
+        import zipfile
+        zf = zipfile.ZipFile(f1, 'r')
+
+        with TemporaryDirectory() as td:
+            zf.extractall(path=td)
+            iz = PackageImporter(str(Path(td) / Path(f1).name))
+            r4 = iz.load_pickle('model', 'model.pkl')
+            self.assertTrue(torch.allclose(r4(input), ref))
+
+    @skipIfNoTorchVision
+    def test_model_save(self):
+
+        # This example shows how you might package a model
+        # so that the creator of the model has flexibility about
+        # how they want to save it but the 'server' can always
+        # use the same API to load the package.
+
+        # The convention is for each model to provide a
+        # 'model' package with a 'load' function that actually
+        # reads the model out of the archive.
+
+        # How the load function is implemented is up to
+        # the packager.
+
+        # get our normal torchvision resnet
+        resnet = resnet18()
+
+
+        f1 = self.temp()
+        # Option 1: save by pickling the whole model
+        # + single-line, similar to torch.jit.save
+        # - more difficult to edit the code after the model is created
+        with PackageExporter(f1, verbose=False) as e:
+            e.save_pickle('model', 'pickled', resnet)
+            # note that this source is the same for all models in this approach
+            # so it can be made part of an API that just takes the model and
+            # packages it with this source.
+            src = """\
+import resources # gives you access to the importer from within the package
+
+# server knows to call model.load() to get the model,
+# maybe in the future it passes options as arguments by convention
+def load():
+    return resources.load_pickle('model', 'pickled')
+    """
+            e.save_source_string('model', src, is_package=True)
+
+        f2 = self.temp()
+        # Option 2: save with state dict
+        # - more code to write to save/load the model
+        # + but this code can be edited later to adapt the model
+        with PackageExporter(f2, verbose=False) as e:
+            e.save_pickle('model', 'state_dict', resnet.state_dict())
+            src = """\
+import resources # gives you access to the importer from within the package
+from torchvision.models.resnet import resnet18
+def load():
+    # if you want, you can later edit how resnet is constructed here
+    # to edit the model in the package, while still loading the original
+    # state dict weights
+    r = resnet18()
+    state_dict = resources.load_pickle('model', 'state_dict')
+    r.load_state_dict(state_dict)
+    return r
+    """
+            e.save_source_string('model', src, is_package=True)
+
+
+
+        # regardless of how we chose to package, we can now use the model in a server in the same way
+        input = torch.rand(1, 3, 224, 224)
+        results = []
+        for m in [f1, f2]:
+            importer = PackageImporter(m)
+            the_model = importer.import_module('model').load()
+            r = the_model(input)
+            results.append(r)
+
+        self.assertTrue(torch.allclose(*results))
+
+if __name__ == '__main__':
+    main()
diff --git a/test/test_profiler.py b/test/test_profiler.py
index aefdfbb937fa..f1feff1d0af3 100644
--- a/test/test_profiler.py
+++ b/test/test_profiler.py
@@ -3,6 +3,7 @@
 import unittest
 
 import torch
+import torch.nn as nn
 from torch.testing._internal.common_utils import (
     TestCase, run_tests, TEST_WITH_ASAN, IS_WINDOWS)
 from torch.autograd.profiler import profile
@@ -18,7 +19,7 @@
 @unittest.skipIf(TEST_WITH_ASAN, "Cannot test with ASAN")
 @unittest.skipIf(IS_WINDOWS, "Test is flaky on Windows")
 @unittest.skipIf(not torch.cuda.is_available(), "CUDA is required")
-class 
TestProfiler_cuda(TestCase): +class TestProfilerCUDA(TestCase): def test_mem_leak(self): """Checks that there's no memory leak when using profiler with CUDA """ @@ -44,5 +45,60 @@ def test_mem_leak(self): self.assertTrue(not (is_increasing and max_diff > 100 * 1024), msg='memory usage is increasing, {}'.format(str(last_rss))) +class TestProfiler(TestCase): + def test_source(self): + """Checks that source code attribution works for eager, TS and autograd mode + """ + # avoid automatic inlining + prev_opt = torch._C._get_graph_executor_optimize() + torch._C._set_graph_executor_optimize(False) + + @torch.jit.script + def ts_method_2(x, y): + return torch.matmul(x, y) + + @torch.jit.script + def ts_method_1(x, y, z): + a = x + z + w = ts_method_2(x, y) + a + return w.sum() + + class DummyModule(nn.Module): + def __init__(self): + super(DummyModule, self).__init__() + self.conv = torch.nn.Conv2d(3, 2, kernel_size=1, stride=2, padding=3, bias=False) + + def forward(self, x): + return self.conv(x) + + mod = DummyModule() + + with profile(with_stack=True) as p: + x = torch.randn(10, 10, requires_grad=True) + y = torch.randn(10, 10, requires_grad=True) + z = x + y + w = ts_method_1(x, y, z) + v = 2 * w + v.backward() + a = torch.randn(2, 3, 2, 2, requires_grad=True) + b = mod(a) + c = b.sum() + c.backward() + + print(p.key_averages( + group_by_stack_n=5).table( + sort_by="self_cpu_time_total", row_limit=-1)) + + for e in p.function_events: + if "aten::add" in e.name or "AddBackward" in e.name: + self.assertTrue(any(["test_profiler" in entry for entry in e.stack])) + self.assertTrue(any([( + "test_source" in entry or + "ts_method_1" in entry or + "ts_method_2" in entry) for entry in e.stack])) + + torch._C._set_graph_executor_optimize(prev_opt) + + if __name__ == '__main__': run_tests() diff --git a/test/test_sparse.py b/test/test_sparse.py index 6ecfa10c05bf..9425ca2b2a8b 100644 --- a/test/test_sparse.py +++ b/test/test_sparse.py @@ -6,6 +6,7 @@ import itertools import functools +import operator import random import unittest from torch.testing._internal.common_utils import TestCase, run_tests, skipIfRocm, do_test_dtypes, \ @@ -1727,53 +1728,182 @@ def test_narrow(self): self.assertRaises(RuntimeError, lambda: with_dense.narrow_copy(10, 0, 3)) # dim > sparseDim + denseDim - def _test_log1p_tensor(self, input, dense_tensor): + def _test_log1p_tensor(self, sparse_tensor): + dense_tensor = sparse_tensor.to_dense() expected_output = dense_tensor.log1p() - self.assertEqual(expected_output, input.log1p().to_dense()) - self.assertEqual(expected_output, input.coalesce().log1p_().to_dense()) - # test in-place op on uncoalesced input - with self.assertRaisesRegex(RuntimeError, "in-place on uncoalesced tensors is not supported yet"): - input.log1p_() + self.assertEqual(expected_output, sparse_tensor.log1p().to_dense()) + self.assertEqual(expected_output, sparse_tensor.coalesce().log1p_().to_dense()) - input.requires_grad_() - self.assertTrue(input.requires_grad) + if self.is_uncoalesced: + # test in-place op on uncoalesced input + with self.assertRaisesRegex(RuntimeError, "in-place on uncoalesced tensors is not supported"): + sparse_tensor.log1p_() + + sparse_tensor.requires_grad_() + self.assertTrue(sparse_tensor.requires_grad) # test autograd - x = input.clone() - y = input.log1p() + x = sparse_tensor.clone() + y = sparse_tensor.log1p() with self.assertRaisesRegex(RuntimeError, "log1p of a sparse tensor is made to be non-differentiable"): y.backward(x) def test_log1p(self): - input = torch.sparse_coo_tensor( 
- torch.LongTensor([[0], [1], [2]]).transpose(1, 0).clone().detach(), - torch.FloatTensor([3, 4, 5]), - torch.Size([3]), - device=self.device) - self._test_log1p_tensor(input, torch.as_tensor([3, 4, 5], dtype=torch.float32)) - - # test uncoalesced input - input_uncoalesced = torch.sparse_coo_tensor( - torch.LongTensor([[0], [1], [2], [0], [1], [2]]).transpose(1, 0).clone().detach(), - torch.FloatTensor([2, 3, 4, 1, 1, 1]), - torch.Size([3]), - device=self.device) - self._test_log1p_tensor(input_uncoalesced, torch.as_tensor([3, 4, 5], dtype=torch.float32)) - - input = torch.sparse_coo_tensor( - torch.zeros([2, 0]), - torch.zeros([0, 5, 5, 5, 5, 5, 5, 0]), - torch.Size([0, 0, 5, 5, 5, 5, 5, 5, 0]), - device=self.device) - self._test_log1p_tensor(input, torch.zeros([0, 0, 5, 5, 5, 5, 5, 5, 0])) - - input = torch.sparse_coo_tensor( - torch.zeros([1, 5]), - torch.zeros([5, 6, 0]), - torch.Size([5, 6, 0]), - device=self.device) - self._test_log1p_tensor(input, torch.zeros([5, 6, 0])) + if not self.is_uncoalesced: + input_coalesced = torch.sparse_coo_tensor( + indices=torch.tensor([[0], [1], [2]]).transpose(1, 0), + values=torch.tensor([3.0, 4.0, 5.0]), + size=[3, ], + device=self.device + ).coalesce() + self._test_log1p_tensor(input_coalesced) + + # hybrid sparse input + input_coalesced = torch.sparse_coo_tensor( + indices=torch.tensor([[1, 3], [2, 4]]), + values=torch.tensor([[1.0, 3.0], [5.0, 7.0]]), + size=[4, 5, 2], + device=self.device + ).coalesce() + self._test_log1p_tensor(input_coalesced) + + if self.is_uncoalesced: + # test uncoalesced input + input_uncoalesced = torch.sparse_coo_tensor( + indices=torch.tensor([[0], [1], [2], [0], [1], [2]]).transpose(1, 0), + values=torch.tensor([2.0, 3.0, 4.0, 1.0, 1.0, 1.0]), + size=[3, ], + device=self.device + ) + self._test_log1p_tensor(input_uncoalesced) + + # test on empty sparse tensor + input_uncoalesced = torch.sparse_coo_tensor( + indices=torch.zeros([2, 0]), + values=torch.zeros([0, 5, 5, 5, 5, 5, 5, 0]), + size=[0, 0, 5, 5, 5, 5, 5, 5, 0], + device=self.device + ) + self._test_log1p_tensor(input_uncoalesced) + + def _test_neg_negative(self, sparse_tensor): + dense_tensor = sparse_tensor.to_dense() + expected_output = dense_tensor.neg() + + ops = ( + torch.neg, torch.Tensor.neg, torch.Tensor.neg_, + torch.negative, torch.Tensor.negative, torch.Tensor.negative_, + operator.neg + ) + for op in ops: + sparse_tensor_copy = sparse_tensor.clone() + self.assertEqual(expected_output, op(sparse_tensor_copy).to_dense()) + + if op in (torch.neg, torch.negative): + sparse_tensor_out = torch.zeros_like(sparse_tensor) + op(sparse_tensor, out=sparse_tensor_out) + self.assertEqual(expected_output, sparse_tensor_out.to_dense()) + + def test_neg_negative(self): + + if not self.is_uncoalesced: + input_coalesced = torch.sparse_coo_tensor( + indices=torch.tensor([[0, 1, 2]]), + values=torch.tensor([3.0, -4.0, 5.0]), + size=[3, ], + device=self.device + ).coalesce() + self._test_neg_negative(input_coalesced) + + # hybrid sparse input + input_coalesced = torch.sparse_coo_tensor( + indices=torch.tensor([[1, 3], [2, 4]]), + values=torch.tensor([[-1.0, 3.0], [-5.0, 7.0]]), + size=[4, 5, 2], + device=self.device + ).coalesce() + self._test_neg_negative(input_coalesced) + + if self.is_uncoalesced: + # test uncoalesced input + input_uncoalesced = torch.sparse_coo_tensor( + indices=torch.tensor([[0], [1], [2], [0], [1], [2]]).transpose(1, 0), + values=torch.tensor([2.0, -3.0, -4.0, 1.0, -1.0, 1.5]), + size=[3, ], + device=self.device + ) + 
self._test_neg_negative(input_uncoalesced) + + # test on empty sparse tensor + input_uncoalesced = torch.sparse_coo_tensor( + indices=torch.zeros([2, 0]), + values=torch.zeros([0, 5, 5, 5, 5, 5, 5, 0]), + size=[0, 0, 5, 5, 5, 5, 5, 5, 0], + device=self.device + ) + self._test_neg_negative(input_uncoalesced) + + def _test_asin_arcsin(self, sparse_tensor): + dense_tensor = sparse_tensor.to_dense() + expected_output = dense_tensor.asin() + + ops = ( + torch.asin, torch.Tensor.asin, + torch.arcsin, torch.Tensor.arcsin, + ) + for op in ops: + self.assertEqual(expected_output, op(sparse_tensor).to_dense()) + if op in (torch.asin, torch.arcsin): + sparse_tensor_out = torch.zeros_like(sparse_tensor) + op(sparse_tensor, out=sparse_tensor_out) + self.assertEqual(expected_output, sparse_tensor_out.to_dense()) + + for op in (torch.Tensor.asin_, torch.Tensor.arcsin_): + self.assertEqual(expected_output, op(sparse_tensor.clone().coalesce()).to_dense()) + if self.is_uncoalesced: + # test in-place op on uncoalesced input + with self.assertRaisesRegex(RuntimeError, "in-place on uncoalesced tensors is not supported"): + op(sparse_tensor) + + def test_asin_arcsin(self): + + if not self.is_uncoalesced: + input_coalesced = torch.sparse_coo_tensor( + indices=torch.tensor([[0, 1, 2, 3]]), + values=torch.tensor([0.5, -0.5, 0.7, -0.7]), + size=[4, ], + device=self.device + ).coalesce() + self._test_asin_arcsin(input_coalesced) + + # hybrid sparse input + input_coalesced = torch.sparse_coo_tensor( + indices=torch.tensor([[1, 3], [2, 4]]), + values=torch.tensor([[-0.1, 0.24], [-0.44, 0.1]]), + size=[4, 5, 2], + device=self.device + ).coalesce() + self._test_asin_arcsin(input_coalesced) + + if self.is_uncoalesced: + # test uncoalesced input + input_uncoalesced = torch.sparse_coo_tensor( + indices=torch.tensor([[0], [1], [2], [0], [1], [2]]).transpose(1, 0), + values=torch.tensor([0.3, -0.3, -0.4, 0.3, -0.5, 0.15]), + size=[3, ], + device=self.device + ) + self._test_asin_arcsin(input_uncoalesced) + + # test on empty sparse tensor + input_uncoalesced = torch.sparse_coo_tensor( + indices=torch.zeros([2, 0]), + values=torch.zeros([0, 5, 5, 5, 5, 5, 5, 0]), + size=[0, 0, 5, 5, 5, 5, 5, 5, 0], + device=self.device + ) + self._test_asin_arcsin(input_uncoalesced) def test_mv(self): def test_shape(di, dj, dk, nnz): @@ -2458,7 +2588,7 @@ def test_sparse_to_numpy(self): t = torch.sparse_coo_tensor(torch.tensor(([0, 0], [2, 0])), torch.tensor([1, 4])) self.assertRaises(TypeError, lambda: t.numpy()) - @cpu_only + @skipIfRocm def test_softmax(self): import torch.nn.functional as F diff --git a/test/test_spectral_ops.py b/test/test_spectral_ops.py index 59b58fa202d6..82ed2225bda8 100644 --- a/test/test_spectral_ops.py +++ b/test/test_spectral_ops.py @@ -3,6 +3,7 @@ import math from contextlib import contextmanager from itertools import product +import itertools from torch.testing._internal.common_utils import \ (TestCase, run_tests, TEST_NUMPY, TEST_LIBROSA) @@ -11,7 +12,7 @@ skipCPUIfNoMkl, skipCUDAIfRocm, deviceCountAtLeast, onlyCUDA) from distutils.version import LooseVersion -from typing import Optional +from typing import Optional, List if TEST_NUMPY: @@ -115,6 +116,7 @@ def method_fn(t): @skipCPUIfNoMkl @skipCUDAIfRocm + @onlyOnCPUAndCUDA @unittest.skipIf(not TEST_NUMPY, 'NumPy not found') @precisionOverride({torch.complex64: 1e-4, torch.float: 1e-4}) @dtypes(torch.float, torch.double, torch.complex64, torch.complex128) @@ -226,11 +228,13 @@ def test_fft_round_trip(self, device, dtype): def test_empty_fft(self, device, 
dtype): t = torch.empty(0, device=device, dtype=dtype) match = r"Invalid number of data points \([-\d]*\) specified" - fft_functions = [torch.fft.fft, torch.fft.ifft, torch.fft.hfft, - torch.fft.irfft] + fft_functions = [torch.fft.fft, torch.fft.fftn, + torch.fft.ifft, torch.fft.ifftn, + torch.fft.irfft, torch.fft.irfftn, + torch.fft.hfft] # Real-only functions if not dtype.is_complex: - fft_functions += [torch.fft.rfft, torch.fft.ihfft] + fft_functions += [torch.fft.rfft, torch.fft.rfftn, torch.fft.ihfft] for fn in fft_functions: with self.assertRaisesRegex(RuntimeError, match): @@ -242,6 +246,9 @@ def test_fft_invalid_dtypes(self, device): with self.assertRaisesRegex(RuntimeError, "Expected a real input tensor"): torch.fft.rfft(t) + with self.assertRaisesRegex(RuntimeError, "Expected a real input tensor"): + torch.fft.rfftn(t) + with self.assertRaisesRegex(RuntimeError, "Expected a real input tensor"): torch.fft.ihfft(t) @@ -292,7 +299,9 @@ def test_fft_half_errors(self, device, dtype): # TODO: Remove torch.half error when complex32 is fully implemented x = torch.randn(64, device=device).to(dtype) fft_functions = (torch.fft.fft, torch.fft.ifft, + torch.fft.fftn, torch.fft.ifftn, torch.fft.rfft, torch.fft.irfft, + torch.fft.rfftn, torch.fft.irfftn, torch.fft.hfft, torch.fft.ihfft) for fn in fft_functions: with self.assertRaisesRegex(RuntimeError, "Unsupported dtype "): @@ -300,6 +309,7 @@ def test_fft_half_errors(self, device, dtype): @skipCPUIfNoMkl @skipCUDAIfRocm + @onlyOnCPUAndCUDA @dtypes(torch.double, torch.complex128) # gradcheck requires double def test_fft_backward(self, device, dtype): test_args = list(product( @@ -340,6 +350,241 @@ def test_fn(x): self.assertTrue(torch.autograd.gradcheck(test_fn, (input,))) + # nd-fft tests + + @skipCPUIfNoMkl + @skipCUDAIfRocm + @onlyOnCPUAndCUDA + @unittest.skipIf(not TEST_NUMPY, 'NumPy not found') + @precisionOverride({torch.complex64: 1e-4, torch.float: 1e-4}) + @dtypes(torch.float, torch.double, torch.complex64, torch.complex128) + def test_fftn_numpy(self, device, dtype): + norm_modes = ((None, "forward", "backward", "ortho") + if LooseVersion(np.__version__) >= '1.20.0' + else (None, "ortho")) + + # input_ndim, s, dim + transform_desc = [ + *product(range(2, 5), (None,), (None, (0,), (0, -1))), + *product(range(2, 5), (None, (4, 10)), (None,)), + (6, None, None), + (5, None, (1, 3, 4)), + (3, None, (0, -1)), + (3, None, (1,)), + (1, None, (0,)), + (4, (10, 10), None), + (4, (10, 10), (0, 1)) + ] + + fft_functions = ['fftn', 'ifftn', 'irfftn'] + # Real-only functions + if not dtype.is_complex: + fft_functions += ['rfftn'] + + for input_ndim, s, dim in transform_desc: + shape = itertools.islice(itertools.cycle(range(4, 9)), input_ndim) + input = torch.randn(*shape, device=device, dtype=dtype) + for fname, norm in product(fft_functions, norm_modes): + torch_fn = getattr(torch.fft, fname) + numpy_fn = getattr(np.fft, fname) + + def fn(t: torch.Tensor, s: Optional[List[int]], dim: Optional[List[int]], norm: Optional[str]): + return torch_fn(t, s, dim, norm) + + torch_fns = (torch_fn, torch.jit.script(fn)) + + expected = numpy_fn(input.cpu().numpy(), s, dim, norm) + exact_dtype = dtype in (torch.double, torch.complex128) + for fn in torch_fns: + actual = fn(input, s, dim, norm) + self.assertEqual(actual, expected, exact_dtype=exact_dtype) + + @skipCUDAIfRocm + @skipCPUIfNoMkl + @onlyOnCPUAndCUDA + @dtypes(torch.float, torch.double, torch.complex64, torch.complex128) + def test_fftn_round_trip(self, device, dtype): + norm_modes = (None, 
"forward", "backward", "ortho") + + # input_ndim, dim + transform_desc = [ + *product(range(2, 5), (None, (0,), (0, -1))), + *product(range(2, 5), (None,)), + (7, None), + (5, (1, 3, 4)), + (3, (0, -1)), + (3, (1,)), + (1, 0), + ] + + fft_functions = [(torch.fft.fftn, torch.fft.ifftn)] + + # Real-only functions + if not dtype.is_complex: + fft_functions += [(torch.fft.rfftn, torch.fft.irfftn)] + + for input_ndim, dim in transform_desc: + shape = itertools.islice(itertools.cycle(range(4, 9)), input_ndim) + x = torch.randn(*shape, device=device, dtype=dtype) + + for (forward, backward), norm in product(fft_functions, norm_modes): + if isinstance(dim, tuple): + s = [x.size(d) for d in dim] + else: + s = x.size() if dim is None else x.size(dim) + + kwargs = {'s': s, 'dim': dim, 'norm': norm} + y = backward(forward(x, **kwargs), **kwargs) + # For real input, ifftn(fftn(x)) will convert to complex + self.assertEqual(x, y, exact_dtype=( + forward != torch.fft.fftn or x.is_complex())) + + @skipCPUIfNoMkl + @skipCUDAIfRocm + @onlyOnCPUAndCUDA + @dtypes(torch.double, torch.complex128) # gradcheck requires double + def test_fftn_backward(self, device, dtype): + # input_ndim, s, dim + transform_desc = [ + *product((2, 3), (None,), (None, (0,), (0, -1))), + *product((2, 3), (None, (4, 10)), (None,)), + (4, None, None), + (3, (10, 10), (0, 1)), + (2, (1, 1), (0, 1)), + (2, None, (1,)), + (1, None, (0,)), + (1, (11,), (0,)), + ] + norm_modes = (None, "forward", "backward", "ortho") + + fft_functions = ['fftn', 'ifftn', 'irfftn'] + # Real-only functions + if not dtype.is_complex: + fft_functions += ['rfftn'] + + for input_ndim, s, dim in transform_desc: + shape = itertools.islice(itertools.cycle(range(4, 9)), input_ndim) + input = torch.randn(*shape, device=device, dtype=dtype) + + for fname, norm in product(fft_functions, norm_modes): + torch_fn = getattr(torch.fft, fname) + + # Workaround for gradcheck's poor support for complex input + # Use real input instead and put view_as_complex into the graph + if dtype.is_complex: + def test_fn(x): + return torch_fn(torch.view_as_complex(x), s, dim, norm) + inputs = (torch.view_as_real(input).detach().requires_grad_(),) + else: + def test_fn(x): + return torch_fn(x, s, dim, norm) + inputs = (input.detach().requires_grad_(),) + + self.assertTrue(torch.autograd.gradcheck(test_fn, inputs)) + + @skipCUDAIfRocm + @skipCPUIfNoMkl + @onlyOnCPUAndCUDA + def test_fftn_invalid(self, device): + a = torch.rand(10, 10, 10, device=device) + fft_funcs = (torch.fft.fftn, torch.fft.ifftn, + torch.fft.rfftn, torch.fft.irfftn) + + for func in fft_funcs: + with self.assertRaisesRegex(RuntimeError, "FFT dims must be unique"): + func(a, dim=(0, 1, 0)) + + with self.assertRaisesRegex(RuntimeError, "FFT dims must be unique"): + func(a, dim=(2, -1)) + + with self.assertRaisesRegex(RuntimeError, "dim and shape .* same length"): + func(a, s=(1,), dim=(0, 1)) + + with self.assertRaisesRegex(IndexError, "Dimension out of range"): + func(a, dim=(3,)) + + with self.assertRaisesRegex(RuntimeError, "tensor only has 3 dimensions"): + func(a, s=(10, 10, 10, 10)) + + c = torch.complex(a, a) + with self.assertRaisesRegex(RuntimeError, "Expected a real input"): + torch.fft.rfftn(c) + + # Helper functions + + @skipCPUIfNoMkl + @skipCUDAIfRocm + @onlyOnCPUAndCUDA + @unittest.skipIf(not TEST_NUMPY, 'NumPy not found') + @dtypes(torch.float, torch.double) + def test_fftfreq_numpy(self, device, dtype): + test_args = [ + *product( + # n + range(1, 20), + # d + (None, 10.0), + ) + ] + + functions = 
['fftfreq', 'rfftfreq'] + + for fname in functions: + torch_fn = getattr(torch.fft, fname) + numpy_fn = getattr(np.fft, fname) + + for n, d in test_args: + args = (n,) if d is None else (n, d) + expected = numpy_fn(*args) + actual = torch_fn(*args, device=device, dtype=dtype) + self.assertEqual(actual, expected, exact_dtype=False) + + @skipCPUIfNoMkl + @skipCUDAIfRocm + @onlyOnCPUAndCUDA + @unittest.skipIf(not TEST_NUMPY, 'NumPy not found') + @dtypes(torch.float, torch.double, torch.complex64, torch.complex128) + def test_fftshift_numpy(self, device, dtype): + test_args = [ + # shape, dim + *product(((11,), (12,)), (None, 0, -1)), + *product(((4, 5), (6, 6)), (None, 0, (-1,))), + *product(((1, 1, 4, 6, 7, 2),), (None, (3, 4))), + ] + + functions = ['fftshift', 'ifftshift'] + + for shape, dim in test_args: + input = torch.rand(*shape, device=device, dtype=dtype) + input_np = input.cpu().numpy() + + for fname in functions: + torch_fn = getattr(torch.fft, fname) + numpy_fn = getattr(np.fft, fname) + + expected = numpy_fn(input_np, axes=dim) + actual = torch_fn(input, dim=dim) + self.assertEqual(actual, expected) + + @skipCPUIfNoMkl + @skipCUDAIfRocm + @onlyOnCPUAndCUDA + @unittest.skipIf(not TEST_NUMPY, 'NumPy not found') + @dtypes(torch.float, torch.double) + def test_fftshift_frequencies(self, device, dtype): + for n in range(10, 15): + sorted_fft_freqs = torch.arange(-(n // 2), n - (n // 2), + device=device, dtype=dtype) + x = torch.fft.fftfreq(n, d=1 / n, device=device, dtype=dtype) + + # Test fftshift sorts the fftfreq output + shifted = torch.fft.fftshift(x) + self.assertTrue(torch.allclose(shifted, shifted.sort().values)) + self.assertEqual(sorted_fft_freqs, shifted) + + # And ifftshift is the inverse + self.assertEqual(x, torch.fft.ifftshift(shifted)) + # Legacy fft tests def _test_fft_ifft_rfft_irfft(self, device, dtype): def _test_complex(sizes, signal_ndim, prepro_fn=lambda x: x): diff --git a/test/test_static_runtime.py b/test/test_static_runtime.py index 407ea03acda6..86dafa3903dd 100644 --- a/test/test_static_runtime.py +++ b/test/test_static_runtime.py @@ -106,7 +106,8 @@ def test_multihead_attention_layer(self): DROPOUT = 0.1 device = torch.device("cpu") attention = MultiHeadAttentionLayer(HID_DIM, HEADS, DROPOUT, device).to(device) - src = torch.randn(BATCH_SIZE, QUERY_LEN, HID_DIM).to(device) + with torch.no_grad(): + src = torch.randn(BATCH_SIZE, QUERY_LEN, HID_DIM).to(device) src_mask = (src > 0)[:, :, 0].unsqueeze(1).unsqueeze(2).to(device) attention.eval() @@ -129,8 +130,9 @@ def test_mlp(self): bot_l_acc = StaticRuntime(bot_l) top_l = create_mlp(ln_top, sigmoid_top) top_l_acc = StaticRuntime(top_l) - bot_inp = torch.randn(2048, 512) # torch.Size([2048, 512]) - top_inp = torch.randn(2048, 100) # torch.Size([2048, 100]) + with torch.no_grad(): + bot_inp = torch.randn(2048, 512) # torch.Size([2048, 512]) + top_inp = torch.randn(2048, 100) # torch.Size([2048, 100]) ref_bot = bot_l(bot_inp) acc_bot = bot_l_acc(bot_inp)[0] torch.testing.assert_allclose(acc_bot, ref_bot) @@ -138,8 +140,9 @@ def test_mlp(self): acc_top = top_l_acc(top_inp)[0] torch.testing.assert_allclose(acc_top, ref_top) for _ in range(5): - bot_inp = torch.randn(2048, 512) # torch.Size([2048, 512]) - top_inp = torch.randn(2048, 100) # torch.Size([2048, 100]) + with torch.no_grad(): + bot_inp = torch.randn(2048, 512) # torch.Size([2048, 512]) + top_inp = torch.randn(2048, 100) # torch.Size([2048, 100]) ref_bot = bot_l(bot_inp) acc_bot = bot_l_acc(bot_inp)[0] torch.testing.assert_allclose(acc_bot, ref_bot) @@ 
-147,13 +150,13 @@ def test_mlp(self): acc_top = top_l_acc(top_inp)[0] torch.testing.assert_allclose(acc_top, ref_top) - # def test_trivial_graph(self): - # s = torch.full((2, 2), 2) - # tg = torch.jit.script(trivial_graph) - # o_ref = tg(s, s, s) - # tg_a = StaticRuntime(tg) - # o_test = tg_a(s, s, s)[0] - # torch.testing.assert_allclose(o_ref, o_test) + def test_trivial_graph(self): + s = torch.full((2, 2), 2) + tg = torch.jit.script(trivial_graph) + o_ref = tg(s, s, s) + tg_a = StaticRuntime(tg) + o_test = tg_a(s, s, s)[0] + torch.testing.assert_allclose(o_ref, o_test) if __name__ == "__main__": diff --git a/test/test_tensor_creation_ops.py b/test/test_tensor_creation_ops.py index ab6eae83568e..d9e0f59a5210 100644 --- a/test/test_tensor_creation_ops.py +++ b/test/test_tensor_creation_ops.py @@ -6,7 +6,7 @@ import torch from torch.testing._internal.common_utils import \ - (TestCase, run_tests, do_test_empty_full, TEST_NUMPY, suppress_warnings, + (TestCase, run_tests, do_test_empty_full, TEST_NUMPY, TEST_WITH_ROCM, suppress_warnings, torch_to_numpy_dtype_dict, slowTest) from torch.testing._internal.common_device_type import \ (instantiate_device_type_tests, deviceCountAtLeast, onlyOnCPUAndCUDA, @@ -1047,7 +1047,9 @@ def test_logspace_special_steps(self, device, dtype): self._test_logspace_base2(device, dtype, steps=steps) @dtypes(*torch.testing.get_all_dtypes(include_bool=False, include_half=False, include_complex=False)) - @dtypesIfCUDA(*torch.testing.get_all_dtypes(include_bool=False, include_half=True, include_complex=False)) + @dtypesIfCUDA(*((torch.testing.get_all_int_dtypes() + [torch.float32, torch.float16, torch.bfloat16]) + if TEST_WITH_ROCM + else torch.testing.get_all_dtypes(include_bool=False, include_half=True, include_complex=False))) def test_logspace(self, device, dtype): _from = random.random() to = _from + random.random() diff --git a/test/test_tensorexpr.py b/test/test_tensorexpr.py index 143c6dab91d2..739957569962 100644 --- a/test/test_tensorexpr.py +++ b/test/test_tensorexpr.py @@ -1046,18 +1046,18 @@ def easy(x, y): # FIXME: interp.elapsed_value() also increments due to simplifier assert llvm.elapsed_value() == 1 or interp.elapsed_value() > 1 - def test_unsqueeze(self): + def test_unsqueeze(self, N=256): def easy(x, y): a = torch.unsqueeze(x, 0) b = torch.unsqueeze(y, 0) return a + b - traced = torch.jit.trace(easy, (torch.ones(1024, 1024), torch.zeros(1024, 1024))) + traced = torch.jit.trace(easy, (torch.ones(N, N), torch.zeros(N, N))) llvm = LLVMCodeGenExecuted() interp = SimpleIREvalExecuted() - a = torch.rand(1024, 1024) + a = torch.rand(N, N) x = traced(a, a) npr = np.expand_dims(a, 0) npr = npr + npr diff --git a/test/test_torch.py b/test/test_torch.py index dd012305ad56..7e4fe38fdc97 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -41,11 +41,10 @@ from typing import Dict, List, Tuple, Union import torch.backends.quantized import torch.testing._internal.data -from torch.testing._internal.common_cuda import tf32_on_and_off, tf32_is_not_fp32, \ +from torch.testing._internal.common_cuda import tf32_on_and_off, tf32_is_not_fp32, with_tf32_off, \ _get_torch_cuda_version, TEST_MAGMA - # load_tests from torch.testing._internal.common_utils is used to automatically filter tests for # sharding on sandcastle. 
This line silences flake warnings load_tests = load_tests @@ -59,7 +58,7 @@ SIZE = 100 -AMPERE_OR_ROCM = TEST_WITH_ROCM or tf32_is_not_fp32() +AMPERE_OR_ROCM = TEST_WITH_ROCM or tf32_is_not_fp32() # Wrap base test class into a class to hide it from testing # See https://stackoverflow.com/a/25695512 @@ -230,10 +229,6 @@ def test_linear_algebra_scalar_raises(self) -> None: s = torch.tensor(7) self.assertRaises(RuntimeError, lambda: torch.mv(m, s)) self.assertRaises(RuntimeError, lambda: torch.addmv(v, m, s)) - self.assertRaises(RuntimeError, lambda: torch.ger(v, s)) - self.assertRaises(RuntimeError, lambda: torch.ger(s, v)) - self.assertRaises(RuntimeError, lambda: torch.addr(m, v, s)) - self.assertRaises(RuntimeError, lambda: torch.addr(m, s, v)) @unittest.skipIf(not TEST_SCIPY, "Scipy not found") def test_mvlgamma(self): @@ -659,6 +654,13 @@ def test_copy_transpose(self): self.assertEqual(y[:, 0], range(100)) self.assertEqual(y[:, 40], range(4000, 4100)) + # Validates regression reported in https://github.com/pytorch/pytorch/issues/45269 + x = torch.arange(100 * 100).reshape(100, 100).to(dtype=torch.cfloat).t() + y = torch.empty(100, 100, dtype=torch.cfloat) + y.copy_(x) + self.assertEqual(y[:, 0], range(100)) + self.assertEqual(y[:, 40], range(4000, 4100)) + def test_device(self): cpu = torch.device('cpu') self.assertEqual('cpu', str(cpu)) @@ -4698,6 +4700,22 @@ def add_neg_dim_tests(): class TestTorchDeviceType(TestCase): exact_dtype = True + @onlyCPU + def test_set_deterministic_beta_warning(self, device): + det = torch.is_deterministic() + try: + # Ensures setting to false does not throw a warning + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + torch.set_deterministic(False) + self.assertEqual(len(w), 0) + + # Setting set_deterministic(True) throws a warning once per process + with self.maybeWarnsRegex(UserWarning, "torch.set_deterministic is in beta"): + torch.set_deterministic(True) + finally: + torch.set_deterministic(det) + # Tests that trying to add, inplace, a CUDA tensor to a CPU tensor # throws the correct error message @onlyCUDA @@ -4789,6 +4807,10 @@ def test_abs_angle_complex_to_float(self, device, dtype): if fn_name == 'abs': torch_inplace_method = getattr(torch.Tensor, fn_name + "_") np_fn(a, out=a) + if dtype.is_complex: + with self.assertRaisesRegex(RuntimeError, "In-place abs is not supported for complex tensors."): + torch_inplace_method(t) + return torch_inplace_method(t) self.assertEqual(torch.from_numpy(a), t.cpu()) @@ -6306,60 +6328,38 @@ def test_heaviside_complex(self, device, dtypes): def test_logical_not(self, device, dtype): data = [10, 1, 0.3, 0, -0.3, -1, -10] a = torch.tensor(data, dtype=dtype, device=device) - - # do this before constructing the numpy array because np can't construct - # bfloat16 tensors. Can we define our own dtype in NumPy so testing would be easier? 
- if dtype == torch.bfloat16 or dtype.is_complex: - self.assertRaises(RuntimeError, lambda: a.logical_not()) - self.assertRaises(RuntimeError, lambda: a.logical_not_()) - raise unittest.SkipTest('logical_not not supported on {}'.format(dtype)) - - a_np = np.array(data, dtype=torch_to_numpy_dtype_dict[dtype]) - self.assertEqual(np.logical_not(a_np), torch.logical_not(a).to('cpu')) - self.assertEqual(np.logical_not(a_np, out=a_np), a.logical_not_().to('cpu')) + if dtype == torch.bfloat16: # numpy doesn't support these dtypes + result = [False, False, False, True, False, False, False] + self.assertEqual(torch.logical_not(a), torch.tensor(result, dtype=torch.bool, device=device)) + else: + a_np = np.array(data, dtype=torch_to_numpy_dtype_dict[dtype]) + self.assertEqual(np.logical_not(a_np), torch.logical_not(a).to('cpu')) + self.assertEqual(np.logical_not(a_np, out=a_np), a.logical_not_().to('cpu')) @unittest.skipIf(not TEST_NUMPY, 'Numpy not found') - @dtypes(*list(product(torch.testing.get_all_dtypes(), - torch.testing.get_all_dtypes()))) + @dtypes(*product(torch.testing.get_all_dtypes(), + torch.testing.get_all_dtypes())) def test_logical_not_out(self, device, dtypes): dtype = dtypes[0] out_dtype = dtypes[1] data = [10, 1, 0.3, 0, -0.3, -1, -10] a = torch.tensor(data, dtype=dtype, device=device) - out = torch.empty(a.shape, dtype=out_dtype, device=device) - - if (dtype == torch.bfloat16 or dtype.is_complex or - out_dtype == torch.bfloat16 or out_dtype.is_complex): - self.assertRaises(RuntimeError, lambda: torch.logical_not(a, out=out)) - raise unittest.SkipTest('logical_not not supported on {}'.format(out_dtype)) - - out_np = np.empty(a.shape, dtype=torch_to_numpy_dtype_dict[out_dtype]) - - self.assertEqual(a, a.cpu().numpy()) - torch.logical_not(a, out=out) - np.logical_not(a.cpu().numpy(), out=out_np) - self.assertEqual(out_np, out.to('cpu')) + out = torch.empty_like(a, dtype=out_dtype, device=device) + if torch.bfloat16 in dtypes: # numpy doesn't support these dtypes + result = [not i for i in a] + self.assertEqual(torch.logical_not(a, out=out), torch.tensor(result, dtype=out_dtype, device=device)) + else: + out_np = np.empty(a.shape, dtype=torch_to_numpy_dtype_dict[out_dtype]) + self.assertEqual(a, a.cpu().numpy()) + torch.logical_not(a, out=out) + np.logical_not(a.cpu().numpy(), out=out_np) + self.assertEqual(out_np, out.to('cpu')) def _test_logical(self, device, dtypes, op, a_, b_, expected_res_): expected_res = torch.tensor(expected_res_, dtype=dtypes[0], device=device) a = torch.tensor(a_, dtype=dtypes[0], device=device) b = torch.tensor(b_, dtype=dtypes[1], device=device) - # Skip bfloat16 on CUDA. Remove this after bfloat16 is supported on CUDA. - # After type promotion of bfloat16 is supported, some bfloat16 logical operation will go through on - # CUDA as long as the two tensors are promoted to a supported type. - # TODO: Remove this once logical operators are improved to take care of bfloat16. - if self.device_type == 'cuda' and torch.bfloat16 in dtypes: - if torch.promote_types(dtypes[0], dtypes[1]) == torch.bfloat16: - with self.assertRaises(RuntimeError): - getattr(a, op)(b) - return - - if dtypes[0].is_complex or dtypes[1].is_complex: - with self.assertRaises(RuntimeError): - getattr(a, op)(b) - return - # new tensor self.assertEqual(expected_res.bool(), getattr(a, op)(b)) # out @@ -6374,18 +6374,6 @@ def _test_logical(self, device, dtypes, op, a_, b_, expected_res_): getattr(a, op + '_')(b) return - # TODO: remove when logical ops support bfloat16 on CUDA. 
- if self.device_type == 'cuda' and dtypes[0] == torch.bfloat16: - with self.assertRaises(RuntimeError): - getattr(a, op + '_')(b) - return - - # TODO: remove when complex ops are supported - if dtypes[0].is_complex: - with self.assertRaises(RuntimeError): - getattr(a, op + '_')(b) - return - getattr(a, op + '_')(b) self.assertEqual(expected_res, a) @@ -7033,6 +7021,9 @@ def test_matrix_exp_boundary_cases(self, device, dtype): @skipCUDAIfNoMagma @skipCPUIfNoLapack @dtypes(torch.float, torch.double) + # Although tf32 is always disabled on matrix_exp, this test uses matmul, + # which has tf32 on by default + @with_tf32_off def test_matrix_exp_analytic(self, device, dtype): # check zero matrix x = torch.zeros(20, 20, dtype=dtype, device=device) @@ -7174,6 +7165,9 @@ def run_test(*n): @skipCUDAIfNoMagma @skipCPUIfNoLapack @dtypes(torch.float, torch.double) + # Although tf32 is always disabled on matrix_exp, this test uses matmul, + # which has tf32 on by default + @with_tf32_off def test_matrix_exp_compare_with_taylor(self, device, dtype): def normalize_to_1_operator_norm(sample, desired_norm): @@ -7758,14 +7752,29 @@ def cholesky_test_helper(n, batchsize, device, upper): for upper, batchsize in product([True, False], [262144, 524288]): cholesky_test_helper(2, batchsize, device, upper) + @precisionOverride({torch.float32: 1e-4, torch.complex64: 1e-4}) @skipCUDAIfNoMagma @skipCPUIfNoLapack - @dtypes(torch.double) + @dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128) def test_cholesky_batched(self, device, dtype): - from torch.testing._internal.common_utils import random_symmetric_pd_matrix + from torch.testing._internal.common_utils import \ + (random_symmetric_pd_matrix, + random_fullrank_matrix_distinct_singular_value) def cholesky_test_helper(n, batch_dims, upper): - A = random_symmetric_pd_matrix(n, *batch_dims, dtype=dtype, device=device) + # This is a workaround while there is no support for complex random_symmetric_pd_matrix + if dtype.is_complex: + real_dtype = torch.float32 if dtype is torch.complex64 else torch.float64 + A_real = random_fullrank_matrix_distinct_singular_value(n, *batch_dims, dtype=real_dtype, device=device) + A_imag = random_fullrank_matrix_distinct_singular_value(n, *batch_dims, dtype=real_dtype, device=device) + A = A_real + 1j * A_imag + # There is no support for complex batched matmul yet + matmul_list = [] + for mat in A.contiguous().view(-1, n, n): + matmul_list.append(mat @ mat.t().conj()) + A = torch.stack(matmul_list).view(*batch_dims, n, n) + else: + A = random_symmetric_pd_matrix(n, *batch_dims, dtype=dtype, device=device) cholesky_exp = torch.stack([m.cholesky(upper=upper) for m in A.reshape(-1, n, n)]) cholesky_exp = cholesky_exp.reshape_as(A) self.assertEqual(cholesky_exp, torch.cholesky(A, upper=upper)) @@ -7773,26 +7782,38 @@ def cholesky_test_helper(n, batch_dims, upper): for upper, batchsize in product([True, False], [(3,), (3, 4), (2, 3, 4)]): cholesky_test_helper(3, batchsize, upper) + @precisionOverride({torch.float32: 1e-4, torch.complex64: 1e-4}) @skipCUDAIfNoMagma @skipCPUIfNoLapack - @dtypes(torch.double) + @dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128) def test_cholesky(self, device, dtype): - x = torch.rand(10, 10, dtype=dtype, device=device) + 1e-1 - A = torch.mm(x, x.t()) + from torch.testing._internal.common_utils import \ + (random_symmetric_pd_matrix, + random_fullrank_matrix_distinct_singular_value) + + # This is a workaround while there is no support for complex random_symmetric_pd_matrix 
+ if dtype.is_complex: + real_dtype = torch.float32 if dtype is torch.complex64 else torch.float64 + A_real = random_fullrank_matrix_distinct_singular_value(10, dtype=real_dtype, device=device) + A_imag = random_fullrank_matrix_distinct_singular_value(10, dtype=real_dtype, device=device) + A = A_real + 1j * A_imag + A = A @ A.t().conj() + else: + A = random_symmetric_pd_matrix(10, dtype=dtype, device=device) # default Case C = torch.cholesky(A) - B = torch.mm(C, C.t()) + B = torch.mm(C, C.t().conj()) self.assertEqual(A, B, atol=1e-14, rtol=0) # test Upper Triangular U = torch.cholesky(A, True) - B = torch.mm(U.t(), U) + B = torch.mm(U.t().conj(), U) self.assertEqual(A, B, atol=1e-14, rtol=0, msg='cholesky (upper) did not allow rebuilding the original matrix') # test Lower Triangular L = torch.cholesky(A, False) - B = torch.mm(L, L.t()) + B = torch.mm(L, L.t().conj()) self.assertEqual(A, B, atol=1e-14, rtol=0, msg='cholesky (lower) did not allow rebuilding the original matrix') def test_view(self, device): @@ -9536,20 +9557,26 @@ def test_rpow(self, device): assert m.dim() == 0, "m is intentionally a scalar" self.assertEqual(torch.pow(2, m), 2**m) + @precisionOverride({torch.float32: 1e-5, torch.complex64: 1e-5}) @skipCUDAIfNoMagma @skipCPUIfNoLapack - @dtypes(torch.double) + @dtypesIfCPU(torch.float32, torch.float64, torch.complex64, torch.complex128) + @dtypesIfCUDA(torch.float32, torch.float64) def test_symeig(self, device, dtype): - from torch.testing._internal.common_utils import random_symmetric_matrix + from torch.testing._internal.common_utils import random_hermitian_matrix def run_test(dims, eigenvectors, upper): - x = random_symmetric_matrix(*dims, dtype=dtype, device=device) - oute = torch.empty(dims[1:] + dims[:1], dtype=dtype, device=device) + x = random_hermitian_matrix(*dims, dtype=dtype, device=device) + if dtype.is_complex: + real_dtype = torch.float32 if dtype is torch.complex64 else torch.float64 + else: + real_dtype = dtype + oute = torch.empty(dims[1:] + dims[:1], dtype=real_dtype, device=device) outv = torch.empty(dims[1:] + dims[:1] * 2, dtype=dtype, device=device) torch.symeig(x, eigenvectors=eigenvectors, upper=upper, out=(oute, outv)) if eigenvectors: - x_recon = torch.matmul(torch.matmul(outv, torch.diag_embed(oute)), outv.transpose(-2, -1)) + x_recon = torch.matmul(torch.matmul(outv, torch.diag_embed(oute.to(dtype))), outv.transpose(-2, -1).conj()) self.assertEqual(x, x_recon, atol=1e-8, rtol=0, msg='Incorrect reconstruction using V @ diag(e) @ V.T') else: eigvals, _ = torch.symeig(x, eigenvectors=True, upper=upper) @@ -9561,14 +9588,14 @@ def run_test(dims, eigenvectors, upper): self.assertEqual(resv, outv, msg="outputs of symeig and symeig with out don't match") # test non-contiguous - x = random_symmetric_matrix(*dims, dtype=dtype, device=device) + x = random_hermitian_matrix(*dims, dtype=dtype, device=device) n_dim = len(dims) + 1 # Reverse the batch dimensions and the matrix dimensions and then concat them x = x.permute(tuple(range(n_dim - 3, -1, -1)) + (n_dim - 1, n_dim - 2)) assert not x.is_contiguous(), "x is intentionally non-contiguous" rese, resv = torch.symeig(x, eigenvectors=eigenvectors, upper=upper) if eigenvectors: - x_recon = torch.matmul(torch.matmul(resv, torch.diag_embed(rese)), resv.transpose(-2, -1)) + x_recon = torch.matmul(torch.matmul(resv, torch.diag_embed(rese.to(dtype))), resv.transpose(-2, -1).conj()) self.assertEqual(x, x_recon, atol=1e-8, rtol=0, msg='Incorrect reconstruction using V @ diag(e) @ V.T') else: eigvals, _ = 
torch.symeig(x, eigenvectors=True, upper=upper) @@ -9579,6 +9606,25 @@ def run_test(dims, eigenvectors, upper): for batch_dims, eigenvectors, upper in product(batch_dims_set, (True, False), (True, False)): run_test((5,) + batch_dims, eigenvectors, upper) + # TODO: once there is more support for complex dtypes on GPU, they shall be added to above test + # particularly when RuntimeError: _th_bmm_out not supported on CUDAType for ComplexFloat is fixed + @unittest.expectedFailure + @onlyCUDA + @skipCUDAIfNoMagma + @dtypes(torch.complex64, torch.complex128) + def test_symeig_complex_xfailed(self, device, dtype): + from torch.testing._internal.common_utils import random_hermitian_matrix + + dims = (5, 3) + x = random_hermitian_matrix(*dims, dtype=dtype, device=device) + real_dtype = torch.float32 if dtype is torch.complex64 else torch.float64 + oute = torch.empty(dims[1:] + dims[:1], dtype=real_dtype, device=device) + outv = torch.empty(dims[1:] + dims[:1] * 2, dtype=dtype, device=device) + torch.symeig(x, eigenvectors=eigenvectors, upper=upper, out=(oute, outv)) + + x_recon = torch.matmul(torch.matmul(outv, torch.diag_embed(oute.to(dtype))), outv.transpose(-2, -1).conj()) + self.assertEqual(x, x_recon, atol=1e-8, rtol=0) + @skipCUDAIfNoMagma @skipCPUIfNoLapack @dtypes(torch.double) @@ -10784,15 +10830,6 @@ def assert_tuple_empty(tup, dim): self.assertEqual(1, len(z)) self.assertEqual(torch.empty(0, dtype=torch.long), z[0]) - @onlyOnCPUAndCUDA - def test_nonzero_deprecated(self, device): - x = torch.randn((2, 3), device=device) - with self.maybeWarnsRegex(UserWarning, "This overload of nonzero is deprecated"): - x.nonzero() - - with self.maybeWarnsRegex(UserWarning, "This overload of nonzero is deprecated"): - torch.nonzero(x) - # TODO: add torch.complex64, torch.complex128 @dtypes(torch.float, torch.double) def test_normal(self, device, dtype): @@ -11291,6 +11328,19 @@ def test_signbit_complex(self, device, dtype): with self.assertRaisesRegex(RuntimeError, 'signbit is not implemented for complex tensors.'): torch.signbit(t, out=out) + @dtypes(torch.cfloat, torch.cdouble) + def test_sgn(self, device, dtype): + x = torch.randn(100, dtype=dtype) + angle = x.angle() + out = x.sgn() + self.assertEqual(out.angle(), angle) + self.assertEqual(out.abs(), torch.ones_like(x).real) + + x_out = torch.empty_like(x) + torch.sgn(x, out=x_out) + self.assertEqual(x_out.angle(), angle) + self.assertEqual(x_out.abs(), torch.ones_like(x).real) + @dtypes(*(torch.testing.get_all_dtypes(include_bool=False))) def test_signbit_non_boolean_output(self, device, dtype): # test non-boolean tensors as the `out=` parameters @@ -11648,6 +11698,24 @@ def test_add(self, device): m2 = torch.tensor([3., 4.], dtype=torch.bfloat16) self.assertEqual(m1 + m2, torch.tensor([4., 6.], dtype=torch.bfloat16)) + # different alpha types + m1 = torch.tensor([2 + 3j, 4 + 5j], dtype=torch.complex64, device=device) + m2 = torch.tensor([4 + 5j, 2 + 3j], dtype=torch.complex64, device=device) + # add complex numbers with float alpha + res = torch.add(m1, m2, alpha=0.1) + expected = torch.tensor([2.4000 + 3.5000j, 4.2000 + 5.3000j], dtype=torch.complex64, device=device) + self.assertEqual(res, expected) + + # add complex numbers with complex alpha + res = torch.add(m1, m2, alpha=complex(0.1, 0.2)) + expected = torch.tensor([1.4000 + 4.3000j, 3.6000 + 5.7000j], dtype=torch.complex64, device=device) + self.assertEqual(res, expected) + + # add complex numbers with integer alpha + res = torch.add(m1, m2, alpha=2) + expected = torch.tensor([10. 
+ 13.j, 8. + 11.j], dtype=torch.complex64, device=device) + self.assertEqual(res, expected) + # mismatched alpha m1 = torch.tensor([1], dtype=torch.int8, device=device) m2 = torch.tensor([2], dtype=torch.int8, device=device) @@ -11658,6 +11726,15 @@ def test_add(self, device): r"For integral input tensors, argument alpha must not be a floating point number\.", lambda: torch.add(m1, m2, alpha=1.0)) + # mismatched alpha, float / double tensor and complex alpha + m1 = torch.tensor([3., 4.], device=device) + m2 = torch.tensor([4., 3.], device=device) + self.assertRaises(RuntimeError, lambda: torch.add(m1, m2, alpha=complex(0.1, 0.2))) + + m1 = torch.tensor([3., 4.], dtype=torch.double, device=device) + m2 = torch.tensor([4., 3.], dtype=torch.double, device=device) + self.assertRaises(RuntimeError, lambda: torch.add(m1, m2, alpha=complex(0.1, 0.2))) + # complex m1 = torch.tensor((4.0000 + 4.0000j), dtype=torch.complex64) m2 = torch.tensor(4., dtype=torch.float64) @@ -12660,7 +12737,7 @@ def test_scatter_reduce_non_unique_index(self, device, dtype): input.scatter_(0, index, src, reduce=operation) self.assertEqual(input, result, msg=f"result: {result} input: {input} method: {str(operation)}") - @skipCUDAIfRocm + @skipCUDAIfRocm @onlyOnCPUAndCUDA @dtypesIfCUDA(*(torch.testing.get_all_complex_dtypes() + torch.testing.get_all_int_dtypes())) @@ -13019,10 +13096,6 @@ def gen_nontrivial_input(shape, dtype, device): dst2 = tensor.nonzero(as_tuple=False) dst3 = torch.empty([], dtype=torch.long, device=device) torch.nonzero(tensor, out=dst3) - self.assertRaisesRegex( - TypeError, - "received an invalid combination of arguments", - lambda: torch.nonzero(tensor, as_tuple=True, out=dst3)) if self.device_type != 'xla': # xla does not raise runtime error self.assertRaisesRegex( @@ -13048,6 +13121,37 @@ def gen_nontrivial_input(shape, dtype, device): self.assertEqual(tup1, np_result, atol=0, rtol=0) self.assertEqual(tup2, np_result, atol=0, rtol=0) + def test_nonzero_astuple_out(self, device): + t = torch.randn((3, 3, 3), device=device) + out = torch.empty_like(t, dtype=torch.long) + + with self.assertRaises(RuntimeError): + torch.nonzero(t, as_tuple=True, out=out) + + self.assertEqual(torch.nonzero(t, as_tuple=False, out=out), torch.nonzero(t, out=out)) + + # Verifies that JIT script cannot handle the as_tuple kwarg + # See Issue https://github.com/pytorch/pytorch/issues/45499. 
+ def _foo(t): + tuple_result = torch.nonzero(t, as_tuple=True) + nontuple_result = torch.nonzero(t, as_tuple=False) + out = torch.empty_like(nontuple_result) + torch.nonzero(t, as_tuple=False, out=out) + return tuple_result, nontuple_result, out + + with self.assertRaises(RuntimeError): + scripted_foo = torch.jit.script(_foo) + + # Verifies that JIT tracing works fine + traced_foo = torch.jit.trace(_foo, t) + traced_tuple, traced_nontuple, traced_out = traced_foo(t) + expected_tuple = torch.nonzero(t, as_tuple=True) + expected_nontuple = torch.nonzero(t) + + self.assertEqual(traced_tuple, expected_tuple) + self.assertEqual(traced_nontuple, expected_nontuple) + self.assertEqual(traced_out, expected_nontuple) + @onlyOnCPUAndCUDA def test_nonzero_discontiguous(self, device): shape = (4, 4) @@ -13705,6 +13809,15 @@ def test_float_scalar_pow_float_tensor(self, device): for base in floats: self._test_pow(base, tensor) + @onlyOnCPUAndCUDA + @unittest.skipIf(not TEST_NUMPY, 'Numpy not found') + @dtypes(*(torch.testing.get_all_dtypes(include_bool=False, include_bfloat16=False))) + def test_complex_scalar_pow_tensor(self, device, dtype): + complexes = [0.5j, 1. + 1.j, -1.5j, 2.2 - 1.6j] + tensor = torch.rand(100).to(dtype=dtype, device=device) + for base in complexes: + self._test_pow(base, tensor) + @unittest.skipIf(not TEST_NUMPY, 'Numpy not found') def test_tensor_pow_tensor(self, dev): def rotate(l, n): @@ -13812,15 +13925,6 @@ def call_torch_fn(*args, **kwargs): self.assertEqual(t, fn(torch.addmv, t, (3, 0), (0,))) self.assertEqual(t, fn(torch.addmv, t, (3, 0), (0,), test_out=True)) - # ger, addr - self.assertEqual((0, 0), fn(torch.ger, (0,), (0,)).shape) - self.assertEqual((5, 0), fn(torch.ger, (5,), (0,)).shape) - self.assertEqual((0, 4), fn(torch.ger, (0,), (4,)).shape) - - self.assertEqual((0, 0), fn(torch.addr, (0, 0), (0,), (0,)).shape) - self.assertEqual((5, 0), fn(torch.addr, (5, 0), (5,), (0,)).shape) - self.assertEqual((0, 4), fn(torch.addr, (0, 4), (0,), (4,)).shape) - # bmm, baddbmm self.assertEqual((0, 0, 0), fn(torch.bmm, (0, 0, 0), (0, 0, 0)).shape) self.assertEqual((3, 0, 5), fn(torch.bmm, (3, 0, 0), (3, 0, 5)).shape) @@ -14222,28 +14326,268 @@ def test_binary_op_scalar_device_unspecified(self, devices): self.assertEqual(y1.device, device_obj) self.assertEqual(y0, y1) - # Tests that CPU scalars (including zero dim tensors) can be used in - # binary operations with CUDA tensors. - @onlyCUDA - def test_cuda_cpu_scalar_binary_ops(self, device): - val_scalar = math.pi - val_tensor = torch.tensor(val_scalar) - for op in (operator.add, torch.add, - operator.sub, torch.sub, - operator.mul, torch.mul, - operator.truediv, torch.true_divide, - operator.floordiv, torch.floor_divide): - for tensor_val in (1, (1,)): - t_cuda = torch.tensor(tensor_val, device=device) - t_cpu = t_cuda.cpu() - for val in (val_scalar, val_tensor): - cpu_result = op(t_cpu, val) - cuda_result = op(t_cuda, val) - self.assertEqual(cpu_result, cuda_result) - - reverse_cpu_result = op(val, t_cpu) - reverse_cuda_result = op(val, t_cuda) - self.assertEqual(reverse_cpu_result, reverse_cuda_result) + def test_div_and_floordiv_vs_python(self, device): + # Tests torch division ops which can handle both arguments being + # scalars. + # NOTE: torch.floor_divide currently truncates instead of flooring. + # the quotient. See https://github.com/pytorch/pytorch/issues/43874. 
+        def _scalar_helper(python_op, torch_op):
+            for a, b in product(range(-10, 10), range(-10, 10)):
+                for op in (lambda x: x * .5, lambda x: math.floor(x)):
+                    a = op(a)
+                    b = op(b)
+
+                    # Skips zero divisors
+                    if b == 0:
+                        continue
+
+                    expected = python_op(a, b)
+
+                    for op in (operator.truediv, torch.true_divide):
+                        actual_scalar = torch_op(a, b)
+
+                        a_t = torch.tensor(a, device=device)
+                        b_t = torch.tensor(b, device=device)
+
+                        actual_tensor = torch_op(a_t, b_t)
+                        actual_first_tensor = torch_op(a_t, b)
+                        actual_second_tensor = torch_op(a, b_t)
+
+                        self.assertEqual(actual_scalar, expected)
+                        self.assertEqual(actual_tensor.item(), expected)
+                        self.assertEqual(actual_first_tensor, actual_tensor)
+                        self.assertEqual(actual_second_tensor, actual_tensor)
+
+        _scalar_helper(operator.truediv, operator.truediv)
+        _scalar_helper(operator.truediv, torch.true_divide)
+        _scalar_helper(lambda a, b: math.trunc(a / b), operator.floordiv)
+        _scalar_helper(lambda a, b: math.trunc(a / b), torch.floor_divide)
+
+    # NOTE: torch.floor_divide currently truncates instead of flooring.
+    # See https://github.com/pytorch/pytorch/issues/43874.
+    @onlyOnCPUAndCUDA
+    def test_div_and_floordiv_script_vs_python(self, device):
+        # Creates jitted functions of two tensors
+        def _wrapped_div(a, b):
+            return a / b
+
+        def _wrapped_floordiv(a, b):
+            return a // b
+
+        scripted_div = torch.jit.script(_wrapped_div)
+        scripted_floordiv = torch.jit.script(_wrapped_floordiv)
+        for a, b in product(range(-10, 10), range(-10, 10)):
+            for op in (lambda x: x * .5, lambda x: math.floor(x)):
+                a = op(a)
+                b = op(b)
+
+                # Skips zero divisors
+                if b == 0:
+                    continue
+
+                expected_div = a / b
+                expected_truncdiv = math.trunc(a / b)
+                a_t = torch.tensor(a, device=device)
+                b_t = torch.tensor(b, device=device)
+
+                self.assertEqual(scripted_div(a_t, b_t), expected_div)
+                self.assertEqual(scripted_floordiv(a_t, b_t), expected_truncdiv)
+
+        # Creates jitted functions of one tensor
+        def _wrapped_div_scalar(a):
+            return a / 5
+
+        # NOTE: this will fail when given an integer input, since
+        # the JIT implements division as
+        # torch.reciprocal(a) * 5, and reciprocal is only
+        # implemented for float types.
+        def _wrapped_rdiv_scalar(a):
+            return 5 / a
+
+        def _wrapped_floordiv_scalar(a):
+            return a // 5
+
+        # NOTE: this fails if the input is not an integer tensor
+        # See https://github.com/pytorch/pytorch/issues/45199
+        def _wrapped_rfloordiv_scalar(a):
+            return 5 // a
+
+        scripted_div_scalar = torch.jit.script(_wrapped_div_scalar)
+        scripted_rdiv_scalar = torch.jit.script(_wrapped_rdiv_scalar)
+        scripted_floordiv_scalar = torch.jit.script(_wrapped_floordiv_scalar)
+        scripted_rfloordiv_scalar = torch.jit.script(_wrapped_rfloordiv_scalar)
+
+        for a in range(-10, 10):
+            for op in (lambda x: x * .5, lambda x: math.floor(x)):
+                a = op(a)
+
+                a_t = torch.tensor(a, device=device)
+
+                self.assertEqual(a / 5, scripted_div_scalar(a_t))
+                self.assertEqual(math.trunc(a / 5), scripted_floordiv_scalar(a_t))
+
+                # Skips zero divisors
+                if a == 0:
+                    continue
+
+                if a_t.is_floating_point():
+                    self.assertEqual(5 / a, scripted_rdiv_scalar(a_t))
+                else:
+                    with self.assertRaises(RuntimeError):
+                        scripted_rdiv_scalar(a_t)
+
+
+                # Handles Issue 45199 (see comment above)
+                if a_t.is_floating_point():
+                    with self.assertRaises(RuntimeError):
+                        scripted_rfloordiv_scalar(a_t)
+                else:
+                    self.assertEqual(5 // a, scripted_rfloordiv_scalar(a_t))
+
+    # NOTE: torch.floor_divide currently truncates instead of flooring
+    # the quotient. See https://github.com/pytorch/pytorch/issues/43874.
+    @onlyOnCPUAndCUDA
+    def test_idiv_and_ifloordiv_vs_python(self, device):
+        def _wrapped_idiv_tensor(a, b):
+            a /= b
+            return a
+
+        def _wrapped_idiv_scalar(a):
+            a /= 5
+            return a
+
+        def _wrapped_true_divide__tensor(a, b):
+            a.true_divide_(b)
+            return a
+
+        def _wrapped_true_divide__scalar(a):
+            a.true_divide_(5)
+            return a
+
+        def _wrapped_floor_divide__tensor(a, b):
+            a.floor_divide_(b)
+            return a
+
+        def _wrapped_floor_divide__scalar(a):
+            a.floor_divide_(5)
+            return a
+
+        # The following functions are unsupported by the JIT
+        def _wrapped_ifloordiv_tensor(a, b):
+            a //= b
+            return a
+
+        def _wrapped_ifloordiv_scalar(a):
+            a //= 5
+            return a
+
+        with self.assertRaises(torch.jit.frontend.NotSupportedError):
+            scripted_ifloordiv_tensor = torch.jit.script(_wrapped_ifloordiv_tensor)
+
+        with self.assertRaises(torch.jit.frontend.NotSupportedError):
+            scripted_ifloordiv_scalar = torch.jit.script(_wrapped_ifloordiv_scalar)
+
+        scripted_idiv_tensor = torch.jit.script(_wrapped_idiv_tensor)
+        scripted_idiv_scalar = torch.jit.script(_wrapped_idiv_scalar)
+        scripted_true_divide__tensor = torch.jit.script(_wrapped_true_divide__tensor)
+        scripted_true_divide__scalar = torch.jit.script(_wrapped_true_divide__scalar)
+        scripted_floor_divide__tensor = torch.jit.script(_wrapped_floor_divide__tensor)
+        scripted_floor_divide__scalar = torch.jit.script(_wrapped_floor_divide__scalar)
+
+        for a, b in product(range(-10, 10), range(-10, 10)):
+            for op in (lambda x: x * .5, lambda x: math.floor(x)):
+                a = op(a)
+                b = op(b)
+
+                # Skips zero divisors
+                if b == 0:
+                    continue
+
+                expected_idiv = a / b
+                expected_ifloordiv = a // b
+                expected_itruncdiv = math.trunc(a / b)
+
+                a_t = torch.tensor(a, device=device)
+                b_t = torch.tensor(b, device=device)
+
+                if a_t.is_floating_point():
+                    tmp0 = a_t.clone()
+                    tmp0 /= b
+
+                    tmp1 = a_t.clone()
+                    tmp1 /= b_t
+
+                    self.assertEqual(tmp0.item(), expected_idiv)
+                    self.assertEqual(tmp1.item(), expected_idiv)
+                    self.assertEqual(scripted_true_divide__tensor(a_t.clone(), b_t).item(), expected_idiv)
+                    self.assertEqual(scripted_true_divide__scalar(a_t.clone()).item(), a / 5)
+                else:
+                    tmp = a_t.clone()
+                    with self.assertRaises(RuntimeError):
+                        tmp /= b
+                    with self.assertRaises(RuntimeError):
+                        tmp /= b_t
+                    with self.assertRaises(RuntimeError):
+                        scripted_true_divide__tensor(tmp, b_t)
+                    with self.assertRaises(RuntimeError):
+                        scripted_true_divide__scalar(tmp)
+
+
+                if not a_t.is_floating_point() and b_t.is_floating_point():
+                    # Inplace modification fails because a float tensor is required
+                    # if the divisor is a float tensor
+                    with self.assertRaises(RuntimeError):
+                        a_t.clone().floor_divide_(b_t)
+                    with self.assertRaises(RuntimeError):
+                        scripted_floor_divide__tensor(a_t.clone(), b_t)
+                    tmp = a_t.clone()
+                    with self.assertRaises(RuntimeError):
+                        tmp //= b_t
+                else:
+                    # Inplace modification is OK when both or neither tensor is
+                    # a float tensor
+                    self.assertEqual(a_t.clone().floor_divide_(b_t).item(), expected_itruncdiv)
+                    self.assertEqual(scripted_floor_divide__tensor(a_t.clone(), b_t).item(), expected_itruncdiv)
+                    tmp = a_t.clone()
+                    tmp //= b_t
+                    self.assertEqual(tmp.item(), expected_itruncdiv)
+
+                self.assertEqual(scripted_floor_divide__scalar(a_t), math.trunc(a / 5))
+
+    # Tests binary op equivalence with Python builtin ops
+    # Also tests that reverse operations are equivalent to forward ops
+    # NOTE: division ops are tested separately above
+    def test_binary_ops_with_scalars(self, device):
+        for ops in ((operator.add,
torch.add), + (operator.sub, torch.sub), + (operator.mul, torch.mul), + (operator.truediv, torch.div)): + python_op, torch_op = ops + + for a, b in product(range(-10, 10), range(-10, 10)): + for op in (lambda x: x * .5, lambda x: math.floor(x)): + a = op(a) + b = op(b) + + # Skips zero divisors + if b == 0 or a == 0: + continue + + a_tensor = torch.tensor(a, device=device) + b_tensor = torch.tensor(b, device=device) + a_tensor_cpu = a_tensor.cpu() + b_tensor_cpu = b_tensor.cpu() + vals = (a, b, a_tensor, b_tensor, a_tensor_cpu, b_tensor_cpu) + + for args in product(vals, vals): + first, second = args + + first_scalar = first if not isinstance(first, torch.Tensor) else first.item() + second_scalar = second if not isinstance(second, torch.Tensor) else second.item() + expected = python_op(first_scalar, second_scalar) + + self.assertEqual(expected, python_op(first, second)) + self.assertEqual(expected, torch_op(first, second)) @onlyCUDA def test_ceil_out_mismatch(self, device): @@ -14404,8 +14748,7 @@ def test_topk_integral(self, device, dtype): self.assertEqual(sort_topk, topk[0]) # check values self.assertEqual(sort_topk, a[topk[1]]) # check indices - @dtypesIfCUDA(*([torch.half, torch.float, torch.double] - + ([torch.bfloat16] if TEST_WITH_ROCM else []))) + @dtypesIfCUDA(*torch.testing.get_all_fp_dtypes()) @dtypes(torch.float, torch.double) def test_topk_nonfinite(self, device, dtype): x = torch.tensor([float('nan'), float('inf'), 1e4, 0, -1e4, -float('inf')], device=device, dtype=dtype) @@ -14712,6 +15055,8 @@ def _test_helper(x, y, bias, memory_format): lambda x, y: x.logit_(1e-6), lambda x, y: x.sign(), lambda x, y: x.sign_(), + lambda x, y: x.sgn(), + lambda x, y: x.sgn_(), lambda x, y: x.sin(), lambda x, y: x.sin_(), lambda x, y: x.sinh(), @@ -16317,52 +16662,6 @@ def tracker(worker): ---(input size: {:4}, eigenpairs:{:2}, units: relative error, maxiter={:4})--- '''.format(tol, eq_err, eq_err_general, iters1, eq_err_scipy, eq_err_general_scipy, iters2, m, k, niter)) - @slowTest - @onlyCPU - @dtypes(torch.bfloat16, torch.float, torch.double) - def test_ger(self, device, dtype): - def run_test(v0, v1): - res0 = torch.ger(v0, v1) - res1 = torch.zeros(100, 100, dtype=dtype, device=device) - for i in range(100): - for j in range(100): - res1[i, j] = v0[i] * v1[j] - self.assertEqual(res0, res1) - - v0 = torch.randn(100, dtype=torch.float, device=device).to(dtype=dtype) - v1 = torch.randn(100, dtype=torch.float, device=device).to(dtype=dtype) - run_test(v0, v1) - - # Tests 0-strided - v0 = torch.randn(1, dtype=torch.float, device=device).expand(100).to(dtype=dtype) - v1 = torch.randn(100, dtype=torch.float, device=device).to(dtype=dtype) - run_test(v0, v1) - - @slowTest - @onlyCPU - @dtypes(torch.bfloat16, torch.float, torch.double) - def test_addr(self, device, dtype): - def run_test(m, v1, v2, m_transform=lambda x: x): - m = m_transform(m.clone()) - ref = m.clone() - torch.addr(m, v1, v2, out=m) - for i in range(m.size(0)): - for j in range(m.size(1)): - ref[i, j] += v1[i] * v2[j] - self.assertEqual(m, ref) - - for h, w in [(100, 110), (1, 20), (200, 2)]: - m = torch.randn(h, w, dtype=torch.float, device=device).to(dtype=dtype) - v1 = torch.randn(h, dtype=torch.float, device=device).to(dtype=dtype) - v2 = torch.randn(w, dtype=torch.float, device=device).to(dtype=dtype) - run_test(m, v1, v2) - # test transpose - run_test(m, v2, v1, lambda x: x.transpose(0, 1)) - # test 0 strided - v1 = torch.randn(1, dtype=torch.float, device=device).expand(h).to(dtype=dtype) - run_test(m, v1, v2) - 
run_test(m, v2, v1, lambda x: x.transpose(0, 1)) - def _test_addmm_addmv(self, f, t, m, v, *, alpha=None, beta=None, transpose_out=False): dtype = t.dtype numpy_dtype = dtype @@ -16388,7 +16687,9 @@ def _test_addmm_addmv(self, f, t, m, v, *, alpha=None, beta=None, transpose_out= @precisionOverride({torch.bfloat16: 1e-0, torch.half: 5e-4, torch.float: 1e-4, torch.double: 1e-8, torch.cfloat: 1e-4, torch.cdouble: 1e-8}) - @dtypesIfCUDA(*torch.testing.get_all_complex_dtypes(), *torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM)) + @dtypesIfCUDA(*torch.testing.get_all_complex_dtypes(), + *([torch.float32, torch.float64, torch.bfloat16] + if TEST_WITH_ROCM else torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM))) @dtypes(torch.bfloat16, torch.float, torch.double, torch.cfloat, torch.cdouble) @unittest.skipIf(not TEST_NUMPY, "Numpy not found") def test_addmv(self, device, dtype): @@ -16459,6 +16760,7 @@ def _test(row_major, incx, incy, lda_tail): @dtypesIfCUDA(*torch.testing.get_all_complex_dtypes(), *torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM)) @dtypes(*torch.testing.get_all_complex_dtypes(), *torch.testing.get_all_fp_dtypes()) @unittest.skipIf(not TEST_NUMPY, "Numpy not found") + @tf32_on_and_off(0.05) def test_addmm(self, device, dtype): M = torch.randn(10, 25, device=device).to(dtype) m1 = torch.randn(10, 50, device=device).to(dtype) @@ -16834,6 +17136,15 @@ def test_div(self, device, dtype): atol=0.01, rtol=0) self.assertEqual(method(a1, a2), op(a1, a2)) + @dtypes(torch.bfloat16, torch.float) + def test_true_divide_out(self, device, dtype): + a1 = torch.tensor([4.2, 6.2], dtype=dtype, device=device) + a2 = torch.tensor([2., 2.], dtype=dtype, device=device) + res = torch.empty_like(a1) + self.assertEqual(torch.true_divide(a1, a2, out=res), + torch.tensor([2.1, 3.1], dtype=dtype, device=device), + atol=0.01, rtol=0) + @onlyCUDA @dtypes(torch.half) def test_divmul_scalar(self, device, dtype): @@ -16904,11 +17215,8 @@ def test_rdiv(self, device, dtype): else: x = torch.rand(100, device=device).add(1).mul(4).to(dtype) y = 30 / x - if dtype.is_floating_point or dtype.is_complex: - z = torch.tensor([30 / v.item() for v in x], dtype=dtype, device=device) - else: - z = torch.tensor([math.trunc(30. / v.item()) for v in x], dtype=dtype, device=device) - self.assertEqual(y, z) + z = torch.tensor([30 / v.item() for v in x], device=device) + self.assertEqual(y, z, exact_dtype=False) @onlyCPU @dtypes(*torch.testing.get_all_dtypes(include_bfloat16=False, include_bool=False, include_complex=False)) @@ -19630,8 +19938,6 @@ def test_movedim_view(self, device): # with _float_types when bfloat16 bringup is complete on all platforms _float_types2 = _float_types + [torch.bfloat16] if TEST_WITH_ROCM else _float_types -_complex_and_float_types2 = _float_types2 + _complex_types - _signed_types = [ torch.half, torch.float, torch.double, torch.int8, torch.short, torch.int, torch.long @@ -19642,10 +19948,20 @@ def test_movedim_view(self, device): torch.int8, torch.short, torch.int, torch.long ] +_integer_types = [ + torch.uint8, torch.int8, torch.int16, + torch.int32, torch.int64 +] + _cpu_types: List[torch.dtype] = [] _unsigned_types = [torch.uint8] +# Binary Float Ops +# Operators which use TensorIterator::binary_float_op +# These Ops promote integer inputs to Float. +binary_float_ops_inplace = ['atan2_', 'div_'] + # Helper values and functions for producing tensors and scalars to use in tensor op tests. 
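[Editor's illustrative aside, not part of the patch] The new binary_float_ops_inplace list feeds a special case added further down: ops backed by TensorIterator::binary_float_op promote integer inputs to the default float dtype, so their in-place variants have nowhere to store the promoted result and raise the "result type Float can't be cast to ..." error the harness checks for. A minimal sketch of that behaviour, assuming a build with the integer-to-float promotion these tests exercise:

    import torch

    a = torch.tensor([1, 2, 3], dtype=torch.int32)
    b = torch.tensor([4, 5, 6], dtype=torch.int32)

    # Out-of-place: integer inputs are promoted, so the result is the
    # default float dtype.
    print(torch.atan2(a, b).dtype)  # torch.float32

    # In-place: the promoted float result cannot be cast back into the
    # integer input tensor, so the op raises instead.
    try:
        a.atan2_(b)
    except RuntimeError as err:
        print(err)  # e.g. "result type Float can't be cast to the desired output type Int"
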
# Tensor dimension sizes (Small, Medium, Large, Giant) _S = 5 @@ -19802,20 +20118,21 @@ def inner(self, device, dtype): ('pow', 'tensor', _small_3d, lambda t, d: [_small_3d(t, d).abs()], 1e-1, 1e-1, 1e-5, torch.testing.get_all_fp_dtypes()), ('addbmm', '', _small_2d, lambda t, d: [_small_3d(t, d), _small_3d(t, d)], - 1e-1, 1e-1, 1e-4, _complex_and_float_types2, _cpu_types, True, [tf32_on_and_off(0.005)]), + 1e-1, 1e-1, 1e-4, torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM) + _complex_types, + _cpu_types, True, [tf32_on_and_off(0.01)]), ('addbmm', 'scalar', _small_2d, lambda t, d: [_number(0.4, 2, t), _small_3d(t, d), _small_3d(t, d)], - 1e-1, 1e-1, 1e-4, _complex_and_float_types2, _cpu_types, True, - [tf32_on_and_off(0.005), _wrap_maybe_warns("This overload of addbmm_? is deprecated")]), + 1e-1, 1e-1, 1e-4, torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM) + _complex_types, _cpu_types, True, + [tf32_on_and_off(0.01), _wrap_maybe_warns("This overload of addbmm_? is deprecated")]), ('addbmm', 'two_scalars', _small_2d, lambda t, d: [_number(0.5, 3, t), _number(0.4, 2, t), _small_3d(t, d), _small_3d(t, d)], - 1e-1, 1e-1, 1e-4, _complex_and_float_types2, _cpu_types, True, - [tf32_on_and_off(0.005), _wrap_maybe_warns("This overload of addbmm_? is deprecated")]), + 1e-1, 1e-1, 1e-4, torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM) + _complex_types, _cpu_types, True, + [tf32_on_and_off(0.01), _wrap_maybe_warns("This overload of addbmm_? is deprecated")]), ('baddbmm', '', _small_3d, lambda t, d: [_small_3d(t, d), _small_3d(t, d)], - 1e-2, 1e-1, 1e-4, _float_types2), + 1e-2, 1e-1, 1e-4, torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM)), ('baddbmm', 'scalar', _small_3d, lambda t, d: [_number(0.4, 2, t), _small_3d(t, d), _small_3d(t, d)], - 1e-2, 1e-1, 1e-4, _float_types2, _cpu_types, True, + 1e-2, 1e-1, 1e-4, torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM), _cpu_types, True, [_wrap_maybe_warns("This overload of baddbmm_? is deprecated")]), ('baddbmm', 'two_scalars', _small_3d, lambda t, d: [_number(0.5, 3, t), _number(0.4, 2, t), _small_3d(t, d), _small_3d(t, d)], - 1e-2, 1e-1, 1e-4, _float_types2, _cpu_types, True, + 1e-2, 1e-1, 1e-4, torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM), _cpu_types, True, [_wrap_maybe_warns("This overload of baddbmm_? is deprecated")]), ('bmm', '', _small_3d, lambda t, d: [_small_3d(t, d)], 1e-5, 1e-5, 1e-5, _float_types_no_half, _cpu_types, False), @@ -19835,37 +20152,27 @@ def inner(self, device, dtype): [_wrap_maybe_warns("This overload of addcmul_? is deprecated")]), ('addmm', '', _medium_2d, lambda t, d: [_medium_2d(t, d), _medium_2d(t, d)], 1e-1, 1e-1, 1e-4, torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM), - _cpu_types, True, [tf32_on_and_off(0.005)], 0, True), + _cpu_types, True, [tf32_on_and_off(0.01)], 0, True), ('addmm', 'scalar', _medium_2d, lambda t, d: [_number(0.4, 2, t), _medium_2d(t, d), _medium_2d(t, d)], 1e-1, 1e-1, 1e-4, torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM), _cpu_types, True, - [tf32_on_and_off(0.005), _wrap_maybe_warns("This overload of addmm_? is deprecated")]), + [tf32_on_and_off(0.01), _wrap_maybe_warns("This overload of addmm_? 
is deprecated")]), ('addmm', 'two_scalars', _medium_2d, lambda t, d: [_number(0.5, 3, t), _number(0.4, 2, t), _medium_2d(t, d), _medium_2d(t, d)], 1e-1, 1e-1, 1e-4, torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM), _cpu_types, True, - [tf32_on_and_off(0.005), _wrap_maybe_warns("This overload of addmm_? is deprecated")]), + [tf32_on_and_off(0.01), _wrap_maybe_warns("This overload of addmm_? is deprecated")]), ('addmv', '', _medium_1d, lambda t, d: [_medium_2d(t, d), _medium_1d(t, d)], 1e-2, 1e-1, 1e-4, torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM) + _complex_types_skip_rocm, _cpu_types, - True, [tf32_on_and_off(0.005)], 0, True), + True, [], 0, True), ('addmv', 'scalar', _medium_1d, - lambda t, d: [_number(0.4, 2, t), _medium_2d(t, d), _medium_1d(t, d)], 1e-2, 1e-1, 1e-4, + lambda t, d: [_number(0.4, 2, t), _medium_2d(t, d), _medium_1d(t, d)], 1e-2, 1e-1, 1e-4, torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM) + _complex_types_skip_rocm, _cpu_types, True, - [tf32_on_and_off(0.005), _wrap_maybe_warns("This overload of addmv_? is deprecated")]), + [_wrap_maybe_warns("This overload of addmv_? is deprecated")]), ('addmv', 'two_scalars', _medium_1d, lambda t, d: [_number(0.5, 3, t), _number(0.4, 2, t), _medium_2d(t, d), _medium_1d(t, d)], 1e-2, 1e-1, 1e-4, torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM) + _complex_types_skip_rocm, _cpu_types, True, - [tf32_on_and_off(0.005), _wrap_maybe_warns("This overload of addmv_? is deprecated")]), - ('addr', '', _medium_2d, lambda t, d: [_medium_1d(t, d), _medium_1d(t, d)], - 1e-2, 1e-1, 1e-4, _float_types2), - ('addr', 'scalar', _medium_2d, - lambda t, d: [_number(0.4, 2, t), _medium_1d(t, d), _medium_1d(t, d)], - 1e-2, 1e-1, 1e-4, _float_types2, _cpu_types, True, - [_wrap_maybe_warns("This overload of addr_? is deprecated")]), - ('addr', 'two_scalars', _medium_2d, - lambda t, d: [_number(0.5, 3, t), _number(0.4, 2, t), _medium_1d(t, d), _medium_1d(t, d)], - 1e-2, 1e-1, 1e-4, _float_types2, _cpu_types, True, - [_wrap_maybe_warns("This overload of addr_? is deprecated")]), - ('atan2', '', _medium_2d, lambda t, d: [_medium_2d(t, d)], 1e-2, 1e-5, 1e-5, _float_types), + [_wrap_maybe_warns("This overload of addmv_? 
is deprecated")]), + ('atan2', '', _medium_2d, lambda t, d: [_medium_2d(t, d)], 1e-2, 1e-5, 1e-5, _types, _types_no_half), ('angle', '', _small_3d, lambda t, d: [], 0, 0, 0, _types_no_half, [torch.bfloat16], False), ('fmod', 'value', _small_3d, lambda t, d: [3], 1e-3), ('fmod', 'tensor', _small_3d, lambda t, d: [_small_3d(t, d, has_zeros=False)], 1e-3), @@ -20039,11 +20346,11 @@ def inner(self, device, dtype): ('transpose', 'neg_dim', _new_t((1, 2, 3, 4)), lambda t, d: [-1, -2], ), ('tolist', '', _small_3d, lambda t, d: [], 1e-5, 1e-5, 1e-5, _types, _cpu_types, False), ('topk', 'dim_sort', _small_3d_unique, lambda t, d: [2, 1, False, True], - 1e-5, 1e-5, 1e-5, _types2, _cpu_types, False), + 1e-5, 1e-5, 1e-5, torch.testing.get_all_dtypes(include_complex=False, include_bool=False), _cpu_types, False), ('topk', 'neg_dim_sort', _small_3d_unique, lambda t, d: [2, -1, False, True], - 1e-5, 1e-5, 1e-5, _types2, _cpu_types, False), + 1e-5, 1e-5, 1e-5, torch.testing.get_all_dtypes(include_complex=False, include_bool=False), _cpu_types, False), ('topk', 'dim_desc_sort', _small_3d_unique, lambda t, d: [2, 1, True, True], - 1e-5, 1e-5, 1e-5, _types2, _cpu_types, False), + 1e-5, 1e-5, 1e-5, torch.testing.get_all_dtypes(include_complex=False, include_bool=False), _cpu_types, False), ('trace', '', _medium_2d, lambda t, d: [], 1e-3, 1e-5, 1e-5, _types, _cpu_types, False), ('tril', '', _medium_2d, lambda t, d: [],), ('tril', 'zero_stride', _medium_2d, lambda t, d: [], 1e-5, 1e-5, 1e-5, _types, _cpu_types, False), @@ -20103,7 +20410,7 @@ def inner(self, device, dtype): ('sigmoid', '', _small_3d, lambda t, d: [], 1e-3, 1e-2, 1e-5, torch.testing.get_all_fp_dtypes()), ('logit', '', _small_3d, lambda t, d: [], 1e-3, 1e-2, 1e-5, torch.testing.get_all_fp_dtypes()), ('sqrt', '', _small_3d, lambda t, d: [], 1e-3, 1e-2, 1e-5, torch.testing.get_all_fp_dtypes(), [torch.bfloat16]), - ('tanh', '', _small_3d, lambda t, d: [], 1e-3, 1e-2, 1e-5, + ('tanh', '', _small_3d, lambda t, d: [], 1e-3, 1e-2, 1e-5, torch.testing.get_all_fp_dtypes() + _complex_types, [torch.bfloat16]), ('asin', '', _small_3d, lambda t, d: [], 1e-3, 1e-2, 1e-5, _float_types, [torch.bfloat16]), ('atan', '', _small_3d, lambda t, d: [], 1e-3, 1e-2, 1e-5, _float_types, [torch.bfloat16]), @@ -20162,6 +20469,15 @@ def fn(self, device, dtype) -> None: (isinstance(arg, torch.Tensor) and arg.dtype == torch.float) else arg for arg in device_args] + # Special case for binary float ops (binary ops that promote int to float) + if op_str in binary_float_ops_inplace and \ + 'inplace' in subtest_str and dtype in _integer_types: + with self.assertRaisesRegex(RuntimeError, "result type Float can't be cast to "): + cpu_result = getattr(cpu_tensor, op_str)(*cpu_args) + with self.assertRaisesRegex(RuntimeError, "result type Float can't be cast to "): + device_result = getattr(device_tensor, op_str)(*device_args) + return # Nothing more to check + # Runs the tensor op on CPU and device cpu_result = getattr(cpu_tensor, op_str)(*cpu_args) device_result = getattr(device_tensor, op_str)(*device_args) @@ -20531,7 +20847,7 @@ def _test_svd_helper(self, shape, some, col_maj, device, dtype): @skipCUDAIfNoMagma @skipCPUIfNoLapack - @dtypes(*_float_types_no_half) + @dtypes(*(_float_types_no_half + _complex_types)) def test_svd_square(self, device, dtype): self._test_svd_helper((10, 10), True, False, device, dtype) diff --git a/test/test_type_promotion.py b/test/test_type_promotion.py index 9ee90c7cbcd8..7f10915a5ac4 100644 --- a/test/test_type_promotion.py +++ 
b/test/test_type_promotion.py @@ -7,7 +7,7 @@ from torch.testing._internal.common_utils import (TestCase, run_tests, load_tests, TEST_NUMPY, torch_to_numpy_dtype_dict) from torch.testing._internal.common_device_type import (instantiate_device_type_tests, onlyOnCPUAndCUDA, - dtypes, onlyCPU) + dtypes, dtypesIfCUDA, onlyCPU) if TEST_NUMPY: import numpy as np @@ -958,6 +958,37 @@ def test_computation_ignores_out(self, device): self.assertEqual(result, a - b, exact_dtype=False) self.assertNotEqual(result, a.double() - b, exact_dtype=False) + @dtypesIfCUDA(*itertools.product(torch.testing.get_all_dtypes(include_bfloat16=False, include_complex=False), + torch.testing.get_all_dtypes(include_bfloat16=False, include_complex=False))) + @dtypes(*itertools.product(torch.testing.get_all_dtypes(include_half=False, include_bfloat16=False, + include_complex=False), + torch.testing.get_all_dtypes(include_half=False, include_bfloat16=False, + include_complex=False))) + def test_atan2_type_promotion(self, device, dtypes): + dtype1, dtype2 = dtypes + default_float = torch.get_default_dtype() + + def is_int(dtype): + return dtype in torch.testing.get_all_int_dtypes() + [torch.bool] + + def is_float(dtype): + return dtype in torch.testing.get_all_fp_dtypes(include_half=True, include_bfloat16=False) + + def get_binary_float_result_type(x, y): + dtype1 = x.dtype + dtype2 = y.dtype + if is_float(dtype1) and is_float(dtype2): + return torch.result_type(x, y) + elif is_float(dtype1) and is_int(dtype2): + return dtype1 + elif is_int(dtype1) and is_float(dtype2): + return dtype2 + elif is_int(dtype1) and is_int(dtype2): + return default_float + + x = torch.tensor(1, dtype=dtype1, device=device) + y = torch.tensor(2, dtype=dtype2, device=device) + self.assertEqual(get_binary_float_result_type(x, y), torch.atan2(x, y).dtype) instantiate_device_type_tests(TestTypePromotion, globals()) diff --git a/test/test_unary_ufuncs.py b/test/test_unary_ufuncs.py index 09a3cbd583a7..ddc735199f2d 100644 --- a/test/test_unary_ufuncs.py +++ b/test/test_unary_ufuncs.py @@ -1,6 +1,7 @@ import math from itertools import product, chain from numbers import Number +import random import unittest @@ -377,6 +378,41 @@ def test_batch_vs_slicing(self, device, dtype, op): self.assertEqual(actual, expected) + @dtypes(*(torch.testing.get_all_int_dtypes() + torch.testing.get_all_fp_dtypes(include_bfloat16=False))) + def test_nan_to_num(self, device, dtype): + for contiguous in [False, True]: + x = make_tensor((64, 64), low=0., high=100., dtype=dtype, device=device) + + if dtype.is_floating_point: + # Add extremal values. 
+ extremals = [float('nan'), float('inf'), -float('inf')] + for idx, extremal in zip(torch.randint(0, 63, (3,)), extremals): + x[idx, :] = extremal + + if not contiguous: + x = x.T + + # With args + nan = random.random() + posinf = random.random() * 5 + neginf = random.random() * 10 + + self.compare_with_numpy(lambda x: x.nan_to_num(nan=nan, posinf=posinf), + lambda x: np.nan_to_num(x, nan=nan, posinf=posinf), + x) + self.compare_with_numpy(lambda x: x.nan_to_num(posinf=posinf, neginf=neginf), + lambda x: np.nan_to_num(x, posinf=posinf, neginf=neginf), + x) + + # Out Variant + out = torch.empty_like(x) + result = torch.nan_to_num(x) + torch.nan_to_num(x, out=out) + self.assertEqual(result, out) + + result = torch.nan_to_num(x, nan=nan, posinf=posinf, neginf=neginf) + torch.nan_to_num(x, out=out, nan=nan, posinf=posinf, neginf=neginf) + self.assertEqual(result, out) instantiate_device_type_tests(TestUnaryUfuncs, globals()) diff --git a/test/test_utils.py b/test/test_utils.py index bf002541bebf..11b4337b4768 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -4,18 +4,20 @@ import shutil import random import tempfile +import textwrap import unittest import torch import torch.nn as nn import torch.utils.data import torch.cuda from torch.utils.checkpoint import checkpoint, checkpoint_sequential -import torch.utils._benchmark as benchmark_utils +import torch.utils.benchmark as benchmark_utils import torch.hub as hub from torch.autograd._functions.utils import check_onnx_broadcast from torch.onnx.symbolic_opset9 import _prepare_onnx_paddings -from torch.testing._internal.common_utils import load_tests, retry, IS_SANDCASTLE, IS_WINDOWS +from torch.testing._internal.common_utils import load_tests, retry, IS_SANDCASTLE, IS_WINDOWS, slowTest from urllib.error import URLError +import numpy as np # load_tests from torch.testing._internal.common_utils is used to automatically filter tests for # sharding on sandcastle. This line silences flake warnings @@ -621,34 +623,293 @@ def test_timer(self): timer = benchmark_utils.Timer( stmt="torch.ones(())", ) - median = timer.blocked_autorange(min_run_time=0.1).median + sample = timer.timeit(5).median + self.assertIsInstance(sample, float) + + median = timer.blocked_autorange(min_run_time=0.01).median self.assertIsInstance(median, float) + # We set a very high threshold to avoid flakiness in CI. 
+ # The internal algorithm is tested in `test_adaptive_timer` + median = timer.adaptive_autorange(threshold=0.5).median + + class _MockTimer: + _seed = 0 + + _timer_noise_level = 0.05 + _timer_cost = 100e-9 # 100 ns + + _function_noise_level = 0.05 + _function_costs = ( + ("pass", 8e-9), + ("cheap_fn()", 4e-6), + ("expensive_fn()", 20e-6), + ) + + def __init__(self, stmt, setup, timer, globals): + self._random_state = np.random.RandomState(seed=self._seed) + self._mean_cost = {k: v for k, v in self._function_costs}[stmt] + + def sample(self, mean, noise_level): + return max(self._random_state.normal(mean, mean * noise_level), 5e-9) + + def timeit(self, number): + return sum([ + # First timer invocation + self.sample(self._timer_cost, self._timer_noise_level), + + # Stmt body + self.sample(self._mean_cost * number, self._function_noise_level), + + # Second timer invocation + self.sample(self._timer_cost, self._timer_noise_level), + ]) + def test_adaptive_timer(self): - # Validate both on different sizes validate against blocked_autorange - # This looks for relative differences btetween orders of magnitude to - # provide a stable/portable test which is somewhat informative. - timer = benchmark_utils.Timer( - stmt="torch.sum(torch.ones((10,10)))", + class MockTimer(benchmark_utils.Timer): + _timer_cls = self._MockTimer + + def assert_reprs_match(measurement, expected): + measurement_repr = re.sub( + "object at 0x[0-9a-fA-F]+>", + "object at 0xXXXXXXXXXXXX>", + repr(measurement) + ) + self.assertEqual(measurement_repr, textwrap.dedent(expected).strip()) + + assert_reprs_match( + MockTimer("pass").blocked_autorange(min_run_time=10), + """ + + pass + Median: 7.98 ns + IQR: 0.52 ns (7.74 to 8.26) + 125 measurements, 10000000 runs per measurement, 1 thread""" ) - small = timer.adaptive_autorange(min_run_time=0.1, max_run_time=1.0) - timer = benchmark_utils.Timer( - stmt="torch.sum(torch.ones((500,500)))", + + assert_reprs_match( + MockTimer("pass").adaptive_autorange(), + """ + + pass + Median: 7.86 ns + IQR: 0.71 ns (7.63 to 8.34) + 6 measurements, 1000000 runs per measurement, 1 thread""" + ) + + assert_reprs_match( + MockTimer("cheap_fn()").blocked_autorange(min_run_time=10), + """ + + cheap_fn() + Median: 3.98 us + IQR: 0.27 us (3.85 to 4.12) + 252 measurements, 10000 runs per measurement, 1 thread""" + ) + + assert_reprs_match( + MockTimer("cheap_fn()").adaptive_autorange(), + """ + + cheap_fn() + Median: 4.16 us + IQR: 0.22 us (4.04 to 4.26) + 4 measurements, 1000 runs per measurement, 1 thread""" + ) + + assert_reprs_match( + MockTimer("expensive_fn()").blocked_autorange(min_run_time=10), + """ + + expensive_fn() + Median: 19.97 us + IQR: 1.35 us (19.31 to 20.65) + 501 measurements, 1000 runs per measurement, 1 thread""" + ) + + assert_reprs_match( + MockTimer("expensive_fn()").adaptive_autorange(), + """ + + expensive_fn() + Median: 20.79 us + IQR: 1.09 us (20.20 to 21.29) + 4 measurements, 1000 runs per measurement, 1 thread""" + ) + + class _MockCudaTimer(self._MockTimer): + # torch.cuda.synchronize is much more expensive than + # just timeit.default_timer + _timer_cost = 10e-6 + + _function_costs = ( + self._MockTimer._function_costs[0], + self._MockTimer._function_costs[1], + + # GPU should be faster once there is enough work. 
+ ("expensive_fn()", 5e-6), + ) + + class MockCudaTimer(benchmark_utils.Timer): + _timer_cls = _MockCudaTimer + + configurations = ( + (7.9903966e-09, 376, 1000000, MockTimer("pass")), + (7.8554826e-09, 4, 100000000, MockCudaTimer("pass")), + (3.9930536e-06, 752, 1000, MockTimer("cheap_fn()")), + (3.9441239e-06, 8, 100000, MockCudaTimer("cheap_fn()")), + (1.9994249e-05, 150, 1000, MockTimer("expensive_fn()")), + (4.9301076e-06, 6, 100000, MockCudaTimer("expensive_fn()")), ) - medium = timer.adaptive_autorange(min_run_time=0.1, max_run_time=1.0) - blocked_medium = timer.blocked_autorange(min_run_time=0.1) - self.assertLess(small.median, medium.median) - # This acts as a control to compare to a different way to measure the same value. - self.assertLess(small.median, blocked_medium.median) + + for median, repeats, number_per_run, timer_instance in configurations: + measurement = timer_instance.blocked_autorange(min_run_time=3) + self.assertEqual(measurement.median, median) + self.assertEqual(len(measurement.times), repeats) + self.assertEqual(measurement.number_per_run, number_per_run) + + @slowTest + @unittest.skipIf(IS_WINDOWS, "Valgrind is not supported on Windows.") + def test_collect_callgrind(self): + timer = benchmark_utils.Timer("y = torch.ones((1,)) + 1") + + # Don't collect baseline to speed up unit test by ~30 seconds. + stats = timer.collect_callgrind(number=1000, collect_baseline=False) + + self.assertIsInstance(stats.counts(include_lookdict_unicode=False), int) def test_compare(self): - compare = benchmark_utils.Compare([ - benchmark_utils.Timer( - "torch.ones((n,))", globals={"n": n}, - description="ones", label=str(n)).timeit(3) - for n in range(3) - ]) - compare.print() + # Simulate several approaches. + costs = ( + # overhead_optimized_fn() + (1e-6, 1e-9), + + # compute_optimized_fn() + (3e-6, 5e-10), + + # special_case_fn() [square inputs only] + (1e-6, 4e-10), + ) + + sizes = ( + (16, 16), + (16, 128), + (128, 128), + (4096, 1024), + (2048, 2048), + ) + + # overhead_optimized_fn() + class _MockTimer_0(self._MockTimer): + _function_costs = tuple( + (f"fn({i}, {j})", costs[0][0] + costs[0][1] * i * j) + for i, j in sizes + ) + + class MockTimer_0(benchmark_utils.Timer): + _timer_cls = _MockTimer_0 + + # compute_optimized_fn() + class _MockTimer_1(self._MockTimer): + _function_costs = tuple( + (f"fn({i}, {j})", costs[1][0] + costs[1][1] * i * j) + for i, j in sizes + ) + + class MockTimer_1(benchmark_utils.Timer): + _timer_cls = _MockTimer_1 + + # special_case_fn() + class _MockTimer_2(self._MockTimer): + _function_costs = tuple( + (f"fn({i}, {j})", costs[2][0] + costs[2][1] * i * j) + for i, j in sizes if i == j + ) + + class MockTimer_2(benchmark_utils.Timer): + _timer_cls = _MockTimer_2 + + results = [] + for i, j in sizes: + results.append( + MockTimer_0( + f"fn({i}, {j})", + label="fn", + description=f"({i}, {j})", + sub_label="overhead_optimized", + ).blocked_autorange(min_run_time=10) + ) + + results.append( + MockTimer_1( + f"fn({i}, {j})", + label="fn", + description=f"({i}, {j})", + sub_label="compute_optimized", + ).blocked_autorange(min_run_time=10) + ) + + if i == j: + results.append( + MockTimer_2( + f"fn({i}, {j})", + label="fn", + description=f"({i}, {j})", + sub_label="special_case (square)", + ).blocked_autorange(min_run_time=10) + ) + + def check_output(output: str, expected: str): + # VSCode will strip trailing newlines from `expected`, so we have to match + # this behavior when comparing output. 
+ output_str = "\n".join( + i.rstrip() for i in output.strip().splitlines(keepends=False)) + + self.assertEqual(output_str, textwrap.dedent(expected).strip()) + + compare = benchmark_utils.Compare(results) + + check_output( + str(compare), + """ + [------------------------------------------------- fn ------------------------------------------------] + | (16, 16) | (16, 128) | (128, 128) | (4096, 1024) | (2048, 2048) + 1 threads: -------------------------------------------------------------------------------------------- + overhead_optimized | 1.3 | 3.0 | 17.4 | 4174.4 | 4174.4 + compute_optimized | 3.1 | 4.0 | 11.2 | 2099.3 | 2099.3 + special_case (square) | 1.1 | | 7.5 | | 1674.7 + + Times are in microseconds (us).""" + ) + + compare.trim_significant_figures() + check_output( + str(compare), + """ + [------------------------------------------------- fn ------------------------------------------------] + | (16, 16) | (16, 128) | (128, 128) | (4096, 1024) | (2048, 2048) + 1 threads: -------------------------------------------------------------------------------------------- + overhead_optimized | 1 | 3.0 | 17 | 4200 | 4200 + compute_optimized | 3 | 4.0 | 11 | 2100 | 2100 + special_case (square) | 1 | | 8 | | 1700 + + Times are in microseconds (us).""" + ) + + compare.colorize() + check_output( + str(compare), + """ + [------------------------------------------------- fn ------------------------------------------------] + | (16, 16) | (16, 128) | (128, 128) | (4096, 1024) | (2048, 2048) + 1 threads: -------------------------------------------------------------------------------------------- + overhead_optimized | 1 | \x1b[92m\x1b[1m 3.0 \x1b[0m\x1b[0m | \x1b[2m\x1b[91m 17 \x1b[0m\x1b[0m | 4200 | \x1b[2m\x1b[91m 4200 \x1b[0m\x1b[0m + compute_optimized | \x1b[2m\x1b[91m 3 \x1b[0m\x1b[0m | 4.0 | 11 | \x1b[92m\x1b[1m 2100 \x1b[0m\x1b[0m | 2100 + special_case (square) | \x1b[92m\x1b[1m 1 \x1b[0m\x1b[0m | | \x1b[92m\x1b[1m 8 \x1b[0m\x1b[0m | | \x1b[92m\x1b[1m 1700 \x1b[0m\x1b[0m + + Times are in microseconds (us).""" # noqa + ) + @unittest.skipIf(IS_WINDOWS and os.getenv("VC_YEAR") == "2019", "Random seed only accepts int32") def test_fuzzer(self): @@ -671,5 +932,13 @@ def test_fuzzer(self): x, torch.Tensor(expected_results[i]), rtol=1e-3, atol=1e-3) +class TestAssert(TestCase): + def test_assert_true(self): + # verify assertions work as expected + torch.Assert(True, "foo") + with self.assertRaisesRegex(AssertionError, "bar"): + torch.Assert(False, "bar") + + if __name__ == '__main__': run_tests() diff --git a/test/test_xnnpack_integration.py b/test/test_xnnpack_integration.py index a40ec48f2f37..56c44b904b47 100644 --- a/test/test_xnnpack_integration.py +++ b/test/test_xnnpack_integration.py @@ -12,10 +12,12 @@ import io import itertools +from torch.testing._internal.common_utils import TEST_WITH_TSAN @unittest.skipUnless(torch.backends.xnnpack.enabled, " XNNPACK must be enabled for these tests." " Please build with USE_XNNPACK=1.") +@unittest.skipIf(TEST_WITH_TSAN, "TSAN fails with XNNPACK. Does not seem to have a good reason for failures.") class TestXNNPACKOps(TestCase): @given(batch_size=st.integers(0, 3), data_shape=hu.array_shapes(1, 3, 2, 64), @@ -161,6 +163,7 @@ def test_conv2d_transpose(self, @unittest.skipUnless(torch.backends.xnnpack.enabled, " XNNPACK must be enabled for these tests." " Please build with USE_XNNPACK=1.") +@unittest.skipIf(TEST_WITH_TSAN, "TSAN fails with XNNPACK. 
Does not seem to have a good reason for failures.") class TestXNNPACKSerDes(TestCase): @given(batch_size=st.integers(0, 3), data_shape=hu.array_shapes(1, 3, 2, 64), @@ -551,6 +554,7 @@ def forward(self, x): @unittest.skipUnless(torch.backends.xnnpack.enabled, " XNNPACK must be enabled for these tests." " Please build with USE_XNNPACK=1.") +@unittest.skipIf(TEST_WITH_TSAN, "TSAN fails with XNNPACK. Does not seem to have a good reason for failures.") class TestXNNPACKRewritePass(TestCase): @staticmethod def validate_transformed_module( @@ -911,6 +915,7 @@ def forward(self, x): @unittest.skipUnless(torch.backends.xnnpack.enabled, " XNNPACK must be enabled for these tests." " Please build with USE_XNNPACK=1.") +@unittest.skipIf(TEST_WITH_TSAN, "TSAN is not fork-safe since we're forking in a multi-threaded environment") class TestXNNPACKConv1dTransformPass(TestCase): @staticmethod def validate_transform_conv1d_to_conv2d( diff --git a/third_party/fbgemm b/third_party/fbgemm index 1d710393d5b7..fe9164007c33 160000 --- a/third_party/fbgemm +++ b/third_party/fbgemm @@ -1 +1 @@ -Subproject commit 1d710393d5b7588f5de3b83f51c22bbddf095229 +Subproject commit fe9164007c3392a12ea51a19b0f4e9f40d24f88d diff --git a/third_party/fmt b/third_party/fmt index 9bdd1596cef1..cd4af11efc9c 160000 --- a/third_party/fmt +++ b/third_party/fmt @@ -1 +1 @@ -Subproject commit 9bdd1596cef1b57b9556f8bef32dc4a32322ef3e +Subproject commit cd4af11efc9c622896a3e4cb599fa28668ca3d05 diff --git a/third_party/foxi b/third_party/foxi index 9ca418d2f4bc..4aba696ec8f3 160000 --- a/third_party/foxi +++ b/third_party/foxi @@ -1 +1 @@ -Subproject commit 9ca418d2f4bc8e022d843388afa0fd0a14bd57dc +Subproject commit 4aba696ec8f31794fd42880346dc586486205e0a diff --git a/third_party/nccl/nccl b/third_party/nccl/nccl index 195232556936..033d799524fb 160000 --- a/third_party/nccl/nccl +++ b/third_party/nccl/nccl @@ -1 +1 @@ -Subproject commit 195232556936b39b01cc908296e1650b80d4a3e9 +Subproject commit 033d799524fb97629af5ac2f609de367472b2696 diff --git a/third_party/tensorpipe b/third_party/tensorpipe index 9646e1a43199..95ff9319161f 160000 --- a/third_party/tensorpipe +++ b/third_party/tensorpipe @@ -1 +1 @@ -Subproject commit 9646e1a431997edb1579972cef196d8fb97a77a5 +Subproject commit 95ff9319161fcdb3c674d2bb63fac3e94095b343 diff --git a/third_party/valgrind b/third_party/valgrind new file mode 160000 index 000000000000..2593ccd82c18 --- /dev/null +++ b/third_party/valgrind @@ -0,0 +1 @@ +Subproject commit 2593ccd82c189bf40b60a3a4934c5d0bbdb75427 diff --git a/tools/amd_build/build_amd.py b/tools/amd_build/build_amd.py index acecbe737e6d..026293a9281a 100755 --- a/tools/amd_build/build_amd.py +++ b/tools/amd_build/build_amd.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -from __future__ import absolute_import, division, print_function + import os import argparse import sys diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index 9ee296e83035..2af8ee81604e 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -159,7 +159,7 @@ # NB: The parameter names here MUST be consistent with the parameter names # in Decalarations.yaml - name: abs(Tensor self) -> Tensor - self: grad * self.sign() + self: grad * self.sgn() - name: acos(Tensor self) -> Tensor self: grad * -((-self * self + 1).rsqrt()) @@ -397,19 +397,19 @@ # of the higher order derivatives, see https://github.com/pytorch/pytorch/issues/43414 # Note that we don't use "result" because saving it would be BC-breaking when it is used in an 
inplace operation later - name: div.Tensor(Tensor self, Tensor other) -> Tensor - self: grad / other - other: -grad * (self / other) / other + self: div_tensor_self_backward(grad, other, self.scalar_type()) + other: div_tensor_other_backward(grad, self, other) - name: div.Scalar(Tensor self, Scalar other) -> Tensor - self: grad / other + self: div_tensor_self_backward(grad, at::scalar_to_tensor(other), self.scalar_type()) - name: dot(Tensor self, Tensor tensor) -> Tensor - self: grad * tensor - tensor: grad * self + self: handle_r_to_c(self.scalar_type(), grad * tensor.conj()) + tensor: handle_r_to_c(tensor.scalar_type(), grad * self.conj()) - name: vdot(Tensor self, Tensor other) -> Tensor - self: 'not_implemented("vdot: self")' - other: 'not_implemented("vdot: other")' + self: handle_r_to_c(self.scalar_type(), grad.conj() * other) + other: handle_r_to_c(other.scalar_type(), grad * self) - name: _fused_dropout(Tensor self, float p, Generator? generator=None) -> (Tensor, Tensor) self: _fused_dropout_backward(grad, result1, p) @@ -749,6 +749,9 @@ - name: mvlgamma(Tensor self, int p) -> Tensor self: mvlgamma_backward(grad, self, p) +- name: nan_to_num(Tensor self, float? nan=None, float? posinf=None, float? neginf=None) -> Tensor + self: grad * at::isfinite(self) + - name: native_batch_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps) -> (Tensor, Tensor, Tensor) input, weight, bias: "grad.defined() ? native_batch_norm_backward(grad, input, weight, running_mean, running_var, result1, result2, training, eps, grad_input_mask) : std::tuple()" @@ -887,7 +890,7 @@ self: renorm_backward(grad, self, p, dim, maxnorm) - name: repeat(Tensor self, int[] repeats) -> Tensor - self: repeat_backward(grad, self.dim(), repeats) + self: repeat_backward(grad, repeats, self.sizes()) # DO NOT define a backward for reshape! # reshape is special in that it sometimes returns a view, and sometimes not. 
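[Editor's illustrative aside, not part of the patch] The derivative entry added above for nan_to_num is just a mask: grad * at::isfinite(self) passes the incoming gradient through for finite inputs and zeroes it wherever the input was nan or +/-inf. A small sketch of what that means, assuming a build that already contains the new op:

    import torch

    x = torch.tensor([1.0, float('nan'), float('inf'), -2.0], requires_grad=True)
    y = torch.nan_to_num(x, nan=0.0, posinf=100.0, neginf=-100.0)
    y.sum().backward()

    # The gradient of sum() is all ones; multiplying by isfinite(x) zeroes
    # the entries that came from nan/inf inputs.
    print(x.grad)  # tensor([1., 0., 0., 1.])
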
@@ -928,6 +931,9 @@ - name: sign(Tensor self) -> Tensor self: zeros_like(grad) +- name: sgn(Tensor self) -> Tensor + self: sgn_backward(result, grad, self) + - name: sin(Tensor self) -> Tensor self: grad * self.cos().conj() @@ -1218,9 +1224,9 @@ self: nll_loss2d_backward(grad, self, target, weight, reduction, ignore_index, total_weight) target: non_differentiable -- name: smooth_l1_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor - self: smooth_l1_loss_backward(grad, self, target, reduction) - target: smooth_l1_loss_backward(grad, target, self, reduction) +- name: smooth_l1_loss(Tensor self, Tensor target, int reduction=Mean, float beta=1.0) -> Tensor + self: smooth_l1_loss_backward(grad, self, target, reduction, beta) + target: smooth_l1_loss_backward(grad, target, self, reduction, beta) - name: soft_margin_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor self: soft_margin_loss_backward(grad, self, target, reduction) @@ -1586,10 +1592,10 @@ grad_output: replication_pad3d(grad, padding) self: zeros_like(self) -- name: smooth_l1_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction) -> Tensor - grad_output: smooth_l1_loss_double_backward_grad_output(grad, grad_output, self, target, reduction) - self: smooth_l1_loss_double_backward(grad * grad_output, self, target, reduction) - target: -smooth_l1_loss_double_backward(grad * grad_output, self, target, reduction) +- name: smooth_l1_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction, float beta) -> Tensor + grad_output: smooth_l1_loss_double_backward_grad_output(grad, grad_output, self, target, reduction, beta) + self: smooth_l1_loss_double_backward(grad * grad_output, self, target, reduction, beta) + target: -smooth_l1_loss_double_backward(grad * grad_output, self, target, reduction, beta) - name: softplus_backward(Tensor grad_output, Tensor self, Scalar beta, Scalar threshold, Tensor output) -> Tensor grad_output: softplus_backward(grad, self, beta, threshold, output) diff --git a/tools/autograd/gen_autograd.py b/tools/autograd/gen_autograd.py index 82d908de6180..c12e9b2003d8 100644 --- a/tools/autograd/gen_autograd.py +++ b/tools/autograd/gen_autograd.py @@ -115,20 +115,6 @@ def has_tensoroptions_argument(declaration): return True return False -def process_schema_order_arg(schema_order_arg): - if schema_order_arg == 'dtype': - return 'optTypeMetaToScalarType(options.dtype_opt())' - elif schema_order_arg == 'layout': - return 'options.layout_opt()' - elif schema_order_arg == 'device': - return 'options.device_opt()' - elif schema_order_arg == 'pin_memory': - return 'options.pinned_memory_opt()' - elif schema_order_arg == 'memory_format': - return 'c10::impl::check_tensor_options_and_extract_memory_format(options, memory_format)' - else: - return schema_order_arg - def load_aten_declarations(path): with open(path, 'r') as f: @@ -142,6 +128,8 @@ def load_aten_declarations(path): for arg in declaration['arguments']: arg['simple_type'] = get_simple_type(arg) + for arg in declaration['schema_order_arguments']: + arg['simple_type'] = get_simple_type(arg) for ret in declaration['returns']: ret['simple_type'] = get_simple_type(ret) @@ -151,8 +139,6 @@ def load_aten_declarations(path): for arg in declaration['schema_order_arguments']] declaration['args'] = [arg['name'] for arg in declaration['arguments']] declaration['schema_order_args'] = [arg['name'] for arg in declaration['schema_order_arguments']] - if has_tensoroptions_argument(declaration): - 
declaration['schema_order_args'] = [process_schema_order_arg(arg) for arg in declaration['schema_order_args']] declaration['api_name'] = declaration['name'] if declaration.get('overload_name'): declaration['type_wrapper_name'] = "{}_{}".format( diff --git a/tools/autograd/gen_python_functions.py b/tools/autograd/gen_python_functions.py index b7fa4a3a8308..eb5de6f75ef5 100644 --- a/tools/autograd/gen_python_functions.py +++ b/tools/autograd/gen_python_functions.py @@ -281,6 +281,7 @@ def create_python_bindings(python_functions, is_python_method, module): 'c10::optional': 'toBoolOptional', 'c10::optional': 'toDoubleOptional', 'c10::optional>': 'doublelistOptional', + 'ArrayRef': 'doublelist', 'IntArrayRef': 'intlist', 'Scalar': 'scalar', 'ScalarType': 'scalartype', diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index 804da9193a50..6e0dc0721aed 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -22,7 +22,7 @@ # which will in turn dispatch back to VariableType for its # differentiable subcomponents. # -from __future__ import print_function + from .utils import CodeTemplate, nested_dict, write, uninplace_api_name from .gen_autograd import VIEW_FUNCTIONS, VIEW_FUNCTIONS_WITH_METADATA_CHANGE, \ MULTI_OUTPUT_SAFE_FUNCTIONS, RETURNS_VIEWS_OF_INPUT @@ -71,12 +71,18 @@ # arguments (inside of the `native_functions.yaml`) RENAME_TRACE_ADD_ARGS = { 'fill': '''\ - jit::tracer::addInputs(node, "options", TensorOptions()); + jit::tracer::addInputs(node, "options", c10::optional()); + jit::tracer::addInputs(node, "options", layout_or_default(c10::nullopt)); + jit::tracer::addInputs(node, "options", device_or_default(c10::nullopt)); + jit::tracer::addInputs(node, "options", pinned_memory_or_default(c10::nullopt)); c10::optional memory_format = c10::MemoryFormat::Preserve; jit::tracer::addInputs(node, "memory_format", memory_format); ''', 'zero': '''\ - jit::tracer::addInputs(node, "options", TensorOptions()); + jit::tracer::addInputs(node, "options", c10::optional()); + jit::tracer::addInputs(node, "options", layout_or_default(c10::nullopt)); + jit::tracer::addInputs(node, "options", device_or_default(c10::nullopt)); + jit::tracer::addInputs(node, "options", pinned_memory_or_default(c10::nullopt)); c10::optional memory_format = c10::MemoryFormat::Preserve; jit::tracer::addInputs(node, "memory_format", memory_format); ''', @@ -139,7 +145,24 @@ 'quantize_per_tensor', 'quantize_per_channel', # Functions that return integers should not have output that require gradients 'argmax', 'argmin', 'argsort', 'searchsorted', - 'bucketize' + 'bucketize', + # Functions that return booleans are not differentiable + 'isnan', 'isposinf', 'isneginf', 'isinf' +} + +# The C -> R functions at the time of adding this are still being audited and tested +# but will not error out. 
+# C -> C, R -> C functions for which backward is correctly implemented and tested +GRADIENT_IMPLEMENTED_FOR_COMPLEX = { + 't', 'view', 'reshape', 'reshape_as', 'view_as', 'roll', 'clone', + 'repeat', 'expand', 'flip', 'fliplr', 'flipud', 'rot90', 'transpose', + 'permute', 'squeeze', 'unsqueeze', 'resize', 'resize_as', 'tril', 'triu', + 'chunk', 'split', 'split_with_sizes', 'repeat', 'expand', 'zero_', 'eq_', + 'ne_', 'add', '__radd__', 'sum', '_conj', 'sin', 'cos', 'mul', 'sinh', + 'cosh', '__rmul__', 'sgn', 'asin', 'acos', 'sub', 'div', 'cat', 'view_as_complex', + 'neg', 'complex', 'select', '_s_where', 'as_strided', 'slice', 'constant_pad_nd', + 'unbind', 'split', 'split_with_sizes', 'unsafe_split', 'split_with_sizes_backward', + 'dot', 'vdot', 'cholesky' } # Some operators invalidate the grad_accumulator. Let's reset it. @@ -232,14 +255,14 @@ WRAPPER_REGISTRATION = CodeTemplate("""\ m.impl("${unqual_operator_name_with_overload}", - c10::impl::hacky_wrapper_for_legacy_signatures<${schema_order_cpp_signature}>(TORCH_FN(${class_type}::${type_wrapper_name})) + TORCH_FN(${class_type}::${type_wrapper_name}) ); """) UNPACK_TENSOR = CodeTemplate("""\ auto${ref} ${arg_name}_ = unpack${suffix}(${arg_name}, "${arg_name}", ${arg_pos});""") -UNPACK_OPTIONS = CodeTemplate("""\ +LEGACY_WRAP_OPTIONS = CodeTemplate("""\ auto ${arg_name}_ = TensorOptions(${arg_name});""") DECLARE_GRAD_FN = CodeTemplate("""\ @@ -370,7 +393,8 @@ # Generate a file that lists all functions and their schema string. Used for XLA REGISTRATION_DECLARATION = CodeTemplate("""\ -${return_type} ${api_name}(${declaration_formals}); // {"schema": "${schema_string}", "compound": "${compound}"} +${return_type} ${api_name}(${declaration_formals}); \ +// {"schema": "${schema_string}", "compound": "${compound}", "has_math_kernel": "${has_math_kernel}"} """) # TraceType templates @@ -490,15 +514,28 @@ def format_trace_op_name(declaration): def format_trace_inputs(declaration): + gather_tensor_options = "TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory)" + def dispatch_trace_input(arg_spec): name, value, simple_type, nullable = arg_spec # XXX: For arg that have type of Tensor?[], tracer will pass allow_undefined to addInputs if simple_type == 'TensorList' and nullable: return '''jit::tracer::addInputs(node, "{}", {}, {});'''.format(name, value, "true") else: - return ADD_TRACE_INPUT.substitute(name=name, input=value) + if value == "options": + result = "" + result += ADD_TRACE_INPUT.substitute(name=name, input="optTypeMetaToScalarType(options.dtype_opt())") + "\n" + result += ADD_TRACE_INPUT.substitute(name=name, input="options.layout()") + "\n" + result += ADD_TRACE_INPUT.substitute(name=name, input="options.device()") + "\n" + result += ADD_TRACE_INPUT.substitute(name=name, input="options.pinned_memory()") + return result + else: + return ADD_TRACE_INPUT.substitute(name=name, input=value) - trace_inputs = declaration['arguments'] + if declaration['use_c10_dispatcher'] == 'full': + trace_inputs = declaration['schema_order_arguments'] + else: + trace_inputs = declaration['arguments'] if is_out_overload(declaration): # *_out functions take the result as a first argument, but they are the @@ -506,7 +543,10 @@ def dispatch_trace_input(arg_spec): out_input = trace_inputs[0] trace_inputs = trace_inputs[1:] - trace_input_spec = [(i['name'], i['name'], i['simple_type'], i.get('is_nullable')) for i in trace_inputs] + if declaration['use_c10_dispatcher'] == 'full': + trace_input_spec = [(i['name'], i['name'], 
i['type'], i.get('is_nullable')) for i in trace_inputs] + else: + trace_input_spec = [(i['name'], i['name'], i['simple_type'], i.get('is_nullable')) for i in trace_inputs] trace_inputs = \ '\n'.join(dispatch_trace_input(arg_spec) for arg_spec in trace_input_spec) @@ -514,7 +554,8 @@ def dispatch_trace_input(arg_spec): if is_out_overload(declaration): # for *_out functions, handle the result argument differently for inplace/outplace. # For inplace: just add the input to the end to confirm with the JIT schema - inplace = ADD_TRACE_INPUT.substitute(name=out_input['name'], input=out_input['name']) + value = out_input['name'] + inplace = ADD_TRACE_INPUT.substitute(name=out_input['name'], input=value) # for outplace: do nothing, except if the declaration is a factory. # Factories are a bit special because their out-of-place overloads @@ -522,7 +563,11 @@ def dispatch_trace_input(arg_spec): trace_name = uninplace_api_name(declaration['api_name']) has_factory_name = trace_name in FACTORY_FUNCTION_NAMES if has_factory_name: - outplace = ADD_TRACE_INPUT.substitute(name='out', input='out.options()') + outplace = "" + outplace += ADD_TRACE_INPUT.substitute(name='out', input='optTypeMetaToScalarType(out.options().dtype_opt())') + "\n" + outplace += ADD_TRACE_INPUT.substitute(name='out', input='out.options().layout()') + "\n" + outplace += ADD_TRACE_INPUT.substitute(name='out', input='out.options().device()') + "\n" + outplace += ADD_TRACE_INPUT.substitute(name='out', input='out.options().pinned_memory()') else: outplace = '' @@ -654,12 +699,12 @@ def gen_variable_type(out, aten_declarations, template_path): registration_declarations.append( REGISTRATION_DECLARATION.substitute(declaration, declaration_formals=declaration_formals, - compound='false')) + compound='False')) else: registration_declarations.append( REGISTRATION_DECLARATION.substitute(declaration, declaration_formals=declaration_formals, - compound='true')) + compound='True')) env = { 'registration_declarations': registration_declarations, @@ -680,12 +725,17 @@ def gen_variable_type_shard(out, aten_declarations, template_path, suffix, heade for declaration in aten_declarations: formal_types = [arg['type'] for arg in declaration['arguments']] - type_declarations.append(METHOD_DECLARATION.substitute(declaration)) + if declaration['use_c10_dispatcher'] == 'full': + formals = declaration['schema_order_formals'] + else: + assert declaration['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper' + formals = declaration['formals'] + type_declarations.append(METHOD_DECLARATION.substitute(declaration, formals=formals)) strategy = dispatch_strategy(declaration) if declaration['name'] not in MANUAL_AUTOGRAD and strategy == 'use_derived': body = emit_body(declaration) type_definitions.append(METHOD_DEFINITION.substitute( - declaration, type_definition_body=body)) + declaration, type_definition_body=body, formals=formals)) if declaration['use_c10_dispatcher'] == 'full': wrapper_registrations.append(WRAPPER_REGISTRATION.substitute( declaration, class_type='VariableType')) @@ -701,7 +751,7 @@ def gen_variable_type_shard(out, aten_declarations, template_path, suffix, heade if declaration['name'] not in MANUAL_TRACER: trace_body = emit_trace_body(declaration) trace_method_definitions.append(METHOD_DEFINITION.substitute( - declaration, type_definition_body=trace_body)) + declaration, type_definition_body=trace_body, formals=formals)) if declaration['use_c10_dispatcher'] == 'full': trace_wrapper_registrations.append(WRAPPER_REGISTRATION.substitute( @@ 
-925,6 +975,16 @@ def setup_derivative(differentiable_inputs): body.append(SETUP_DERIVATIVE.substitute(env, setup=setup)) return body + def emit_check_if_in_complex_autograd_allowlist(): + body = [] + if base_name in GRADIENT_IMPLEMENTED_FOR_COMPLEX: + return body + for arg in differentiable_outputs: + name = arg['name'] + if arg['type'] == 'Tensor' or arg['type'] == 'TensorList': + body.append('throw_error_for_complex_autograd({}, "{}");'.format(name, base_name)) + return body + def emit_check_no_requires_grad(tensor_args, args_with_derivatives): """Checks that arguments without derivatives don't require grad""" body = [] @@ -1182,6 +1242,7 @@ def emit_increment_version(): body.append(emit_history()) if requires_derivative: body.append(emit_save_outputs()) + body.extend(emit_check_if_in_complex_autograd_allowlist()) if base_name in RESET_GRAD_ACCUMULATOR: # `inplace` implies that there is exactly one output named `self`, # so we can keep the generated code easy. If you need to @@ -1201,7 +1262,12 @@ def requires_unpack(arg): body = [] unpacked_args = [] unpacked_args_simple_type = {} - for i, arg in enumerate(declaration['arguments']): + if declaration['use_c10_dispatcher'] == 'full': + arguments = declaration['schema_order_arguments'] + else: + assert declaration['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper' + arguments = declaration['arguments'] + for i, arg in enumerate(arguments): if not requires_unpack(arg): unpacked_args.append(arg['name']) unpacked_args_simple_type[arg['name']] = arg['simple_type'] @@ -1223,7 +1289,9 @@ def requires_unpack(arg): # Okay, we are abusing the definition of 'unpack' here a bit, # although it's still getting the non-variable from the variable # (in this case via TensorOptions rather than Variable/Tensor). 
- body.append(UNPACK_OPTIONS.substitute(arg_name=arg['name'])) + assert declaration['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper', \ + "VariableKernel shouldn't take TensorOptions if the op is c10-full" + body.append(LEGACY_WRAP_OPTIONS.substitute(arg_name=arg['name'])) unpacked_args.append(arg['name'] + '_') unpacked_args_simple_type[arg['name'] + '_'] = arg['simple_type'] diff --git a/tools/autograd/templates/VariableType.cpp b/tools/autograd/templates/VariableType.cpp index efddffbe7610..079427cd97dc 100644 --- a/tools/autograd/templates/VariableType.cpp +++ b/tools/autograd/templates/VariableType.cpp @@ -3,7 +3,6 @@ #include #include -#include // ${generated_comment} diff --git a/tools/autograd/templates/python_fft_functions.cpp b/tools/autograd/templates/python_fft_functions.cpp index 7d0186538c98..1dbdca565792 100644 --- a/tools/autograd/templates/python_fft_functions.cpp +++ b/tools/autograd/templates/python_fft_functions.cpp @@ -7,14 +7,27 @@ #include "torch/csrc/autograd/python_variable.h" #include "torch/csrc/autograd/utils/wrap_outputs.h" #include "torch/csrc/autograd/utils/python_arg_parsing.h" +#include "torch/csrc/autograd/generated/variable_factories.h" #include "torch/csrc/utils/python_arg_parser.h" #include "torch/csrc/utils/structseq.h" +#include "torch/csrc/utils/cuda_lazy_init.h" + +#include using at::Tensor; +using at::Device; +using at::Layout; using at::Scalar; -using at::MemoryFormat; -using at::Generator; +using at::ScalarType; +using at::Backend; +using at::OptionalDeviceGuard; +using at::DeviceGuard; +using at::TensorOptions; using at::IntArrayRef; +using at::Generator; +using at::TensorList; +using at::Dimname; +using at::DimnameList; using namespace torch::autograd::utils; diff --git a/tools/autograd/templates/python_torch_functions.cpp b/tools/autograd/templates/python_torch_functions.cpp index 62e9b8dd227f..aac41111e1bf 100644 --- a/tools/autograd/templates/python_torch_functions.cpp +++ b/tools/autograd/templates/python_torch_functions.cpp @@ -44,6 +44,7 @@ using at::Generator; using at::TensorList; using at::Dimname; using at::DimnameList; +using at::ArrayRef; using namespace torch::autograd::utils; @@ -582,29 +583,29 @@ static PyObject * THPVariable_nonzero(PyObject* self, PyObject* args, PyObject* { HANDLE_TH_ERRORS static PythonArgParser parser({ - "nonzero(Tensor input, *, Tensor out=None)|deprecated", - "nonzero(Tensor input, *, bool as_tuple)", + "nonzero(Tensor input, *, bool as_tuple=False, Tensor out=None)", }); - ParsedArgs<2> parsed_args; + ParsedArgs<3> parsed_args; auto r = parser.parse(args, kwargs, parsed_args); if(r.has_torch_function()){ return handle_torch_function(r, args, kwargs, THPVariableFunctionsModule, "torch"); } - if (r.idx == 0) { - if (r.isNone(1)) { - return wrap(dispatch_nonzero(r.tensor(0))); - } else { - return wrap(dispatch_nonzero(r.tensor(0), r.tensor(1))); - } - } else { - if (r.toBool(1)) { - return wrap(dispatch_nonzero_numpy(r.tensor(0))); - } else { - return wrap(dispatch_nonzero(r.tensor(0))); - } + const auto as_tuple = r.toBool(1); + const auto has_out = !r.isNone(2); + + if (as_tuple) { + TORCH_CHECK(!has_out, "nonzero does not support the out kwarg when as_tuple is True"); + return wrap(dispatch_nonzero_numpy(r.tensor(0))); + } + + if (has_out) { + return wrap(dispatch_nonzero(r.tensor(0), r.tensor(2))); } + + return wrap(dispatch_nonzero(r.tensor(0))); + END_HANDLE_TH_ERRORS } diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index 3cc3585aa555..96301611c2e5 100644 --- 
a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -91,11 +91,7 @@ core_sources_common = [ "torch/csrc/jit/serialization/unpickler.cpp", ] -jit_sources_common = [ - "torch/csrc/jit/runtime/register_prim_ops_c10.cpp", -] - -libtorch_sources_common = core_sources_common + jit_sources_common +libtorch_sources_common = core_sources_common core_trainer_sources = [ "torch/csrc/autograd/anomaly_mode.cpp", @@ -152,6 +148,7 @@ core_sources_full = [ "torch/csrc/jit/ir/scope.cpp", "torch/csrc/jit/ir/subgraph_matcher.cpp", "torch/csrc/jit/jit_log.cpp", + "torch/csrc/jit/passes/annotate_warns.cpp", "torch/csrc/jit/passes/bailout_graph.cpp", "torch/csrc/jit/passes/batch_mm.cpp", "torch/csrc/jit/passes/canonicalize.cpp", @@ -223,6 +220,7 @@ core_sources_full = [ "torch/csrc/jit/runtime/profiling_record.cpp", "torch/csrc/jit/runtime/symbolic_script.cpp", "torch/csrc/jit/runtime/static/impl.cpp", + "torch/csrc/jit/runtime/static/ops.cpp", "torch/csrc/jit/serialization/import.cpp", "torch/csrc/jit/serialization/import_export_helpers.cpp", "torch/csrc/jit/serialization/import_source.cpp", @@ -233,7 +231,6 @@ core_sources_full = [ "torch/csrc/jit/tensorexpr/codegen.cpp", "torch/csrc/jit/tensorexpr/eval.cpp", "torch/csrc/jit/tensorexpr/expr.cpp", - "torch/csrc/jit/tensorexpr/function.cpp", "torch/csrc/jit/tensorexpr/hash_provider.cpp", "torch/csrc/jit/tensorexpr/ir.cpp", "torch/csrc/jit/tensorexpr/ir_mutator.cpp", @@ -302,12 +299,11 @@ jit_sources_full = [ "torch/csrc/jit/runtime/register_prim_ops.cpp", "torch/csrc/jit/runtime/register_prim_ops_fulljit.cpp", "torch/csrc/jit/runtime/register_special_ops.cpp", - "torch/csrc/jit/runtime/register_string_ops.cpp", "torch/csrc/jit/passes/remove_inplace_ops.cpp", "torch/csrc/jit/passes/utils/check_alias_annotation.cpp", ] -libtorch_core_jit_sources = sorted(jit_sources_common + jit_sources_full) +libtorch_core_jit_sources = sorted(jit_sources_full) libtorch_cmake_sources = libtorch_core_sources + libtorch_core_jit_sources @@ -344,6 +340,7 @@ libtorch_cuda_sources = [ "torch/csrc/autograd/functions/comm.cpp", "torch/csrc/jit/codegen/cuda/arith.cpp", "torch/csrc/jit/codegen/cuda/compute_at.cpp", + "torch/csrc/jit/codegen/cuda/codegen.cpp", "torch/csrc/jit/codegen/cuda/dispatch.cpp", "torch/csrc/jit/codegen/cuda/expr_evaluator.cpp", "torch/csrc/jit/codegen/cuda/executor.cpp", @@ -353,6 +350,7 @@ libtorch_cuda_sources = [ "torch/csrc/jit/codegen/cuda/fusion.cpp", "torch/csrc/jit/codegen/cuda/graph_fuser.cpp", "torch/csrc/jit/codegen/cuda/index_compute.cpp", + "torch/csrc/jit/codegen/cuda/instrumentation.cpp", "torch/csrc/jit/codegen/cuda/ir_base_nodes.cpp", "torch/csrc/jit/codegen/cuda/ir_cloner.cpp", "torch/csrc/jit/codegen/cuda/ir_graphviz.cpp", @@ -362,8 +360,10 @@ libtorch_cuda_sources = [ "torch/csrc/jit/codegen/cuda/kernel.cpp", "torch/csrc/jit/codegen/cuda/kernel_cache.cpp", "torch/csrc/jit/codegen/cuda/kernel_ir.cpp", + "torch/csrc/jit/codegen/cuda/kernel_ir_builder.cpp", "torch/csrc/jit/codegen/cuda/lower_index.cpp", "torch/csrc/jit/codegen/cuda/lower_loops.cpp", + "torch/csrc/jit/codegen/cuda/lower_insert_syncs.cpp", "torch/csrc/jit/codegen/cuda/lower_unroll.cpp", "torch/csrc/jit/codegen/cuda/lower_thread_predicate.cpp", "torch/csrc/jit/codegen/cuda/lower_utils.cpp", @@ -542,11 +542,14 @@ libtorch_python_core_sources = [ "torch/csrc/utils/disable_torch_function.cpp", ] -libtorch_python_distributed_sources = [ - "torch/csrc/distributed/autograd/init.cpp", +libtorch_python_distributed_core_sources = [ "torch/csrc/distributed/c10d/comm.cpp", 
"torch/csrc/distributed/c10d/init.cpp", "torch/csrc/distributed/c10d/reducer.cpp", +] + +libtorch_python_distributed_sources = libtorch_python_distributed_core_sources + [ + "torch/csrc/distributed/autograd/init.cpp", "torch/csrc/distributed/rpc/init.cpp", "torch/csrc/distributed/rpc/process_group_agent.cpp", "torch/csrc/distributed/rpc/py_rref.cpp", @@ -575,48 +578,4 @@ def glob_libtorch_python_sources(gencode_pattern = ":generate-code[{}]"): _libtorch_python_sources.extend(libtorch_python_core_sources) _libtorch_python_sources.extend(libtorch_python_distributed_sources) - _libtorch_python_sources.extend([ - "test/cpp/jit/torch_python_test.cpp", - "test/cpp/tensorexpr/padded_buffer.cpp", - "test/cpp/jit/test_alias_analysis.cpp", - "test/cpp/jit/test_argument_spec.cpp", - "test/cpp/jit/test_autodiff.cpp", - "test/cpp/jit/test_backend.cpp", - "test/cpp/jit/test_base.cpp", - "test/cpp/jit/test_class_import.cpp", - "test/cpp/jit/test_class_parser.cpp", - "test/cpp/jit/test_class_type.cpp", - "test/cpp/jit/test_code_template.cpp", - "test/cpp/jit/test_constant_pooling.cpp", - "test/cpp/jit/test_cleanup_passes.cpp", - "test/cpp/jit/test_create_autodiff_subgraphs.cpp", - "test/cpp/jit/test_custom_class.cpp", - "test/cpp/jit/test_custom_operators.cpp", - "test/cpp/jit/test_dce.cpp", - "test/cpp/jit/test_fuser.cpp", - "test/cpp/jit/test_gpu.cpp", - "test/cpp/jit/test_graph_executor.cpp", - "test/cpp/jit/test_inliner.cpp", - "test/cpp/jit/test_interface.cpp", - "test/cpp/jit/test_interpreter.cpp", - "test/cpp/jit/test_ir.cpp", - "test/cpp/jit/test_irparser.cpp", - "test/cpp/jit/test_jit_type.cpp", - "test/cpp/jit/test_lite_interpreter.cpp", - "test/cpp/jit/test_lite_trainer.cpp", - "test/cpp/jit/test_misc.cpp", - "test/cpp/jit/test_mobile_type_parser.cpp", - "test/cpp/jit/test_module_api.cpp", - "test/cpp/jit/test_peephole_optimize.cpp", - "test/cpp/jit/test_qualified_name.cpp", - "test/cpp/jit/test_save_load.cpp", - "test/cpp/jit/test_schema_matching.cpp", - "test/cpp/jit/test_subgraph_matcher.cpp", - "test/cpp/jit/test_subgraph_rewriter.cpp", - "test/cpp/jit/test_subgraph_utils.cpp", - "test/cpp/jit/test_utils.cpp", - ]) - - _libtorch_python_sources.extend(native.glob(["test/cpp/tensorexpr/test_*.cpp"])) - return _libtorch_python_sources diff --git a/tools/clang_format_all.py b/tools/clang_format_all.py index 710a21e33514..77ca68d92b0b 100755 --- a/tools/clang_format_all.py +++ b/tools/clang_format_all.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 """ -A script that runs clang-format on all C/C++ files in CLANG_FORMAT_WHITELIST. There is +A script that runs clang-format on all C/C++ files in CLANG_FORMAT_ALLOWLIST. There is also a diff mode which simply checks if clang-format would make any changes, which is useful for CI purposes. @@ -14,22 +14,22 @@ import sys from clang_format_utils import get_and_check_clang_format, CLANG_FORMAT_PATH -# Whitelist of directories to check. All files that in that directory +# Allowlist of directories to check. All files that in that directory # (recursively) will be checked. -# If you edit this, please edit the whitelist in clang_format_ci.sh as well. -CLANG_FORMAT_WHITELIST = ["torch/csrc/jit/", "test/cpp/jit/", "test/cpp/tensorexpr/"] +# If you edit this, please edit the allowlist in clang_format_ci.sh as well. +CLANG_FORMAT_ALLOWLIST = ["torch/csrc/jit/", "test/cpp/jit/", "test/cpp/tensorexpr/"] # Only files with names matching this regex will be formatted. 
CPP_FILE_REGEX = re.compile(".*\\.(h|cpp|cc|c|hpp)$") -def get_whitelisted_files(): +def get_allowlisted_files(): """ - Parse CLANG_FORMAT_WHITELIST and resolve all directories. - Returns the set of whitelist cpp source files. + Parse CLANG_FORMAT_ALLOWLIST and resolve all directories. + Returns the set of allowlist cpp source files. """ matches = [] - for dir in CLANG_FORMAT_WHITELIST: + for dir in CLANG_FORMAT_ALLOWLIST: for root, dirnames, filenames in os.walk(dir): for filename in filenames: if CPP_FILE_REGEX.match(filename): @@ -77,7 +77,7 @@ async def file_clang_formatted_correctly(filename, semaphore, verbose=False): async def run_clang_format(max_processes, diff=False, verbose=False): """ - Run clang-format to all files in CLANG_FORMAT_WHITELIST that match CPP_FILE_REGEX. + Run clang-format to all files in CLANG_FORMAT_ALLOWLIST that match CPP_FILE_REGEX. """ # Check to make sure the clang-format binary exists. if not os.path.exists(CLANG_FORMAT_PATH): @@ -97,7 +97,7 @@ async def run_clang_format(max_processes, diff=False, verbose=False): # Format files in parallel. if diff: - for f in asyncio.as_completed([file_clang_formatted_correctly(f, semaphore, verbose) for f in get_whitelisted_files()]): + for f in asyncio.as_completed([file_clang_formatted_correctly(f, semaphore, verbose) for f in get_allowlisted_files()]): ok &= await f if ok: @@ -105,7 +105,7 @@ async def run_clang_format(max_processes, diff=False, verbose=False): else: print("Some files not formatted correctly") else: - await asyncio.gather(*[run_clang_format_on_file(f, semaphore, verbose) for f in get_whitelisted_files()]) + await asyncio.gather(*[run_clang_format_on_file(f, semaphore, verbose) for f in get_allowlisted_files()]) return ok @@ -134,7 +134,7 @@ def main(args): options = parse_args(args) # Get clang-format and make sure it is the right binary and it is in the right place. ok = get_and_check_clang_format(options.verbose) - # Invoke clang-format on all files in the directories in the whitelist. + # Invoke clang-format on all files in the directories in the allowlist. if ok: loop = asyncio.get_event_loop() ok = loop.run_until_complete(run_clang_format(options.max_processes, options.diff, options.verbose)) diff --git a/tools/clang_tidy.py b/tools/clang_tidy.py index 354aedc601ad..f8e8e61857e5 100755 --- a/tools/clang_tidy.py +++ b/tools/clang_tidy.py @@ -12,7 +12,7 @@ glob or regular expressions. """ -from __future__ import print_function + import argparse import collections diff --git a/tools/code_analyzer/gen_op_registration_whitelist.py b/tools/code_analyzer/gen_op_registration_allowlist.py similarity index 94% rename from tools/code_analyzer/gen_op_registration_whitelist.py rename to tools/code_analyzer/gen_op_registration_allowlist.py index 5971864b2187..56e0f78cc1b5 100644 --- a/tools/code_analyzer/gen_op_registration_whitelist.py +++ b/tools/code_analyzer/gen_op_registration_allowlist.py @@ -1,11 +1,11 @@ """ -This util is invoked from cmake to produce the op registration whitelist param +This util is invoked from cmake to produce the op registration allowlist param for `ATen/gen.py` for custom mobile build. For custom build with dynamic dispatch, it takes the op dependency graph of ATen and the list of root ops, and outputs all transitive dependencies of the root -ops as the whitelist. +ops as the allowlist. For custom build with static dispatch, the op dependency graph will be omitted, -and it will directly output root ops as the whitelist. +and it will directly output root ops as the allowlist. 
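# A rough sketch of the "transitive dependencies of the root ops" computation described
# in the docstring above, assuming the dependency graph is a plain dict from an op name
# to the ops it may call. The real tool parses a YAML graph and handles base/overload
# fan-out, which this sketch does not attempt.
from collections import deque

def transitive_closure(dep_graph, root_ops):
    allowlist = set(root_ops)
    queue = deque(root_ops)
    while queue:
        op = queue.popleft()
        for dep in dep_graph.get(op, ()):
            if dep not in allowlist:
                allowlist.add(dep)
                queue.append(dep)
    return sorted(allowlist)

# Example: a root op transitively pulls in everything it may call.
graph = {"aten::add": ["aten::empty_like"], "aten::empty_like": ["aten::copy_"]}
assert transitive_closure(graph, ["aten::add"]) == ["aten::add", "aten::copy_", "aten::empty_like"]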
""" import argparse diff --git a/tools/codegen/api/cpp.py b/tools/codegen/api/cpp.py index 452c3721ab92..538ba3596c7d 100644 --- a/tools/codegen/api/cpp.py +++ b/tools/codegen/api/cpp.py @@ -1,7 +1,9 @@ from tools.codegen.model import * from tools.codegen.api.types import TensorOptionsArguments, CppArgument, ThisArgument import tools.codegen.local as local -from typing import Optional, Sequence, Union, Callable, List +from typing import Optional, Sequence, Union, Callable, List, Tuple +import copy +from dataclasses import dataclass # This file describes the translation of JIT schema to the public C++ # API, which is what people use when they call functions like at::add. @@ -71,9 +73,6 @@ def argumenttype_type(t: Type, *, mutable: bool) -> str: if r is not None: return r - if str(t) == 'Tensor' and mutable and local.hack_const_mutable_self(): - return 'const Tensor &' - if isinstance(t, BaseType): if t.name == BaseTy.Tensor: if mutable: @@ -155,6 +154,7 @@ def returns_type(rs: Sequence[Return]) -> str: '[]': '{}', '[0,1]': '{0,1}', # TODO: stop special casing 'contiguous_format': 'MemoryFormat::Contiguous', + 'long': 'at::kLong', } # Convert a JIT default into C++ expression representing the default @@ -194,9 +194,50 @@ def argument(a: Union[Argument, TensorOptionsArguments, ThisArgument]) -> CppArg else: assert_never(a) -def group_arguments( - func: FunctionSchema, *, method: bool = False -) -> Sequence[Union[Argument, TensorOptionsArguments, ThisArgument]]: +@dataclass(frozen=True) +class CppSignature: + returns: Tuple[Return, ...] + arguments: Tuple[Union[Argument, TensorOptionsArguments, ThisArgument], ...] + + def cpp_arguments(self) -> Sequence[CppArgument]: + return list(map(argument, self.arguments)) + + # Return arguments as a comma separated list, i.e. like they would be in a C++ + # function signature. Include default values for arguments. + def cpp_arguments_str(self, with_defaults: bool) -> str: + args_without_this = [argument(a) for a in self.arguments if not isinstance(a, ThisArgument)] + if with_defaults: + return ', '.join(map(str, args_without_this)) + else: + return ', '.join(map(lambda s: s.str_no_default(), args_without_this)) + + +@dataclass(frozen=True) +class CppSignatureGroup: + # arguments contains the arguments for the C++ signature as it is represented + # in the JIT schema. + signature: CppSignature + + # gathered_signature is an alternative C++ signature in which TensorOptions are + # gathered into one TensorOptions object instead of being scattered into + # ScalarType, Layout, Device. This is only present for factory operators, + # other operators have this set to None. This can be used to generate a + # convenience API in the C++ frontend so users can call using TensorOptions objects. + gathered_signature: Optional[CppSignature] + + # If it is a factory op, this returns the arguments for the convenience API + # that takes TensorOptions. If it is not a factory op and doesn't have + # a gathered signature, then this returns the regular signature instead. 
+ def signature_prefer_gathered(self) -> CppSignature: + if self.gathered_signature is not None: + return self.gathered_signature + else: + return self.signature + + +def signature_group( + func: FunctionSchema, *, method: bool = False, +) -> CppSignatureGroup: args: List[Union[Argument, ThisArgument, TensorOptionsArguments]] = [] args.extend(func.out_arguments) @@ -205,8 +246,9 @@ def group_arguments( else: args.extend(func.arguments) - # group up arguments for tensor options + gathered_args = copy.deepcopy(args) + # group up arguments for tensor options def pred(name: str, ty: Type) -> Callable[[Argument], bool]: return lambda a: a.name == name and a.type in [ty, OptionalType(ty)] predicates = [ # order matters @@ -216,14 +258,16 @@ def pred(name: str, ty: Type) -> Callable[[Argument], bool]: pred('pin_memory', Type.parse('bool')), ] + has_tensoroptions_argument = False i = 0 while i < len(func.kwarg_only_arguments): # If there is enough space... if i <= len(func.kwarg_only_arguments) - len(predicates): # And the next len(predicates) arguments look like TensorOptions arguments if all(p(a) for p, a in zip(predicates, func.kwarg_only_arguments[i : i + len(predicates)])): + has_tensoroptions_argument = True # Group them together as one argument - args.append(TensorOptionsArguments( + gathered_args.append(TensorOptionsArguments( dtype=func.kwarg_only_arguments[i], layout=func.kwarg_only_arguments[i + 1], device=func.kwarg_only_arguments[i + 2], @@ -231,11 +275,19 @@ def pred(name: str, ty: Type) -> Callable[[Argument], bool]: )) i += len(predicates) continue - args.append(func.kwarg_only_arguments[i]) + gathered_args.append(func.kwarg_only_arguments[i]) i += 1 - return args + args.extend(func.kwarg_only_arguments) -# Convert arguments to C++ API form -def arguments(func: FunctionSchema, *, method: bool = False) -> Sequence[CppArgument]: - return list(map(argument, group_arguments(func, method=method))) + if has_tensoroptions_argument: + return CppSignatureGroup( + signature=CppSignature(arguments=tuple(args), returns=tuple(func.returns)), + gathered_signature=CppSignature(arguments=tuple(gathered_args), returns=tuple(func.returns)), + ) + else: + assert gathered_args == args + return CppSignatureGroup( + signature=CppSignature(arguments=tuple(args), returns=tuple(func.returns)), + gathered_signature=None, + ) diff --git a/tools/codegen/api/dispatcher.py b/tools/codegen/api/dispatcher.py index 34960534275f..6cb141c22f99 100644 --- a/tools/codegen/api/dispatcher.py +++ b/tools/codegen/api/dispatcher.py @@ -2,10 +2,10 @@ from tools.codegen.api.types import CppArgument, DispatcherExpr, TensorOptionsArguments, \ DispatcherArgument, ThisArgument, LegacyDispatcherArgument -import tools.codegen.api.cpp as cpp +from tools.codegen.api import cpp import tools.codegen.api.legacy_dispatcher as legacy_dispatcher import tools.codegen.local as local - +from enum import Enum import itertools from typing import Sequence, Optional @@ -63,6 +63,9 @@ def argument(a: Argument) -> DispatcherArgument: argument=la.argument, ) +def name(func: FunctionSchema) -> str: + return cpp.name(func) + def arguments(func: FunctionSchema) -> Sequence[DispatcherArgument]: if local.use_c10_dispatcher() is UseC10Dispatcher.full: return list(map(argument, itertools.chain(func.out_arguments, func.arguments, func.kwarg_only_arguments))) @@ -72,11 +75,19 @@ def arguments(func: FunctionSchema) -> Sequence[DispatcherArgument]: for la in legacy_dispatcher.arguments(func) ] +# TODO GATHER is only needed for non-c10-full ops, remove later. 
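# A compact sketch of what the three ProcessTensoroptions modes introduced right below
# mean for a single TensorOptions argument named "options". The returned strings mirror
# the C++ expressions emitted by cppargument_exprs; the helper itself is only
# illustrative and not part of the codegen.
def tensoroptions_exprs(mode, name="options"):
    if mode == "SCATTER":
        # c10-full ops take dtype/layout/device/pin_memory separately, so the
        # TensorOptions object in scope is taken apart.
        return [
            f"optTypeMetaToScalarType({name}.dtype_opt())",
            f"{name}.layout_opt()",
            f"{name}.device_opt()",
            f"{name}.pinned_memory_opt()",
        ]
    if mode == "GATHER":
        # the caller has dtype/layout/device/pin_memory in scope and the callee wants a
        # single TensorOptions object, so they are gathered back together.
        return ["TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory)"]
    # PASS_THROUGH: hand the TensorOptions object over unchanged.
    return [name]

assert tensoroptions_exprs("PASS_THROUGH") == ["options"]
assert len(tensoroptions_exprs("SCATTER")) == 4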
+ProcessTensoroptions = Enum('ProcessTensoroptions', ('GATHER', 'SCATTER', 'PASS_THROUGH')) + + # Given a set of CppArguments in scope, return a sequence of dispatcher # expressions that translate the cpp API into dispatcher API -def cppargument_exprs(a: CppArgument, *, tensor_options: Optional[CppArgument]) -> Sequence[DispatcherExpr]: +def cppargument_exprs(a: CppArgument, + *, + tensor_options: Optional[CppArgument], + process_tensoroptions: ProcessTensoroptions = ProcessTensoroptions.PASS_THROUGH + ) -> Sequence[DispatcherExpr]: if isinstance(a.argument, TensorOptionsArguments): - if local.use_c10_dispatcher() is UseC10Dispatcher.full: + if process_tensoroptions == ProcessTensoroptions.SCATTER: ta = a.argument return [ DispatcherExpr(type=argument_type(ta.dtype), expr=f'optTypeMetaToScalarType({a.name}.dtype_opt())'), @@ -84,8 +95,16 @@ def cppargument_exprs(a: CppArgument, *, tensor_options: Optional[CppArgument]) DispatcherExpr(type=argument_type(ta.device), expr=f'{a.name}.device_opt()'), DispatcherExpr(type=argument_type(ta.pin_memory), expr=f'{a.name}.pinned_memory_opt()'), # weird discrep ] + elif process_tensoroptions == ProcessTensoroptions.GATHER: + return [ + DispatcherExpr( + type='const TensorOptions &', + expr="TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory)")] else: + assert process_tensoroptions == ProcessTensoroptions.PASS_THROUGH return [DispatcherExpr(type='const TensorOptions &', expr=a.name)] + elif isinstance(a.argument, ThisArgument): + return [DispatcherExpr(type=argument_type(a.argument.argument), expr=a.name)] elif isinstance(a.argument, Argument): if a.name == 'memory_format' and tensor_options is not None and local.use_c10_dispatcher() is UseC10Dispatcher.full: return [DispatcherExpr( @@ -94,16 +113,35 @@ def cppargument_exprs(a: CppArgument, *, tensor_options: Optional[CppArgument]) ] else: return [DispatcherExpr(type=argument_type(a.argument), expr=a.name)] - elif isinstance(a.argument, ThisArgument): - return [DispatcherExpr(type=argument_type(a.argument.argument), expr=a.name)] else: assert_never(a.argument) -def cpparguments_exprs(args: Sequence[CppArgument]) -> Sequence[DispatcherExpr]: +def cpparguments_exprs(args: Sequence[CppArgument], process_tensoroptions: ProcessTensoroptions) -> Sequence[DispatcherExpr]: tensor_options = next((a for a in args if isinstance(a.argument, TensorOptionsArguments)), None) - return [r for a in args for r in cppargument_exprs(a, tensor_options=tensor_options)] + return [r for a in args for r in cppargument_exprs(a, + tensor_options=tensor_options, + process_tensoroptions=process_tensoroptions)] # I don't think this is entirely sound, but it should be reasonably # close def legacydispatcherarguments_exprs(args: Sequence[LegacyDispatcherArgument]) -> Sequence[DispatcherExpr]: - return cpparguments_exprs([CppArgument(type=a.type, name=a.name, default=None, argument=a.argument) for a in args]) + if local.use_c10_dispatcher() is UseC10Dispatcher.full: + process_tensoroptions = ProcessTensoroptions.SCATTER + else: + process_tensoroptions = ProcessTensoroptions.PASS_THROUGH + return cpparguments_exprs([CppArgument(type=a.type, + name=a.name, + default=None, + argument=a.argument) for a in args], + process_tensoroptions=process_tensoroptions) + +def exprs(args: Sequence[DispatcherArgument]) -> Sequence[DispatcherExpr]: + if local.use_c10_dispatcher() is UseC10Dispatcher.full: + process_tensoroptions = ProcessTensoroptions.SCATTER + else: + process_tensoroptions = 
ProcessTensoroptions.PASS_THROUGH + return cpparguments_exprs([CppArgument(type=a.type, + name=a.name, + default=None, + argument=a.argument) for a in args], + process_tensoroptions=process_tensoroptions) diff --git a/tools/codegen/api/legacy_dispatcher.py b/tools/codegen/api/legacy_dispatcher.py index db3d26c84fd0..160d39495951 100644 --- a/tools/codegen/api/legacy_dispatcher.py +++ b/tools/codegen/api/legacy_dispatcher.py @@ -71,4 +71,6 @@ def argument(a: Union[Argument, ThisArgument, TensorOptionsArguments]) -> Legacy assert_never(a) def arguments(func: FunctionSchema) -> Sequence[LegacyDispatcherArgument]: - return list(map(argument, cpp.group_arguments(func))) + signature_group = cpp.signature_group(func) + args = signature_group.signature_prefer_gathered().arguments + return list(map(argument, args)) diff --git a/tools/codegen/gen.py b/tools/codegen/gen.py index e4acb369f08e..48a2b3f56702 100644 --- a/tools/codegen/gen.py +++ b/tools/codegen/gen.py @@ -2,7 +2,7 @@ import contextlib import textwrap import itertools -from typing import List, Dict, Optional, Iterator, Tuple, Set, Callable, Any, TypeVar, DefaultDict, Union, Sequence +from typing import List, Dict, Optional, Iterator, Tuple, Set, Callable, Any, TypeVar, Union, Sequence import yaml from enum import Enum from collections import OrderedDict @@ -14,6 +14,7 @@ from tools.codegen.model import * from tools.codegen.api.types import * import tools.codegen.api.cpp as cpp +from tools.codegen.api.cpp import CppSignature import tools.codegen.api.dispatcher as dispatcher import tools.codegen.api.legacy_dispatcher as legacy_dispatcher import tools.codegen.local as local @@ -46,14 +47,6 @@ # the dispatcher API, and the legacy disaptcher API. See each # of these respective files for more information - -# Note [Byte-for-byte compatibility] -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# Some special cases we have made in this codegen have been strictly -# to make sure that git diff -w reports no changes, but we believe -# they are not semantically meaningful. 
After landing the new codegen, -# we should remove these special cases - # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # # # HELPER FUNCTIONS @@ -111,8 +104,6 @@ def wrapper(f: NativeFunction) -> T: with context(f'in {f.loc}:\n {f.func}'): with local.parametrize( use_c10_dispatcher=f.use_c10_dispatcher, - # See Note [Byte-for-byte compatibility] - hack_const_mutable_self=str(f.func.name) in ["set_data", "retain_grad"], ): return func(f) return wrapper @@ -224,11 +215,7 @@ def func(f: NativeFunction) -> Optional[str]: args_exprs_str = ', '.join(map(lambda a: a.name, args)) - # See Note [Byte-for-byte compatibility] - # (return void_func() is valid C++) return_kw = " return " - if returns_type == "void": - return_kw = " " cuda_guard = "" if dispatch is None or 'CUDA' in dispatch or 'Vulkan' == dispatch: @@ -241,14 +228,6 @@ def func(f: NativeFunction) -> Optional[str]: # Only tensor like arguments are eligible device_of = next((f'{a.name}' for a in candidate_args if a.type.is_tensor_like()), None) - # See Note [Byte-for-byte compatibility] - # I wasn't able to figure out the internal logic for - # these device guards - if str(f.func.name) == "_thnn_fused_lstm_cell_backward": - device_of = "cx" - elif str(f.func.name) == "_thnn_differentiable_lstm_cell_backward": - device_of = "input_gates" - has_tensor_options = any(isinstance(a.argument, TensorOptionsArguments) for a in args) # TODO: There is probably a simpler version of this that @@ -257,9 +236,6 @@ def func(f: NativeFunction) -> Optional[str]: cuda_guard = """\ const DeviceGuard device_guard(options.device()); """ - # See Note [Byte-for-byte compatibility] - if dispatch is not None: - cuda_guard = f"\n{cuda_guard}" elif f.device_guard and dispatch is not None and 'CUDA' in dispatch and has_tensor_options: cuda_guard = """\ globalContext().lazyInitCUDA(); @@ -269,16 +245,10 @@ def func(f: NativeFunction) -> Optional[str]: cuda_guard = f"""\ const OptionalDeviceGuard device_guard(device_of({device_of})); """ - # See Note [Byte-for-byte compatibility] - if dispatch is not None: - cuda_guard = f"\n{cuda_guard}" else: cuda_guard = """\ // DeviceGuard omitted """ - # See Note [Byte-for-byte compatibility] - if dispatch is not None: - cuda_guard = f"\n{cuda_guard}" return f"""\ {returns_type} {name}({args_str}) {{ @@ -290,7 +260,7 @@ def func(f: NativeFunction) -> Optional[str]: assert returns_type == dispatcher.returns_type(f.func.returns) dispatcher_args = dispatcher.arguments(f.func) dispatcher_args_types_str = ', '.join(map(lambda a: a.type, dispatcher_args)) - if dispatch is None: + if dispatch is None or dispatch == 'Math': type_name = f'TypeDefault::{name}' else: type_name = f'{dispatch}Type::{name}' @@ -304,14 +274,9 @@ def func(f: NativeFunction) -> Optional[str]: if not def_only and not f.manual_kernel_registration and (dispatch is not None or f.dispatch is None): # Figure out which signature the function is if local.use_c10_dispatcher() is UseC10Dispatcher.full: - # See Note [Byte-for-byte compatibility] - if dispatch is not None: - nl = "\n" - else: - nl = "" payload = "c10::impl::hacky_wrapper_for_legacy_signatures<" \ - f"{returns_type} ({dispatcher_args_types_str})>({nl}TORCH_FN({type_name}))" + f"{returns_type} ({dispatcher_args_types_str})>(TORCH_FN({type_name}))" else: payload = f"torch::CppFunction::makeUnboxedOnly(&{type_name})" @@ -336,6 +301,28 @@ def func(f: NativeFunction) -> Optional[str]: return func +# Return a string with a comma separated list of expressions that could be used +# to call this 
operator. This can be used to generate code that wraps operators +# and calls back into them. The process_tensoroptions argument determines how +# tensor options should be treated. They can be +# - PASS_THROUGH: Don't do anything, just handle them as regular arguments +# - SCATTER: Expect a `TensorOptions options` in the scope and scatter it into `options.dtype, ...` +# - GATHER: Expect `dtype, ...` in the scope and gather them into a TensorOptions for calling +def exprs_str(signature: CppSignature, + process_tensoroptions: dispatcher.ProcessTensoroptions = dispatcher.ProcessTensoroptions.PASS_THROUGH, + exclude_this: bool = False, + ) -> str: + args = signature.cpp_arguments() + if exclude_this: + args = [a for a in args if not isinstance(a.argument, ThisArgument)] + exprs = dispatcher.cpparguments_exprs(args, process_tensoroptions=process_tensoroptions) + return ', '.join(map(lambda a: a.expr, exprs)) + +def types_str(signature: CppSignature) -> str: + args = signature.cpp_arguments() + exprs = dispatcher.cpparguments_exprs(args, process_tensoroptions=dispatcher.ProcessTensoroptions.PASS_THROUGH) + return ', '.join(map(lambda a: a.type, exprs)) + # Generates Function.cpp and Function.h. These files provide the # functional public C++ API, and the scaffolding to call into # the dispatcher from these functions. See also compute_tensor_method. @@ -347,32 +334,73 @@ def go(f: NativeFunction) -> Optional[str]: if Variant.function not in f.variants: return None - name = cpp.name(f.func) - cpp_returns_type = cpp.returns_type(f.func.returns) - cpp_args = cpp.arguments(f.func) - cpp_args_str = ', '.join(map(str, cpp_args)) + cpp_name = cpp.name(f.func) + signature_group = cpp.signature_group(f.func, method=False) if target is Target.DECLARATION: - return f"CAFFE2_API {cpp_returns_type} {name}({cpp_args_str});" + if signature_group.gathered_signature is None: + # There's no TensorOptions + return f""" +CAFFE2_API {cpp_returns_type} {cpp_name}({signature_group.signature.cpp_arguments_str(with_defaults=True)}); +""" + else: + # There's TensorOptions in the API. Create 2 APIs - one taking the TensorOptions object ("gathered_signature"), + # and one taking a scattered signature with ScalarType, Layout, Device separately ("signature"). + # The gathered_signature already exists in several older PyTorch versions and had default arguments. + # For backward compatibility, we left it unchanged and added the scattered API on top of it. + # Note that the scattered API cannot have default arguments or calls will be ambigious. 
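# A small decision sketch of which generated C++ overload talks to the dispatcher
# directly and which one is a thin forwarding proxy, summarizing the declaration and
# definition branches around this point. "scattered" and "gathered" refer to the
# CppSignatureGroup members; use_c10_full stands in for
# `local.use_c10_dispatcher() is UseC10Dispatcher.full`. This helper is illustrative only.
def real_and_proxy(has_tensor_options, use_c10_full):
    if not has_tensor_options:
        return ("signature", None)           # only one overload is generated at all
    if use_c10_full:
        return ("scattered", "gathered")     # proxy scatters TensorOptions, then forwards
    return ("gathered", "scattered")         # proxy gathers dtype/layout/device, then forwards

assert real_and_proxy(False, True) == ("signature", None)
assert real_and_proxy(True, True) == ("scattered", "gathered")
assert real_and_proxy(True, False) == ("gathered", "scattered")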
+ return f""" +CAFFE2_API {cpp_returns_type} {cpp_name}({signature_group.gathered_signature.cpp_arguments_str(with_defaults=True)}); +CAFFE2_API {cpp_returns_type} {cpp_name}({signature_group.signature.cpp_arguments_str(with_defaults=False)}); +""" assert target is Target.DEFINITION - dispatcher_exprs = dispatcher.cpparguments_exprs(cpp_args) - cpp_args_str_no_default = ', '.join(map(lambda a: a.str_no_default(), cpp_args)) dispatcher_returns_type = dispatcher.returns_type(f.func.returns) - dispatcher_types_str = ', '.join(map(lambda a: a.type, dispatcher_exprs)) - dispatcher_exprs_str = ', '.join(map(lambda a: a.expr, dispatcher_exprs)) - return f""" + if signature_group.gathered_signature is None: + # There's no TensorOptions + return f""" +// aten::{f.func} +{cpp_returns_type} {cpp_name}({signature_group.signature.cpp_arguments_str(with_defaults=False)}) {{ + static auto op = c10::Dispatcher::singleton() + .findSchemaOrThrow("aten::{f.func.name.name}", "{f.func.name.overload_name}") + .typed<{dispatcher_returns_type} ({types_str(signature_group.signature)})>(); + return op.call({exprs_str(signature_group.signature)}); +}} +""" + elif local.use_c10_dispatcher() is UseC10Dispatcher.full: + # for c10-full ops, the scattered version is the real op and the gathered version is a proxy + # calling into the scattered version + return f""" +// aten::{f.func} +{cpp_returns_type} {cpp_name}({signature_group.signature.cpp_arguments_str(with_defaults=False)}) {{ + static auto op = c10::Dispatcher::singleton() + .findSchemaOrThrow("aten::{f.func.name.name}", "{f.func.name.overload_name}") + .typed<{dispatcher_returns_type} ({types_str(signature_group.signature)})>(); + return op.call({exprs_str(signature_group.signature)}); +}} +{cpp_returns_type} {cpp_name}({signature_group.gathered_signature.cpp_arguments_str(with_defaults=False)}) {{ + return {cpp_name}({exprs_str(signature_group.gathered_signature, dispatcher.ProcessTensoroptions.SCATTER)}); +}} +""" + else: + # for non-c10-full ops, the gathered version is the real op and the scattered version is a proxy + # calling into the gathered version + return f""" // aten::{f.func} -{cpp_returns_type} {name}({cpp_args_str_no_default}) {{ +{cpp_returns_type} {cpp_name}({signature_group.gathered_signature.cpp_arguments_str(with_defaults=False)}) {{ static auto op = c10::Dispatcher::singleton() .findSchemaOrThrow("aten::{f.func.name.name}", "{f.func.name.overload_name}") - .typed<{dispatcher_returns_type} ({dispatcher_types_str})>(); - return op.call({dispatcher_exprs_str}); + .typed<{dispatcher_returns_type} ({types_str(signature_group.gathered_signature)})>(); + return op.call({exprs_str(signature_group.gathered_signature)}); +}} +{cpp_returns_type} {cpp_name}({signature_group.signature.cpp_arguments_str(with_defaults=False)}) {{ + return {cpp_name}({exprs_str(signature_group.gathered_signature, dispatcher.ProcessTensoroptions.GATHER)}); }} """ + return go # Generates TensorBody.h (sic) and TensorMethods.cpp. 
These files provide the @@ -388,30 +416,78 @@ def go(f: NativeFunction) -> Optional[str]: assert len(f.func.arguments) > 0 assert sum(a.name == 'self' for a in f.func.arguments) == 1 - name = cpp.name(f.func) + cpp_name = cpp.name(f.func) cpp_returns_type = cpp.returns_type(f.func.returns) - cpp_args = cpp.arguments(f.func, method=True) - cpp_args_exclude_this = [a for a in cpp_args if not isinstance(a.argument, ThisArgument)] - cpp_args_exclude_this_str = ', '.join(str(a) for a in cpp_args_exclude_this) + signature_group = cpp.signature_group(f.func, method=True) if target is Target.DECLARATION: - return f"{cpp_returns_type} {name}({cpp_args_exclude_this_str}) const;" + if signature_group.gathered_signature is None: + # There's no TensorOptions. Just create the API without concern for TensorOptions. + return f"{cpp_returns_type} {cpp_name}({signature_group.signature.cpp_arguments_str(with_defaults=True)}) const;" + else: + # There's TensorOptions in the API. Create 2 APIs - one taking the TensorOptions object ("gathered_signature"), + # and one taking a scattered signature with ScalarType, Layout, Device separately ("signature"). + # The gathered_signature already exists in several older PyTorch versions and had default arguments. + # For backward compatibility, we left it unchanged and added the scattered API on top of it. + # Note that the scattered API cannot have default arguments or calls will be ambigious. + return f""" +{cpp_returns_type} {cpp_name}({signature_group.gathered_signature.cpp_arguments_str(with_defaults=True)}) const; +{cpp_returns_type} {cpp_name}({signature_group.signature.cpp_arguments_str(with_defaults=False)}) const; +""" assert target is Target.DEFINITION - dispatcher_exprs = dispatcher.cpparguments_exprs(cpp_args) - cpp_args_exclude_this_str_no_default = ', '.join(a.str_no_default() for a in cpp_args_exclude_this) dispatcher_returns_type = dispatcher.returns_type(f.func.returns) - dispatcher_types_str = ', '.join(map(lambda a: a.type, dispatcher_exprs)) - dispatcher_exprs_str = ', '.join(map(lambda a: a.expr, dispatcher_exprs)) - return f""" + result = f""" // aten::{f.func} -{cpp_returns_type} Tensor::{name}({cpp_args_exclude_this_str_no_default}) const {{ +{cpp_returns_type} Tensor::{cpp_name}({signature_group.signature.cpp_arguments_str(with_defaults=False)}) const {{ static auto op = c10::Dispatcher::singleton() .findSchemaOrThrow("aten::{f.func.name.name}", "{f.func.name.overload_name}") - .typed<{dispatcher_returns_type} ({dispatcher_types_str})>(); - return op.call({dispatcher_exprs_str}); + .typed<{dispatcher_returns_type} ({types_str(signature_group.signature)})>(); + return op.call({exprs_str(signature_group.signature)}); +}} +""" + + if signature_group.gathered_signature is None: + # There's no TensorOptions + return f""" +// aten::{f.func} +{cpp_returns_type} Tensor::{cpp_name}({signature_group.signature.cpp_arguments_str(with_defaults=False)}) const {{ + static auto op = c10::Dispatcher::singleton() + .findSchemaOrThrow("aten::{f.func.name.name}", "{f.func.name.overload_name}") + .typed<{dispatcher_returns_type} ({types_str(signature_group.signature)})>(); + return op.call({exprs_str(signature_group.signature)}); +}} +""" + elif local.use_c10_dispatcher() is UseC10Dispatcher.full: + # for c10-full ops, the scattered version is the real op and the gathered version is a proxy + # calling into the scattered version + return f""" +// aten::{f.func} +{cpp_returns_type} Tensor::{cpp_name}({signature_group.signature.cpp_arguments_str(with_defaults=False)}) 
const {{ + static auto op = c10::Dispatcher::singleton() + .findSchemaOrThrow("aten::{f.func.name.name}", "{f.func.name.overload_name}") + .typed<{dispatcher_returns_type} ({types_str(signature_group.signature)})>(); + return op.call({exprs_str(signature_group.signature)}); +}} +{cpp_returns_type} Tensor::{cpp_name}({signature_group.gathered_signature.cpp_arguments_str(with_defaults=False)}) const {{ + return {cpp_name}({exprs_str(signature_group.gathered_signature, dispatcher.ProcessTensoroptions.SCATTER, exclude_this=True)}); +}} +""" + else: + # for non-c10-full ops, the gathered version is the real op and the scattered version is a proxy + # calling into the gathered version + return f""" +// aten::{f.func} +{cpp_returns_type} Tensor::{cpp_name}({signature_group.gathered_signature.cpp_arguments_str(with_defaults=False)}) const {{ + static auto op = c10::Dispatcher::singleton() + .findSchemaOrThrow("aten::{f.func.name.name}", "{f.func.name.overload_name}") + .typed<{dispatcher_returns_type} ({types_str(signature_group.gathered_signature)})>(); + return op.call({exprs_str(signature_group.gathered_signature)}); +}} +{cpp_returns_type} Tensor::{cpp_name}({signature_group.signature.cpp_arguments_str(with_defaults=False)}) const {{ + return {cpp_name}({exprs_str(signature_group.gathered_signature, dispatcher.ProcessTensoroptions.GATHER, exclude_this=True)}); }} """ @@ -474,23 +550,35 @@ def go(f: NativeFunction) -> Optional[str]: dispatcher_returns_type = dispatcher.returns_type(f.func.returns) dispatcher_args = dispatcher.arguments(f.func) - dispatcher_exprs = dispatcher.legacydispatcherarguments_exprs(legacy_dispatcher_args) + + args: Union[Sequence[DispatcherArgument], Sequence[LegacyDispatcherArgument]] + if local.use_c10_dispatcher() is UseC10Dispatcher.full: + returns_type = dispatcher_returns_type + args = dispatcher_args + exprs = dispatcher.exprs(dispatcher_args) + dispatch_key = "c10::computeDispatchKey(dtype, layout, device)" + else: + returns_type = legacy_dispatcher_returns_type + args = legacy_dispatcher_args + exprs = dispatcher.legacydispatcherarguments_exprs(legacy_dispatcher_args) + dispatch_key = "options.computeDispatchKey()" if target is Target.DEFINITION: - # See Note [Byte-for-byte compatibility] # I don't think there's actually a good reason to generate # these two cases differently + # The first case could probably be improved though- it calls dispatchTypeId(), + # which looks at TLS dispatch keys- there should not be any by the time we reach backend select. 
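# A toy model of the dispatch-key computation in the backend-select hunk that follows:
# the key implied by the TensorOptions (or scattered dtype/layout/device) is unioned
# with the keys carried by any tensor arguments, keys at or above BackendSelect are
# masked out, and the highest-priority remaining key wins. Key names and their ordering
# here are illustrative, not the real c10 DispatchKey enum.
PRIORITY = ["AutogradCPU", "AutogradCUDA", "BackendSelect", "CUDA", "CPU"]

def compute_backend_key(options_key, tensor_arg_keys):
    dk_set = {options_key} | set(tensor_arg_keys)
    # mask: only keys strictly after BackendSelect are eligible
    eligible = PRIORITY[PRIORITY.index("BackendSelect") + 1:]
    for key in eligible:
        if key in dk_set:
            return key
    raise RuntimeError("no eligible dispatch key")

# A CUDA tensor argument pulls dispatch to CUDA even if the options say CPU.
assert compute_backend_key("CPU", ["CUDA"]) == "CUDA"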
if legacy_dispatcher_tensor_args: tensor_args = ', '.join(a.name for a in legacy_dispatcher_tensor_args) compute_dk = f"""\ -DispatchKeySet _dk_set = DispatchKeySet(options.computeDispatchKey()) | c10::detail::multi_dispatch_key_set({tensor_args}); +DispatchKeySet _dk_set = c10::DispatchKeySet({dispatch_key}) | c10::detail::multi_dispatch_key_set({tensor_args}); DispatchKeySet _dk_mask = c10::DispatchKeySet(DispatchKeySet::FULL_AFTER, DispatchKey::BackendSelect); DispatchKey _dk = c10::impl::dispatchTypeId(_dk_set, _dk_mask);""" else: - compute_dk = "DispatchKey _dk = options.computeDispatchKey();" + compute_dk = f"DispatchKey _dk = {dispatch_key};" return f"""\ // aten::{f.func} -{legacy_dispatcher_returns_type} {name}({', '.join(a.str_with_default() for a in legacy_dispatcher_args)}) {{ +{returns_type} {name}({', '.join(str(a) for a in args)}) {{ static auto op = c10::Dispatcher::singleton() .findSchemaOrThrow("aten::{f.func.name.name}", "{f.func.name.overload_name}") .typed<{dispatcher_returns_type} ({', '.join(a.type for a in dispatcher_args)})>(); @@ -499,7 +587,7 @@ def go(f: NativeFunction) -> Optional[str]: // This trick allows calling Autograd backend kernel first and then backend kernel, // without adding another AutogradBackendSelect dispatch key. DispatchKey _current_dk = at::impl::variable_excluded_from_dispatch() ? _dk : _autograd_dk; - return op.callWithDispatchKey(_current_dk, {', '.join(a.expr for a in dispatcher_exprs)}); + return op.callWithDispatchKey(_current_dk, {', '.join(a.expr for a in exprs)}); }} """ elif target is Target.REGISTRATION: @@ -638,23 +726,8 @@ def compute_returns_yaml(f: NativeFunction) -> Tuple[List[Dict[str, str]], Dict[ name = f.func.out_arguments[i].name # If the return argument is explicitly named... elif r.name: - # See Note [Byte-for-byte compatibility] - # - # Check if it would conflict with an existing argument. - # Downstream codegen assumes that return names and argument - # names don't conflict with each other, so we disambiguate - # (by adding a trailing _return) this case. Notice that - # historically, the collision check was buggy: it just did a - # straight string contains test on the entirety of the - # inputs part of the format string, meaning that it also - # picked up occurrences of the argument name in the NAME of - # the function, as well as substring occurrences of the name - # in arguments. We have simulated the old logic here... - buggy_name_conflict = r.name in str(f.func.name) or \ - any(r.name in a.name for a in f.func.schema_order_arguments()) - # ... 
but a more correct version is simply - # name_conflict = any(r.name == a.name for a in f.func.schema_order_arguments()) - if buggy_name_conflict and not f.func.is_out_fn(): + name_conflict = any(r.name == a.name for a in f.func.schema_order_arguments()) + if name_conflict and not f.func.is_out_fn(): name = f'{r.name}_return' else: name = r.name @@ -715,20 +788,9 @@ def compute_argument_yaml(a: Argument, *, schema_order: bool, kwarg_only_set: Se arg['default'] = pythonify_default(cpp.default_expr(a.default, a.type)) if a.name in kwarg_only_set: arg['kwarg_only'] = True - # See Note [Byte-for-byte compatibility] - # The default value of kwarg_only is False; this case exists for - # byte-for-byte compatibility - elif a.name in out_arg_set: - arg['kwarg_only'] = False if a.name in out_arg_set: arg['output'] = True - # See Note [Byte-for-byte compatibility] - # This is probably a bug in the original implementation, where - # the specification of allocate was not properly propagated to - # the schema-order arguments. In any case, this field - # is redundant with the output field - if not schema_order: - arg['allocate'] = True + arg['allocate'] = True # See Note [name and field_name] if a.name in name_to_field_name: arg['field_name'] = name_to_field_name[a.name] @@ -748,7 +810,8 @@ def compute_declaration_yaml(f: NativeFunction) -> object: kwarg_only_set = set(a.name for a in f.func.kwarg_only_arguments) out_arg_set = set(a.name for a in f.func.out_arguments) - cpp_args = cpp.arguments(f.func) + signature_group = cpp.signature_group(f.func) + cpp_args = signature_group.signature_prefer_gathered().cpp_arguments() arguments = [ compute_cpp_argument_yaml( cpp_a, schema_order=False, @@ -756,9 +819,7 @@ def compute_declaration_yaml(f: NativeFunction) -> object: for cpp_a in cpp_args ] - # See Note [Byte-for-byte compatibility] - # NB: NOT actually schema order. This is almost certainly a BUG. 
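# The ordering fix in the removed/added lines just below, shown on a toy schema: the old
# code chained positional, out, then kwarg-only arguments, while true schema order is
# positional, kwarg-only, then out (as schema_order_arguments() defines it). The argument
# names here are made up for illustration.
positional = ["self", "other"]
kwarg_only = ["alpha"]
out_args = ["out"]

old_order = positional + out_args + kwarg_only      # what the removed chain produced
schema_order = positional + kwarg_only + out_args   # f.func.schema_order_arguments()

assert old_order == ["self", "other", "out", "alpha"]
assert schema_order == ["self", "other", "alpha", "out"]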
- schema_order_jit_arguments = list(itertools.chain(f.func.arguments, f.func.out_arguments, f.func.kwarg_only_arguments)) + schema_order_jit_arguments = list(f.func.schema_order_arguments()) schema_order_arguments = [ compute_argument_yaml( @@ -811,8 +872,20 @@ def compute_declaration_yaml(f: NativeFunction) -> object: ('device_guard', f.device_guard), ('with_gil', False), ('deprecated', False), + ('has_math_kernel', f.dispatch is not None and 'Math' in f.dispatch), ]) +@with_native_function +def compute_registration_declarations(f: NativeFunction) -> str: + name = dispatcher.name(f.func) + returns_type = dispatcher.returns_type(f.func.returns) + args = dispatcher.arguments(f.func) + args_str = ', '.join(map(str, args)) + dispatch = f.dispatch is not None + math = dispatch and 'Math' in f.dispatch # type: ignore + return f"""{returns_type} {name}({args_str}); // {{"schema": "aten::{f.func}", "dispatch": "{dispatch}", "math": "{math}"}} +""" + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # # # RUN IT ALL @@ -913,11 +986,6 @@ def main() -> None: nargs='*', help='filter dispatch backend by the whitelist (if set), ' 'e.g.: CPU CUDA QuantizedCPU ...') - parser.add_argument( - '--per_op_registration', - action='store_true', - help='group function registrations by op name and write to separate files; ' - 'must also set --op_registration_whitelist param') parser.add_argument( '--force_schema_registration', action='store_true', @@ -1010,23 +1078,36 @@ def make_file_manager(install_dir: str) -> FileManager: 'function_registrations': list(mapMaybe( compute_type_method( dispatch, target=Target.REGISTRATION, op_registration_whitelist=op_registration_whitelist), - native_functions - )) if not options.per_op_registration else [], + native_functions)), }) del fm cpu_fm.write('TypeDefault.h', lambda: { - 'type_method_declarations': list(mapMaybe( + 'type_method_declarations': + list(mapMaybe( compute_type_method(None, target=Target.DECLARATION, op_registration_whitelist=op_registration_whitelist), + native_functions)) + + list(mapMaybe( + compute_type_method('Math', target=Target.DECLARATION, op_registration_whitelist=op_registration_whitelist), native_functions)), + }) cpu_fm.write('TypeDefault.cpp', lambda: { - 'type_method_definitions': list(mapMaybe( + 'type_method_definitions': + list(mapMaybe( compute_type_method(None, target=Target.DEFINITION, op_registration_whitelist=op_registration_whitelist), + native_functions)) + + list(mapMaybe( + compute_type_method('Math', target=Target.DEFINITION, op_registration_whitelist=op_registration_whitelist), native_functions)), + 'function_registrations': list(mapMaybe( compute_type_method(None, target=Target.REGISTRATION, op_registration_whitelist=op_registration_whitelist), - native_functions)) if not options.per_op_registration else [], + native_functions)), + + 'math_function_registrations': list(mapMaybe( + compute_type_method('Math', target=Target.REGISTRATION, op_registration_whitelist=op_registration_whitelist), + native_functions)), }) cpu_fm.write('Functions.h', lambda: { 'function_declarations': list(mapMaybe(compute_function(target=Target.DECLARATION), native_functions)), @@ -1058,53 +1139,15 @@ def computeSchemaRegister() -> Dict[str, object]: schema_registrations = list(mapMaybe( compute_type_method(None, target=Target.REGISTRATION, op_registration_whitelist=None, def_only=True), native_functions)) - # See Note [Byte-for-byte compatibility] - schema_registrations.sort() return { 'schema_registrations': schema_registrations, } 
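# A worked example of the single-line format compute_registration_declarations (defined
# above) emits into RegistrationDeclarations.h (written out a few lines below): a C++
# declaration followed by a JSON-ish trailer recording the schema and whether the op has
# a dispatch section and a Math kernel. The concrete operator shown is hypothetical.
import json

def registration_declaration(returns_type, name, args_str, schema, has_dispatch, has_math):
    trailer = json.dumps({"schema": schema, "dispatch": str(has_dispatch), "math": str(has_math)})
    return f"{returns_type} {name}({args_str}); // {trailer}"

print(registration_declaration(
    "Tensor", "example_op", "const Tensor & self",
    "aten::example_op(Tensor self) -> Tensor", True, False))
# Tensor example_op(const Tensor & self); // {"schema": "aten::example_op(Tensor self) -> Tensor", "dispatch": "True", "math": "False"}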
cpu_fm.write('SchemaRegister.cpp', computeSchemaRegister) - if options.per_op_registration: - def gen_per_op_registration_filename(opname: str) -> str: - return 'pt_op_register_{}.cpp'.format(opname.replace(':', '-')) - - if op_registration_whitelist is None: - raise Exception("Must set --op_registration_whitelist for per-op registration.") - - # First, group all native functions by unoverloaded operator name - grouped_functions : DefaultDict[str, List[NativeFunction]] = DefaultDict(list) - for f in native_functions: - grouped_functions[f"aten::{f.func.name.name}"].append(f) - extra_headers = [] - for b in backends: - extra_headers.append(f'#include ') - - # Next, generate registration for each one - for name in op_registration_whitelist: - def computePerOpRegistration() -> Dict[str, object]: - fs = grouped_functions[name] - registrations: List[str] = [] - for mb_dispatch in itertools.chain([None], backends): - # or you could pass in op_registration_whitelist, it doesn't - # matter! - # NB: Use of compute_type_method here is kind of an abuse; - # this is why we have to unconditionally write in - # torch::dispatch in the registration when it should be - # contextually clear - registrations.extend( - mapMaybe( - compute_type_method(mb_dispatch, target=Target.REGISTRATION, op_registration_whitelist=None), - fs)) - return { - 'extra_headers': extra_headers, - 'function_registrations': registrations, - } - - cpu_fm.write_with_template( - gen_per_op_registration_filename(name), 'PerOpRegistration.cpp', computePerOpRegistration) - cpu_fm.write('Declarations.yaml', lambda: format_yaml(list(map(compute_declaration_yaml, native_functions)))) + cpu_fm.write('RegistrationDeclarations.h', lambda: { + 'registration_declarations': list(map(compute_registration_declarations, native_functions)), + }) if options.output_dependencies: cpu_fm.write_outputs(options.output_dependencies) diff --git a/tools/codegen/local.py b/tools/codegen/local.py index 9244cb181aec..41deef4884f0 100644 --- a/tools/codegen/local.py +++ b/tools/codegen/local.py @@ -18,7 +18,6 @@ class Locals(threading.local): use_c10_dispatcher: Optional[UseC10Dispatcher] = None - hack_const_mutable_self: bool = False _locals = Locals() # The use_c10_dispatcher field in native_functions.yaml is used to @@ -31,19 +30,11 @@ def use_c10_dispatcher() -> UseC10Dispatcher: "need to initialize local.use_c10_dispatcher with local.parametrize" return _locals.use_c10_dispatcher -# This is used to maintain compat, see Note [Byte-for-byte compatibility] -# It can be removed when we drop compat. 
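# A self-contained sketch of the thread-local parametrization pattern that
# tools/codegen/local.py is reduced to in the hunk around this point: a context manager
# sets a per-thread value for the duration of a with-block and restores the previous
# value afterwards. Names here are generic, not the actual tools.codegen.local API.
import threading
from contextlib import contextmanager

_locals = threading.local()

def current_mode():
    assert getattr(_locals, "mode", None) is not None, "must be inside parametrize()"
    return _locals.mode

@contextmanager
def parametrize(mode):
    old = getattr(_locals, "mode", None)
    _locals.mode = mode
    try:
        yield
    finally:
        _locals.mode = old

with parametrize("full"):
    assert current_mode() == "full"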
-def hack_const_mutable_self() -> bool: - return _locals.hack_const_mutable_self - @contextmanager -def parametrize(*, use_c10_dispatcher: UseC10Dispatcher, hack_const_mutable_self: bool) -> Iterator[None]: +def parametrize(*, use_c10_dispatcher: UseC10Dispatcher) -> Iterator[None]: old_use_c10_dispatcher = _locals.use_c10_dispatcher - old_hack_const_mutable_self = _locals.hack_const_mutable_self try: _locals.use_c10_dispatcher = use_c10_dispatcher - _locals.hack_const_mutable_self = hack_const_mutable_self yield finally: _locals.use_c10_dispatcher = old_use_c10_dispatcher - _locals.hack_const_mutable_self = old_hack_const_mutable_self diff --git a/tools/codegen/model.py b/tools/codegen/model.py index b0c470c91b6a..7dd1f6ff505c 100644 --- a/tools/codegen/model.py +++ b/tools/codegen/model.py @@ -1,7 +1,7 @@ import re from dataclasses import dataclass -from typing import List, Sequence, Dict, Optional, Iterator, Tuple, Set, NoReturn +from typing import List, Dict, Optional, Iterator, Tuple, Set, NoReturn from enum import Enum import itertools @@ -197,6 +197,8 @@ def __post_init__(self) -> None: "otherwise you will tickle a Python argument binding bug " \ "(which usually manifests itself as the result variable being undefined.)" +SchemaKind = Enum('SchemaKind', ('functional', 'inplace', 'out')) + # The function schema is undoubtedly the most important data structure # in all of the codegen, as it defines the type signature for operators, # and most of the code generation we do is type directed (e.g., look at @@ -255,18 +257,17 @@ class FunctionSchema: # The name of the operator this function schema describes. name: 'OperatorName' - # NB: Sequence here is intentional, to make it read only - arguments: Sequence['Argument'] - kwarg_only_arguments: Sequence['Argument'] # but not including out args + arguments: Tuple['Argument', ...] + kwarg_only_arguments: Tuple['Argument', ...] # but not including out args # Unlike in the previous codegen, we have factored out 'out' arguments # in the canonical representation, removing them from kwarg # arguments. This choice is justified by numerous downstream # transformations which treat out arguments specially; additionally, # you can see that canonicity is not violated! - out_arguments: Sequence['Argument'] # these are also kwarg-only + out_arguments: Tuple['Argument', ...] # these are also kwarg-only # TODO: Need to handle collisions with argument names at some point - returns: Sequence['Return'] + returns: Tuple['Return', ...] def schema_order_arguments(self) -> Iterator['Argument']: return itertools.chain(self.arguments, self.kwarg_only_arguments, self.out_arguments) @@ -303,7 +304,11 @@ def __post_init__(self) -> None: if self.name.name.inplace: # TODO: fixme if str(self.name) not in [ - '_amp_non_finite_check_and_unscale_', + '_amp_foreach_non_finite_check_and_unscale_', + '_foreach_add_scalar_list_', + '_foreach_sub_scalar_list_', + '_foreach_mul_scalar_list_', + '_foreach_div_scalar_list_', '_foreach_add_.Scalar', '_foreach_sub_.Scalar', '_foreach_mul_.Scalar', @@ -347,6 +352,76 @@ def is_out_fn(self) -> bool: # we only do this check in tools/ return bool(self.out_arguments) + def kind(self) -> SchemaKind: + """ + What kind of schema is this? A functional schema is one + that returns a newly allocated output; an inplace schema + modifies the self argument inplace; an out schema writes + the result into an explicitly provided out argument. 
+ """ + is_inplace = self.name.name.inplace + is_out = bool(self.out_arguments) + assert not (is_inplace and is_out) + if is_inplace: + return SchemaKind.inplace + elif is_out: + return SchemaKind.out + else: + return SchemaKind.functional + + # WARNING: This method is not currently tested in any meaningful way + def signature(self) -> 'FunctionSchema': + """ + Certain schemas are 'related', in that they are simply + inplace/out/functional versions of the same function. This method + factors these schemas into the "core" functional signature which + is equal across all versions. + + Here is what normalization happens to the schema to convert + it to a signature: + - The overload name is stripped (name is retained, since + it expresses semantic content about what the function does) + - Inplace is set False + - Out arguments are stripped + - Mutability annotations are stripped (this is sound + because you cannot overload on mutability annotation) + + This function is based off of get_signature in + tools.autograd.load_derivatives + """ + + # dataclasses.replace could be used here, but it is less + # type safe so for now I've opted to type everything out + def strip_arg_annotation(a: Argument) -> Argument: + return Argument( + name=a.name, + type=a.type, + default=a.default, # hmmm + annotation=None, + ) + + def strip_ret_annotation(r: Return) -> Return: + return Return( + name=r.name, + type=r.type, + annotation=None, + ) + + return FunctionSchema( + name=OperatorName( + name=BaseOperatorName( + base=self.name.name.base, + inplace=False, + dunder_method=self.name.name.dunder_method, + ), + overload_name="", # stripped + ), + arguments=tuple(map(strip_arg_annotation, self.arguments)), + kwarg_only_arguments=tuple(map(strip_arg_annotation, self.kwarg_only_arguments)), + out_arguments=(), # stripped + returns=tuple(map(strip_ret_annotation, self.returns)), + ) + def __str__(self) -> str: all_arguments: List[str] = [] all_arguments.extend(map(str, self.arguments)) @@ -372,14 +447,14 @@ def __str__(self) -> str: class Annotation: # Typically only has one element. Not actually a set so # we can conveniently assume it is canonically ordered - alias_set: Sequence[str] + alias_set: Tuple[str, ...] is_write: bool @staticmethod def parse(ann: str) -> 'Annotation': m = re.match(r'^([a-z])(!?)$', ann) assert m is not None, f'unrecognized alias annotation {ann}' - alias_set = [m.group(1)] + alias_set = (m.group(1),) is_write = m.group(2) == '!' 
r = Annotation(alias_set=alias_set, is_write=is_write) assert str(r) == ann, f'{r} != {ann}' @@ -725,21 +800,18 @@ def __str__(self) -> str: # Helper functions for parsing argument lists (both inputs and returns) -def parse_returns(return_decl: str) -> Sequence[Return]: +def parse_returns(return_decl: str) -> Tuple[Return, ...]: """ Input: '()' Output: [] """ if return_decl == '()': - return [] + return () if return_decl[0] == '(' and return_decl[-1] == ')': return_decl = return_decl[1:-1] - returns = [] - for arg in return_decl.split(', '): - returns.append(Return.parse(arg)) - return returns + return tuple(Return.parse(arg) for arg in return_decl.split(', ')) -def parse_arguments(args: str) -> Tuple[Sequence[Argument], Sequence[Argument], Sequence[Argument]]: +def parse_arguments(args: str) -> Tuple[Tuple[Argument, ...], Tuple[Argument, ...], Tuple[Argument, ...]]: """ Input: 'int x, int y, int z' Output: positional args, kwarg only args @@ -774,4 +846,4 @@ def parse_arguments(args: str) -> Tuple[Sequence[Argument], Sequence[Argument], assert arguments_acc is not out_arguments arguments_acc.append(parg) - return arguments, kwarg_only_arguments, out_arguments + return tuple(arguments), tuple(kwarg_only_arguments), tuple(out_arguments) diff --git a/tools/pyi/gen_pyi.py b/tools/pyi/gen_pyi.py index 4b91abf1c6c7..576a0b39f501 100644 --- a/tools/pyi/gen_pyi.py +++ b/tools/pyi/gen_pyi.py @@ -1,4 +1,4 @@ -from __future__ import print_function + import os import collections from pprint import pformat @@ -74,11 +74,7 @@ # Somehow, these are defined in both _C and in functional. Ick! 'broadcast_tensors', # Manually define named tensor type stubs in __init__.pyi.in - 'rename', - 'refine_names', - 'align_to', 'align_tensors', - 'unflatten', 'meshgrid', 'cartesian_prod', 'block_diag', @@ -87,7 +83,6 @@ 'stft', 'istft', 'tensordot', - 'norm', 'split', 'unique_consecutive', 'atleast_1d', @@ -151,6 +146,7 @@ def type_to_python(typename, size=None): 'Dimname': 'Union[str, ellipsis, None]', 'DimnameList': 'Sequence[Union[str, ellipsis, None]]', 'QScheme': '_qscheme', + 'ArrayRef' : 'Sequence[float]' }[typename] return typename @@ -404,6 +400,14 @@ def gen_nn_functional(out): } write(out, 'torch/nn/functional.pyi', stubs, env) + # functional.pyi already contains the definitions for those functions + # so, we don't export then to it + from_c.extend(['hardtanh', 'leaky_relu', 'hardsigmoid']) + dispatch_code = ["{}: Callable".format(_) for _ in (dispatches + from_c)] + env = { + 'imported_hints': import_code, + 'dispatched_hints': dispatch_code + } stubs = CodeTemplate.from_file(os.path.join('torch', '_C', '_nn.pyi.in')) write(out, 'torch/_C/_nn.pyi', stubs, env) @@ -470,10 +474,12 @@ def gen_pyi(declarations_path, out): ' generator: Optional[Generator]=None, {}) -> Tensor: ...' .format(FACTORY_PARAMS)], 'full': ['def full(size: _size, fill_value: Number, *,' - ' out: Optional[Tensor]=None, {}) -> Tensor: ...' + ' out: Optional[Tensor]=None,' + ' layout: _layout=strided, {}) -> Tensor: ...' .format(FACTORY_PARAMS), 'def full(size: _size, fill_value: Number, *,' - ' names: List[Union[str, None]], {}) -> Tensor: ...' + ' names: List[Union[str, None]],' + ' layout: _layout=strided, {}) -> Tensor: ...' 
.format(FACTORY_PARAMS)], 'is_grad_enabled': ['def is_grad_enabled() -> _bool: ...'], 'nonzero': ['def nonzero(input: Tensor, *, out: Optional[Tensor]=None) -> Tensor: ...', @@ -536,6 +542,7 @@ def gen_pyi(declarations_path, out): 'def __init__(self, other: Tensor) -> None: ...', 'def __init__(self, size: {}, *, {}) -> None: ...'.format(type_to_python('IntArrayRef'), DEVICE_PARAM), ], + 'as_subclass': ["def as_subclass(self, cls: Tensor) -> Tensor: ..."], # clamp has no default values in the Declarations 'clamp': ["def clamp(self, min: _float=-inf, max: _float=inf," " *, out: Optional[Tensor]=None) -> Tensor: ..."], @@ -546,6 +553,7 @@ def gen_pyi(declarations_path, out): 'tolist': ['def tolist(self) -> List: ...'], 'requires_grad_': ['def requires_grad_(self, mode: _bool=True) -> Tensor: ...'], 'element_size': ['def element_size(self) -> _int: ...'], + 'data_ptr': ['def data_ptr(self) -> _int: ...'], 'dim': ['def dim(self) -> _int: ...'], 'nonzero': ['def nonzero(self, *, as_tuple: _bool=...) -> Tensor: ...'], 'numel': ['def numel(self) -> _int: ...'], @@ -576,6 +584,10 @@ def gen_pyi(declarations_path, out): ], 'item': ["def item(self) -> Number: ..."], 'copy_': ["def copy_(self, src: Tensor, non_blocking: _bool=False) -> Tensor: ..."], + 'set_': ['def set_(self, storage: Storage, offset: _int, size: _size, stride: _size) -> Tensor: ...', + 'def set_(self, storage: Storage) -> Tensor: ...'], + 'split': ['def split(self, split_size: _int, dim: _int=0) -> Sequence[Tensor]: ...', + 'def split(self, split_size: Tuple[_int, ...], dim: _int=0) -> Sequence[Tensor]: ...'], }) for binop in ['mul', 'div', 'true_divide', 'floor_divide']: for inplace in [False, True]: @@ -632,7 +644,7 @@ def gen_pyi(declarations_path, out): for c in ('Double', 'Float', 'Long', 'Int', 'Short', 'Char', 'Byte', 'Bool', 'Half', 'BFloat16', 'ComplexDouble', - 'ComplexFloat', 'QUInt8', 'QInt8', 'QInt32'): + 'ComplexFloat', 'QUInt8', 'QInt8', 'QInt32', 'QUInt4x2'): legacy_storage_base_hints.append('class {}StorageBase(object): ...'.format(c)) legacy_class_hints = [] @@ -650,7 +662,7 @@ def gen_pyi(declarations_path, out): ['float32', 'float', 'float64', 'double', 'float16', 'bfloat16', 'half', 'uint8', 'int8', 'int16', 'short', 'int32', 'int', 'int64', 'long', 'complex32', 'complex64', 'cfloat', 'complex128', 'cdouble', - 'quint8', 'qint8', 'qint32', 'bool']] + 'quint8', 'qint8', 'qint32', 'bool', 'quint4x2']] # Generate __all__ directive # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/tools/setup_helpers/cmake.py b/tools/setup_helpers/cmake.py index d5db749d1552..abbfb6e7a65f 100644 --- a/tools/setup_helpers/cmake.py +++ b/tools/setup_helpers/cmake.py @@ -1,6 +1,6 @@ "Manages CMake." -from __future__ import print_function + import multiprocessing import os @@ -245,6 +245,7 @@ def generate(self, version, cmake_python_library, build_python, build_test, my_e 'MKL_THREADING', 'MKLDNN_CPU_RUNTIME', 'MSVC_Z7_OVERRIDE', + 'CAFFE2_USE_MSVC_STATIC_RUNTIME', 'Numa_INCLUDE_DIR', 'Numa_LIBRARIES', 'ONNX_ML', diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index b78dc4a362a7..f64025c34683 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -46,7 +46,7 @@ append_filelist("libtorch_python_core_sources" TORCH_PYTHON_SRCS) # NB: This has to match the condition under which the JIT test directory # is included (at the time of writing that's in caffe2/CMakeLists.txt). 
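# A few runtime calls that the type-stub additions in the gen_pyi.py hunks above are
# meant to cover. This assumes a working torch install and only exercises public APIs;
# it is not part of the code generation itself.
import torch

t = torch.full((2, 3), 1.5, layout=torch.strided)   # full() overloads now spell out layout
print(t.data_ptr())                                  # data_ptr(self) -> _int
halves = t.split(2, dim=1)                           # split by a fixed chunk size...
uneven = t.split([1, 2], dim=1)                      # ...or by an explicit list of sizes
print([x.shape for x in halves], [x.shape for x in uneven])

class LoggingTensor(torch.Tensor):
    pass

lt = t.as_subclass(LoggingTensor)                    # as_subclass(self, cls) -> Tensor
print(type(lt).__name__)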
-if(BUILD_TEST AND NOT USE_ROCM) +if(BUILD_TEST) add_definitions(-DBUILDING_TESTS) list(APPEND TORCH_PYTHON_SRCS ${TORCH_ROOT}/test/cpp/jit/torch_python_test.cpp @@ -66,6 +66,9 @@ set(TORCH_PYTHON_INCLUDE_DIRECTORIES ${CMAKE_BINARY_DIR}/third_party ${CMAKE_BINARY_DIR}/third_party/onnx + ${TORCH_ROOT}/third_party/valgrind/callgrind + ${TORCH_ROOT}/third_party/valgrind/include + ${TORCH_ROOT}/third_party/gloo ${TORCH_ROOT}/third_party/onnx ${pybind11_INCLUDE_DIRS} @@ -160,25 +163,28 @@ endif() if(USE_DISTRIBUTED) list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_DISTRIBUTED) - if(NOT MSVC) + if(WIN32) + append_filelist("libtorch_python_distributed_core_sources" TORCH_PYTHON_SRCS) + else() + list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_RPC) append_filelist("libtorch_python_distributed_sources" TORCH_PYTHON_SRCS) - # Disable certain warnings for GCC-9.X - if(CMAKE_COMPILER_IS_GNUCXX AND (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 9.0.0)) - set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/autograd/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") - set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/c10d/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") - set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/rpc/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") - set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/rpc/testing/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") - endif() - list(APPEND TORCH_PYTHON_LINK_LIBRARIES c10d) - list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_C10D) - if(USE_TENSORPIPE) - list(APPEND TORCH_PYTHON_LINK_LIBRARIES tensorpipe) - list(APPEND TORCH_PYTHON_PUBLIC_COMPILE_DEFINITIONS USE_TENSORPIPE) - endif() endif() + # Disable certain warnings for GCC-9.X + if(CMAKE_COMPILER_IS_GNUCXX AND (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 9.0.0)) + set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/autograd/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") + set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/rpc/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") + set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/rpc/testing/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") + set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/c10d/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") + endif() + if(USE_TENSORPIPE) + list(APPEND TORCH_PYTHON_LINK_LIBRARIES tensorpipe) + list(APPEND TORCH_PYTHON_PUBLIC_COMPILE_DEFINITIONS USE_TENSORPIPE) + endif() + list(APPEND TORCH_PYTHON_LINK_LIBRARIES c10d) + list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_C10D) endif() -if(USE_NCCL) +if(USE_NCCL AND NOT WIN32) list(APPEND TORCH_PYTHON_SRCS ${TORCH_SRC_DIR}/csrc/cuda/python_nccl.cpp) list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_NCCL) diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in index 0d48ea710fdd..9ccc5f7cb899 100644 --- a/torch/_C/__init__.pyi.in +++ b/torch/_C/__init__.pyi.in @@ -28,7 +28,7 @@ class device: # THPDevice_pynew @overload - def __init__(self, device: Union[_int, str]) -> None: ... + def __init__(self, device: Union[_device, _int, str]) -> None: ... @overload def __init__(self, type: str, index: _int) -> None: ... @@ -87,6 +87,9 @@ ${dtype_class_hints} class layout: ... +# Defined in torch/csrc/utils/disable_torch_function.cpp +def DisableTorchFunction(): ... + # Defined in torch/csrc/utils/tensor_layouts.cpp strided : layout = ... sparse_coo : layout = ... 
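The widened ``device`` overload above (``Union[_device, _int, str]``) documents that an existing ``torch.device`` can be passed back to the constructor; a hedged usage sketch::

    import torch

    d = torch.device('cuda', 0) if torch.cuda.is_available() else torch.device('cpu')
    # Code that normalizes "anything device-like" can simply re-wrap its input:
    same = torch.device(d)
    assert same == d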
@@ -103,8 +106,12 @@ preserve_format: memory_format = ... # Defined in torch/csrc/QScheme.cpp class qscheme: ... -# Defined in torch/csrc/utils/tensor_qschemes.cpp +# Defined in torch/csrc/utils/tensor_qschemes.h per_tensor_affine: qscheme = ... +per_channel_affine: qscheme = ... +per_tensor_symmetric: qscheme = ... +per_channel_symmetric: qscheme = ... +per_channel_affine_float_qparams: qscheme = ... # Defined in torch/csrc/autograd/python_function.cpp class _FunctionBase(object): @@ -132,6 +139,8 @@ class Future(object): def then(self, callback: Callable) -> Future: ... def set_result(self, result: Any) -> None: ... +def _jit_set_num_profiled_runs(num: _size) -> _size: ... + # Defined in torch/csrc/jit/passes/xnnpack_rewrite.h class MobileOptimizerType: ... @@ -170,6 +179,20 @@ def _jit_set_texpr_fuser_enabled(enable: _bool): ... def _jit_set_nvfuser_enabled(enable: _bool) -> _bool: ... def _jit_pass_canonicalize(graph: Graph): ... def _jit_pass_erase_shape_information(graph: Graph): ... +def _jit_pass_fold_convbn(module: 'torch.jit.ScriptModule'): ... +def _jit_pass_insert_observers(module: 'torch.jit.ScriptModule', + method_name: str, + qconfig_dict: Dict[str, Any], + inplace: _bool, + quant_type: _int): ... +def _jit_pass_insert_quant_dequant(module: 'torch.jit.ScriptModule', + method_name: str, + inplace: _bool, + debug: _bool, + quant_type: _int): ... +def _jit_pass_quant_finalize(module: 'torch.jit.ScriptModule', + quant_type: _int, + preserved_attrs: Sequence[str]): ... def _jit_set_profiling_executor(profiling_flag: _bool) -> _bool: ... def _jit_set_profiling_mode(profiling_flag: _bool) -> _bool: ... def _jit_try_infer_type(obj: Any) -> JitType: ... @@ -224,6 +247,7 @@ def _jit_script_compile( def _jit_script_class_compile( qual_name: str, definition: ClassDef, + defaults: Dict[str, Dict[str, Any]], rcb: ResolutionCallback, ): ... def _parse_source_def(src: str) -> Def: ... @@ -364,6 +388,10 @@ def _vmapmode_increment_nesting() -> _int: ... # THPModule_vmapmode_increment_n def _vmapmode_decrement_nesting() -> _int: ... # THPModule_vmapmode_decrement_nesting def _log_api_usage_once(str) -> None: ... # LogAPIUsageOnceFromPython +# Defined in `valgrind.h` and `callgrind.h` respectively. +def valgrind_supported_platform() -> _bool: ... # NVALGRIND +def valgrind_toggle() -> None: ... # CALLGRIND_TOGGLE_COLLECT + has_openmp: _bool has_mkl: _bool has_lapack: _bool @@ -379,8 +407,8 @@ def is_grad_enabled() -> _bool: ... def set_autocast_enabled(enabled: _bool) -> None: ... def is_autocast_enabled() -> _bool: ... def clear_autocast_cache() -> None: ... -def autocast_increment_nesting() -> None: ... -def autocast_decrement_nesting() -> None: ... +def autocast_increment_nesting() -> _int: ... +def autocast_decrement_nesting() -> _int: ... def set_anomaly_enabled(enabled: _bool) -> None: ... def is_anomaly_enabled() -> _bool: ... @@ -489,6 +517,7 @@ class _TensorBase(object): def _cuda_getCurrentStream(device: _int) -> _int: ... def _cuda_getDefaultStream(device: _int) -> _int: ... def _cuda_getCurrentBlasHandle() -> _int: ... +def _cuda_setDevice(device: _int) -> None: ... def _cuda_setStream(cuda_stream: _int) -> None: ... def _cuda_getCompiledVersion() -> _int: ... def _cuda_cudaHostAllocator() -> _int: ... @@ -503,6 +532,32 @@ def _cuda_lock_mutex() -> None: ... def _cuda_unlock_mutex() -> None: ... def _nccl_version() -> _int: ... def _nccl_unique_id() -> bytes: ... +def _nccl_init_rank(nranks: _int, comm_id: bytes, rank: _int) -> object: ...
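The two Callgrind hooks declared above suggest a toggle-around-a-region pattern; a hedged sketch, assuming the bindings are exposed on ``torch._C`` exactly as the stub spells them (the wrapper name here is illustrative)::

    import torch

    def run_collected(fn):
        # Only meaningful under `valgrind --tool=callgrind --collect-atstart=no ...`;
        # elsewhere it degrades to a plain call.
        if not torch._C.valgrind_supported_platform():
            return fn()
        torch._C.valgrind_toggle()      # CALLGRIND_TOGGLE_COLLECT: start collecting
        try:
            return fn()
        finally:
            torch._C.valgrind_toggle()  # stop collecting

    run_collected(lambda: torch.ones(8).sum())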
+def _nccl_reduce(input: Sequence[Tensor], + output: Tensor, + root: _int, + op: _int, + streams: Optional[Sequence[_CudaStreamBase]], + comms: Optional[Sequence[object]]) -> None: ... +def _nccl_all_reduce(input: Sequence[Tensor], + output: Sequence[Tensor], + op: _int, + streams: Optional[Sequence[_CudaStreamBase]], + comms: Optional[Sequence[object]]) -> None: ... +def _nccl_broadcast(input: Sequence[Tensor], + root: _int, + streams: Optional[Sequence[_CudaStreamBase]], + comms: Optional[Sequence[object]]) -> None: ... +def _nccl_all_gather(input: Sequence[Tensor], + output: Sequence[Tensor], + streams: Optional[Sequence[_CudaStreamBase]], + comms: Optional[Sequence[object]]) -> None: ... +def _nccl_reduce_scatter(input: Sequence[Tensor], + output: Sequence[Tensor], + op: _int, + streams: Optional[Sequence[_CudaStreamBase]], + comms: Optional[Sequence[object]]) -> None: ... + class _CudaDeviceProperties: name: str @@ -515,6 +570,7 @@ class _CudaDeviceProperties: # Defined in torch/csrc/cuda/Stream.cpp class _CudaStreamBase: + _cdata: _int device: _device cuda_stream: _int priority: _int @@ -653,6 +709,8 @@ class EnumType(JitType): class TensorType(JitType): @classmethod def get(cls) -> TensorType: ... + @classmethod + def getInferred(cls) -> TensorType: ... # Defined in torch/csrc/jit/python/python_tree_views.cpp class SourceRange: diff --git a/torch/__init__.py b/torch/__init__.py index 6523ab126c0d..1ca766fa77ca 100644 --- a/torch/__init__.py +++ b/torch/__init__.py @@ -12,6 +12,7 @@ import os import sys import platform +import textwrap import ctypes if sys.version_info < (3,): @@ -193,6 +194,31 @@ def _load_global_deps(): if TYPE_CHECKING: import torch._C as _C +# Check to see if we can load C extensions, and if not provide some guidance +# on what the problem might be. +try: + # _initExtension is chosen (arbitrarily) as a sentinel. + from torch._C import _initExtension +except ImportError: + import torch._C as _C_for_compiled_check + + # The __file__ check only works for Python 3.7 and above. + if sys.version_info >= (3, 7) and _C_for_compiled_check.__file__ is None: + raise ImportError(textwrap.dedent(''' + Failed to load PyTorch C extensions: + It appears that PyTorch has loaded the `torch/_C` folder + of the PyTorch repository rather than the C extensions which + are expected in the `torch._C` namespace. This can occur when + using the `install` workflow. e.g. + $ python setup.py install && python -c "import torch" + + This error can generally be solved using the `develop` workflow + $ python setup.py develop && python -c "import torch" # This should succeed + or by running Python from a different directory. + ''').strip()) from None + raise # If __file__ is not None the cause is unknown, so just re-raise. + + __all__ += [name for name in dir(_C) if name[0] != '_' and not name.endswith('Base')] @@ -300,14 +326,16 @@ def set_default_dtype(d): _C._set_default_dtype(d) def set_deterministic(d): - r""" Sets whether native PyTorch operations must use deterministic - algorithms. When True, operations without deterministic algorithms - will throw a :class:RuntimeError when called. + r""" Sets whether PyTorch operations must use "deterministic" + algorithms. That is, algorithms which, given the same input, and when + run on the same software and hardware, always produce the same output. + When True, operations will use deterministic algorithms when available, + and if only nondeterministic algorithms are available they will throw a + :class:RuntimeError when called. .. 
warning:: - This feature is a beta feature, so it does not affect every - nondeterministic operation yet. The following operations are - affected by this flag. + This feature is in beta, and its design and implementation may change + in the future. The following normally-nondeterministic operations will act deterministically when `d=True`: @@ -439,11 +467,13 @@ class QInt8Storage(_C.QInt8StorageBase, _StorageBase): class QInt32Storage(_C.QInt32StorageBase, _StorageBase): pass +class QUInt4x2Storage(_C.QUInt4x2StorageBase, _StorageBase): + pass _storage_classes = { DoubleStorage, FloatStorage, LongStorage, IntStorage, ShortStorage, CharStorage, ByteStorage, HalfStorage, BoolStorage, QUInt8Storage, QInt8Storage, - QInt32Storage, BFloat16Storage, ComplexFloatStorage, ComplexDoubleStorage + QInt32Storage, BFloat16Storage, ComplexFloatStorage, ComplexDoubleStorage, QUInt4x2Storage } # The _tensor_classes set is initialized by the call to _C._initialize_tensor_type_bindings() @@ -477,9 +507,9 @@ def manager_path(): # is not a good way to fix this problem. Perhaps, try to redesign VariableFunctions # so that this import is good enough if TYPE_CHECKING: - # Some type signatures pulled in from _VariableFunctions here clash with + # Some type signatures pulled in from _VariableFunctions here clash with # signatures already imported. For now these clashes are ignored; see - # PR #43339 for details. + # PR #43339 for details. from torch._C._VariableFunctions import * # type: ignore for name in dir(_C._VariableFunctions): @@ -512,6 +542,7 @@ def manager_path(): del BFloat16StorageBase del ComplexDoubleStorageBase del ComplexFloatStorageBase +del QUInt4x2StorageBase ################################################################################ # Import most common subpackages @@ -526,6 +557,7 @@ def manager_path(): import torch.nn.intrinsic import torch.nn.quantized import torch.optim +import torch.optim._multi_tensor import torch.multiprocessing import torch.sparse import torch.utils.backcompat @@ -586,3 +618,12 @@ def compiled_with_cxx11_abi(): # class usage. We add these lines here to preserve backward compatbility. quantized_lstm = torch.ops.aten.quantized_lstm quantized_gru = torch.ops.aten.quantized_gru + +from .overrides import has_torch_function, handle_torch_function + +def Assert(condition, message): + r"""A wrapper around Python's assert which is symbolically traceable. 
+ """ + if type(condition) is not torch.Tensor and has_torch_function((condition,)): + return handle_torch_function(Assert, (condition,), condition, message) + assert condition, message diff --git a/torch/_jit_internal.py b/torch/_jit_internal.py index 5fa2ee639a9f..e9fb21c5e854 100644 --- a/torch/_jit_internal.py +++ b/torch/_jit_internal.py @@ -390,6 +390,15 @@ def forward(self, x): # exception raised m(torch.rand(100)) """ + if isinstance(fn, property): + prop = fn + setattr(prop.fget, "_torchscript_modifier", FunctionModifiers.UNUSED) # noqa: B010 + + if prop.fset: + setattr(prop.fset, "_torchscript_modifier", FunctionModifiers.UNUSED) # noqa: B010 + + return prop + fn._torchscript_modifier = FunctionModifiers.UNUSED return fn diff --git a/torch/_lobpcg.py b/torch/_lobpcg.py index b0cbf45b252b..ec0ad81dced0 100644 --- a/torch/_lobpcg.py +++ b/torch/_lobpcg.py @@ -13,23 +13,343 @@ __all__ = ['lobpcg'] +def _symeig_backward_complete_eigenspace(D_grad, U_grad, A, D, U): + # compute F, such that F_ij = (d_j - d_i)^{-1} for i != j, F_ii = 0 + F = D.unsqueeze(-2) - D.unsqueeze(-1) + F.diagonal(dim1=-2, dim2=-1).fill_(float('inf')) + F.pow_(-1) + + # A.grad = U (D.grad + (U^T U.grad * F)) U^T + Ut = U.transpose(-1, -2).contiguous() + res = torch.matmul( + U, + torch.matmul( + torch.diag_embed(D_grad) + torch.matmul(Ut, U_grad) * F, + Ut + ) + ) + + return res + + +def _polynomial_coefficients_given_roots(roots): + """ + Given the `roots` of a polynomial, find the polynomial's coefficients. + + If roots = (r_1, ..., r_n), then the method returns + coefficients (a_0, a_1, ..., a_n (== 1)) so that + p(x) = (x - r_1) * ... * (x - r_n) + = x^n + a_{n-1} * x^{n-1} + ... a_1 * x_1 + a_0 + + Note: for better performance requires writing a low-level kernel + """ + poly_order = roots.shape[-1] + poly_coeffs_shape = list(roots.shape) + # we assume p(x) = x^n + a_{n-1} * x^{n-1} + ... + a_1 * x + a_0, + # so poly_coeffs = {a_0, ..., a_n, a_{n+1}(== 1)}, + # but we insert one extra coefficient to enable better vectorization below + poly_coeffs_shape[-1] += 2 + poly_coeffs = roots.new_zeros(poly_coeffs_shape) + poly_coeffs[..., 0] = 1 + poly_coeffs[..., -1] = 1 + + # perform the Horner's rule + for i in range(1, poly_order + 1): + # note that it is computationally hard to compute backward for this method, + # because then given the coefficients it would require finding the roots and/or + # calculating the sensitivity based on the Vieta's theorem. + # So the code below tries to circumvent the explicit root finding by series + # of operations on memory copies imitating the Horner's method. + # The memory copies are required to construct nodes in the computational graph + # by exploting the explicit (not in-place, separate node for each step) + # recursion of the Horner's method. + # Needs more memory, O(... * k^2), but with only O(... * k^2) complexity. + poly_coeffs_new = poly_coeffs.clone() if roots.requires_grad else poly_coeffs + out = poly_coeffs_new.narrow(-1, poly_order - i, i + 1) + out -= roots.narrow(-1, i - 1, 1) * poly_coeffs.narrow(-1, poly_order - i + 1, i + 1) + poly_coeffs = poly_coeffs_new + + return poly_coeffs.narrow(-1, 1, poly_order + 1) + + +def _polynomial_value(poly, x, zero_power, transition): + """ + A generic method for computing poly(x) using the Horner's rule. 
-def lobpcg(A, # type: Tensor - k=None, # type: Optional[int] - B=None, # type: Optional[Tensor] - X=None, # type: Optional[Tensor] - n=None, # type: Optional[int] - iK=None, # type: Optional[Tensor] - niter=None, # type: Optional[int] - tol=None, # type: Optional[float] - largest=None, # type: Optional[bool] - method=None, # type: Optional[str] - tracker=None, # type: Optional[None] - ortho_iparams=None, # type: Optional[Dict[str, int]] - ortho_fparams=None, # type: Optional[Dict[str, float]] - ortho_bparams=None, # type: Optional[Dict[str, bool]] - ): - # type: (...) -> Tuple[Tensor, Tensor] + Arguments: + poly (Tensor): the (possibly batched) 1D Tensor representing + polynomial coefficients such that + poly[..., i] = (a_{i_0}, ..., a{i_n} (==1)), and + poly(x) = poly[..., 0] * zero_power + ... + poly[..., n] * x^n + + x (Tensor): the value (possibly batched) to evaluate the polynomial `poly` at. + + zero_power (Tensor): the representation of `x^0`. It is application-specific. + + transition (Callable): the function that accepts some intermediate result `int_val`, + the `x` and a specific polynomial coefficient + `poly[..., k]` for some iteration `k`. + It basically performs one iteration of the Horner's rule + defined as `x * int_val + poly[..., k] * zero_power`. + Note that `zero_power` is not a parameter, + because the step `+ poly[..., k] * zero_power` depends on `x`, + whether it is a vector, a matrix, or something else, so this + functionality is delegated to the user. + """ + + res = zero_power.clone() + for k in range(poly.size(-1) - 2, -1, -1): + res = transition(res, x, poly[..., k]) + return res + +def _matrix_polynomial_value(poly, x, zero_power=None): + """ + Evaluates `poly(x)` for the (batched) matrix input `x`. + Check out `_polynomial_value` function for more details. + """ + + # matrix-aware Horner's rule iteration + def transition(curr_poly_val, x, poly_coeff): + res = x.matmul(curr_poly_val) + res.diagonal(dim1=-2, dim2=-1).add_(poly_coeff.unsqueeze(-1)) + return res + + if zero_power is None: + zero_power = torch.eye(x.size(-1), x.size(-1), dtype=x.dtype, device=x.device) \ + .view(*([1] * len(list(x.shape[:-2]))), x.size(-1), x.size(-1)) + + return _polynomial_value(poly, x, zero_power, transition) + +def _vector_polynomial_value(poly, x, zero_power=None): + """ + Evaluates `poly(x)` for the (batched) vector input `x`. + Check out `_polynomial_value` function for more details. + """ + + # vector-aware Horner's rule iteration + def transition(curr_poly_val, x, poly_coeff): + res = torch.addcmul(poly_coeff.unsqueeze(-1), x, curr_poly_val) + return res + + if zero_power is None: + zero_power = x.new_ones(1).expand(x.shape) + + return _polynomial_value(poly, x, zero_power, transition) + +def _symeig_backward_partial_eigenspace(D_grad, U_grad, A, D, U, largest): + # compute a projection operator onto an orthogonal subspace spanned by the + # columns of U defined as (I - UU^T) + Ut = U.transpose(-2, -1).contiguous() + proj_U_ortho = -U.matmul(Ut) + proj_U_ortho.diagonal(dim1=-2, dim2=-1).add_(1) + + # compute U_ortho, a basis for the orthogonal complement to the span(U), + # by projecting a random [..., m, m - k] matrix onto the subspace spanned + # by the columns of U.
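To make the conventions above concrete: ``_polynomial_coefficients_given_roots`` returns ascending coefficients with a leading 1 (roots (1, 2) give x^2 - 3x + 2, i.e. (2, -3, 1)), and each Horner ``transition`` is just ``res = x * res + a_k``. A tiny pure-Python cross-check (the helper names below are illustrative, not part of the patch)::

    def coeffs_from_roots(roots):
        # Expand prod_j (x - r_j); coeffs[i] is the coefficient of x**i.
        coeffs = [1.0]
        for r in roots:
            shifted = [0.0] + coeffs                   # x * p(x)
            scaled = [-r * c for c in coeffs] + [0.0]  # -r * p(x)
            coeffs = [a + b for a, b in zip(shifted, scaled)]
        return coeffs

    def horner_eval(coeffs, x):
        # Unlike _polynomial_value above, this does not assume a monic polynomial,
        # so the accumulator starts at 0 rather than at `zero_power`.
        res = 0.0
        for a in reversed(coeffs):
            res = x * res + a                          # one "transition" step
        return res

    c = coeffs_from_roots([1.0, 2.0])
    assert c == [2.0, -3.0, 1.0]                       # x**2 - 3*x + 2
    assert horner_eval(c, 5.0) == 12.0                 # (5 - 1) * (5 - 2)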
+ # + # fix generator for determinism + gen = torch.Generator(A.device) + + # orthogonal complement to the span(U) + U_ortho = proj_U_ortho.matmul( + torch.randn( + (*A.shape[:-1], A.size(-1) - D.size(-1)), + dtype=A.dtype, + device=A.device, + generator=gen + ) + ) + U_ortho_t = U_ortho.transpose(-2, -1).contiguous() + + # compute the coefficients of the characteristic polynomial of the tensor D. + # Note that D is diagonal, so the diagonal elements are exactly the roots + # of the characteristic polynomial. + chr_poly_D = _polynomial_coefficients_given_roots(D) + + # the code belows finds the explicit solution to the Sylvester equation + # U_ortho^T A U_ortho dX - dX D = -U_ortho^T A U + # and incorporates it into the whole gradient stored in the `res` variable. + # + # Equivalent to the following naive implementation: + # res = A.new_zeros(A.shape) + # p_res = A.new_zeros(*A.shape[:-1], D.size(-1)) + # for k in range(1, chr_poly_D.size(-1)): + # p_res.zero_() + # for i in range(0, k): + # p_res += (A.matrix_power(k - 1 - i) @ U_grad) * D.pow(i).unsqueeze(-2) + # res -= chr_poly_D[k] * (U_ortho @ poly_D_at_A.inverse() @ U_ortho_t @ p_res @ U.t()) + # + # Note that dX is a differential, so the gradient contribution comes from the backward sensitivity + # Tr(f(U_grad, D_grad, A, U, D)^T dX) = Tr(g(U_grad, A, U, D)^T dA) for some functions f and g, + # and we need to compute g(U_grad, A, U, D) + # + # The naive implementation is based on the paper + # Hu, Qingxi, and Daizhan Cheng. + # "The polynomial solution to the Sylvester matrix equation." + # Applied mathematics letters 19.9 (2006): 859-864. + # + # We can modify the computation of `p_res` from above in a more efficient way + # p_res = U_grad * (chr_poly_D[1] * D.pow(0) + ... + chr_poly_D[k] * D.pow(k)).unsqueeze(-2) + # + A U_grad * (chr_poly_D[2] * D.pow(0) + ... + chr_poly_D[k] * D.pow(k - 1)).unsqueeze(-2) + # + ... + # + A.matrix_power(k - 1) U_grad * chr_poly_D[k] + # Note that this saves us from redundant matrix products with A (elimination of matrix_power) + U_grad_projected = U_grad + series_acc = U_grad_projected.new_zeros(U_grad_projected.shape) + for k in range(1, chr_poly_D.size(-1)): + poly_D = _vector_polynomial_value(chr_poly_D[..., k:], D) + series_acc += U_grad_projected * poly_D.unsqueeze(-2) + U_grad_projected = A.matmul(U_grad_projected) + + # compute chr_poly_D(A) which essentially is: + # + # chr_poly_D_at_A = A.new_zeros(A.shape) + # for k in range(chr_poly_D.size(-1)): + # chr_poly_D_at_A += chr_poly_D[k] * A.matrix_power(k) + # + # Note, however, for better performance we use the Horner's rule + chr_poly_D_at_A = _matrix_polynomial_value(chr_poly_D, A) + + # compute the action of `chr_poly_D_at_A` restricted to U_ortho_t + chr_poly_D_at_A_to_U_ortho = torch.matmul( + U_ortho_t, + torch.matmul( + chr_poly_D_at_A, + U_ortho + ) + ) + # we need to invert 'chr_poly_D_at_A_to_U_ortho`, for that we compute its + # Cholesky decomposition and then use `torch.cholesky_solve` for better stability. + # Cholesky decomposition requires the input to be positive-definite. + # Note that `chr_poly_D_at_A_to_U_ortho` is positive-definite if + # 1. `largest` == False, or + # 2. `largest` == True and `k` is even + # under the assumption that `A` has distinct eigenvalues. 
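One building block used above, ``proj_U_ortho = I - U U^T``, is an orthogonal projector onto the complement of span(U) whenever U has orthonormal columns; a quick numerical sanity check (sizes arbitrary)::

    import torch

    m, k = 6, 3
    U, _ = torch.qr(torch.randn(m, k))       # orthonormal columns (reduced QR)
    proj = torch.eye(m) - U @ U.t()          # same operator the backward builds in-place

    assert torch.allclose(proj @ U, torch.zeros(m, k), atol=1e-6)  # annihilates span(U)
    assert torch.allclose(proj @ proj, proj, atol=1e-6)            # idempotent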
+ # + # check if `chr_poly_D_at_A_to_U_ortho` is positive-definite or negative-definite + chr_poly_D_at_A_to_U_ortho_sign = -1 if (largest and (k % 2 == 1)) else +1 + chr_poly_D_at_A_to_U_ortho_L = torch.cholesky( + chr_poly_D_at_A_to_U_ortho_sign * chr_poly_D_at_A_to_U_ortho + ) + + # compute the gradient part in span(U) + res = _symeig_backward_complete_eigenspace( + D_grad, U_grad, A, D, U + ) + + # incorporate the Sylvester equation solution into the full gradient + # it resides in span(U_ortho) + res -= U_ortho.matmul( + chr_poly_D_at_A_to_U_ortho_sign * torch.cholesky_solve( + U_ortho_t.matmul(series_acc), + chr_poly_D_at_A_to_U_ortho_L + ) + ).matmul(Ut) + + return res + +def _symeig_backward(D_grad, U_grad, A, D, U, largest): + # if `U` is square, then the columns of `U` is a complete eigenspace + if U.size(-1) == U.size(-2): + return _symeig_backward_complete_eigenspace( + D_grad, U_grad, A, D, U + ) + else: + return _symeig_backward_partial_eigenspace( + D_grad, U_grad, A, D, U, largest + ) + +class LOBPCGAutogradFunction(torch.autograd.Function): + + @staticmethod + def forward(ctx, + A: Tensor, + k: Optional[int] = None, + B: Optional[Tensor] = None, + X: Optional[Tensor] = None, + n: Optional[int] = None, + iK: Optional[Tensor] = None, + niter: Optional[int] = None, + tol: Optional[float] = None, + largest: Optional[bool] = None, + method: Optional[str] = None, + tracker: Optional[None] = None, + ortho_iparams: Optional[Dict[str, int]] = None, + ortho_fparams: Optional[Dict[str, float]] = None, + ortho_bparams: Optional[Dict[str, bool]] = None + ) -> Tuple[Tensor, Tensor]: + + # makes sure that input is contiguous for efficiency. + # Note: autograd does not support dense gradients for sparse input yet. + A = A.contiguous() if (not A.is_sparse) else A + if B is not None: + B = B.contiguous() if (not B.is_sparse) else B + + D, U = _lobpcg( + A, k, B, X, + n, iK, niter, tol, largest, method, tracker, + ortho_iparams, ortho_fparams, ortho_bparams + ) + + ctx.save_for_backward(A, B, D, U, largest) + + return D, U + + @staticmethod + def backward(ctx, D_grad, U_grad): + A_grad = B_grad = None + grads = [None] * 14 + + A, B, D, U, largest = ctx.saved_tensors + + # lobpcg.backward has some limitations. Checks for unsupported input + if A.is_sparse or (B is not None and B.is_sparse and ctx.needs_input_grad[2]): + raise ValueError( + 'lobpcg.backward does not support sparse input yet.' + 'Note that lobpcg.forward does though.' + ) + if A.dtype in (torch.complex64, torch.complex128) or \ + B is not None and B.dtype in (torch.complex64, torch.complex128): + raise ValueError( + 'lobpcg.backward does not support complex input yet.' + 'Note that lobpcg.forward does though.' + ) + if B is not None: + raise ValueError( + 'lobpcg.backward does not support backward with B != I yet.' 
+ ) + + if largest is None: + largest = True + + # symeig backward + if B is None: + A_grad = _symeig_backward( + D_grad, U_grad, A, D, U, largest + ) + + # A has index 0 + grads[0] = A_grad + # B has index 2 + grads[2] = B_grad + return tuple(grads) + + +def lobpcg(A: Tensor, + k: Optional[int] = None, + B: Optional[Tensor] = None, + X: Optional[Tensor] = None, + n: Optional[int] = None, + iK: Optional[Tensor] = None, + niter: Optional[int] = None, + tol: Optional[float] = None, + largest: Optional[bool] = None, + method: Optional[str] = None, + tracker: Optional[None] = None, + ortho_iparams: Optional[Dict[str, int]] = None, + ortho_fparams: Optional[Dict[str, float]] = None, + ortho_bparams: Optional[Dict[str, bool]] = None + ) -> Tuple[Tensor, Tensor]: """Find the k largest (or smallest) eigenvalues and the corresponding eigenvectors of a symmetric positive defined generalized @@ -53,6 +373,17 @@ def lobpcg(A, # type: Tensor not recommended but there exist cases where the usage of the basic method may be preferred. + .. warning:: The backward method does not support sparse and complex inputs. + It works only when `B` is not provided (i.e. `B == None`). + We are actively working on extensions, and the details of + the algorithms are going to be published promptly. + + .. warning:: While it is assumed that `A` is symmetric, `A.grad` is not. + To make sure that `A.grad` is symmetric, so that `A - t * A.grad` is symmetric + in first-order optimization routines, prior to running `lobpcg` + we do the following symmetrization map: `A -> (A + A.t()) / 2`. + The map is performed only when the `A` requires gradients. + Arguments: A (Tensor): the input tensor of size :math:`(*, m, m)` @@ -175,6 +506,51 @@ def lobpcg(A, # type: Tensor ortho_fparams=ortho_fparams, ortho_bparams=ortho_bparams) + if not torch._jit_internal.is_scripting(): + if A.requires_grad or (B is not None and B.requires_grad): + # While it is expected that `A` is symmetric, + # the `A_grad` might be not. Therefore we perform the trick below, + # so that `A_grad` becomes symmetric. + # The symmetrization is important for first-order optimization methods, + # so that (A - alpha * A_grad) is still a symmetric matrix. + # Same holds for `B`. + A_sym = (A + A.transpose(-2, -1)) / 2 + B_sym = (B + B.transpose(-2, -1)) / 2 if (B is not None) else None + + return LOBPCGAutogradFunction.apply( + A_sym, k, B_sym, X, n, iK, niter, tol, largest, + method, tracker, ortho_iparams, ortho_fparams, ortho_bparams + ) + else: + if A.requires_grad or (B is not None and B.requires_grad): + raise RuntimeError( + 'Script and require grads is not supported atm.' 
+ 'If you just want to do the forward, use .detach()' + 'on A and B before calling into lobpcg' + ) + + return _lobpcg( + A, k, B, X, + n, iK, niter, tol, largest, method, tracker, + ortho_iparams, ortho_fparams, ortho_bparams + ) + +def _lobpcg(A: Tensor, + k: Optional[int] = None, + B: Optional[Tensor] = None, + X: Optional[Tensor] = None, + n: Optional[int] = None, + iK: Optional[Tensor] = None, + niter: Optional[int] = None, + tol: Optional[float] = None, + largest: Optional[bool] = None, + method: Optional[str] = None, + tracker: Optional[None] = None, + ortho_iparams: Optional[Dict[str, int]] = None, + ortho_fparams: Optional[Dict[str, float]] = None, + ortho_bparams: Optional[Dict[str, bool]] = None + ) -> Tuple[Tensor, Tensor]: + # A must be square: assert A.shape[-2] == A.shape[-1], A.shape if B is not None: diff --git a/torch/_tensor_docs.py b/torch/_tensor_docs.py index 2a83aeca0de8..7caceff4a1d1 100644 --- a/torch/_tensor_docs.py +++ b/torch/_tensor_docs.py @@ -2342,6 +2342,18 @@ def callable(a, b) -> number Alias for :meth:`~Tensor.dim()` """) +add_docstr_all('nan_to_num', r""" +nan_to_num(nan=0.0, posinf=None, neginf=None) -> Tensor + +See :func:`torch.nan_to_num`. +""") + +add_docstr_all('nan_to_num_', r""" +nan_to_num_(nan=0.0, posinf=None, neginf=None) -> Tensor + +In-place version of :meth:`~Tensor.nan_to_num`. +""") + add_docstr_all('ne', r""" ne(other) -> Tensor @@ -3121,6 +3133,20 @@ def callable(a, b) -> number See :func:`torch.signbit` """) +add_docstr_all('sgn', + r""" +sgn() -> Tensor + +See :func:`torch.sgn` +""") + +add_docstr_all('sgn_', + r""" +sgn_() -> Tensor + +In-place version of :meth:`~Tensor.sgn` +""") + add_docstr_all('sin', r""" sin() -> Tensor diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index d0f6f8c92151..6c641c3df140 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -515,6 +515,12 @@ def merge_dicts(*dicts): For inputs of type `FloatTensor` or `DoubleTensor`, arguments :attr:`beta` and :attr:`alpha` must be real numbers, otherwise they should be integers +.. warning:: + This function is deprecated and may be removed in a future release. + It can be implemented using :func:`torch.outer` as + ``alpha * torch.outer(vec1, vec2) + beta * input`` when :attr:`beta` is not zero, + and as ``alpha * torch.outer(vec1, vec2)`` when :attr:`beta` is zero. + Args: input (Tensor): matrix to be added vec1 (Tensor): the first vector of the outer product @@ -2734,20 +2740,27 @@ def merge_dicts(*dicts): tensor([-1., 1., -1., -1.]) """.format(**common_args)) -add_docstr(torch.floor_divide, - r""" +add_docstr(torch.floor_divide, r""" floor_divide(input, other, *, out=None) -> Tensor -Return the division of the inputs rounded down to the nearest integer. See :func:`torch.div` -for type promotion and broadcasting rules. +.. warning:: + This function's name is a misnomer. It actually rounds the + quotient towards zero instead of taking its floor. This behavior + will be deprecated in a future PyTorch release. + +Computes :attr:`input` divided by :attr:`other`, elementwise, and rounds each +quotient towards zero. Equivalently, it truncates the quotient(s): .. math:: - \text{{out}}_i = \left\lfloor \frac{{\text{{input}}_i}}{{\text{{other}}_i}} \right\rfloor + \text{{out}}_i = \text{trunc} \left( \frac{{\text{{input}}_i}}{{\text{{other}}_i}} \right) """ + r""" + +Supports broadcasting to a common shape, type promotion, and integer and float inputs. 
+ Args: - input (Tensor): the numerator tensor - other (Tensor or Scalar): the denominator + input (Tensor or Number): the dividend + other (Tensor or Number): the divisor Keyword args: {out} @@ -2982,13 +2995,6 @@ def merge_dicts(*dicts): add_docstr(torch.outer, r""" outer(input, vec2, *, out=None) -> Tensor -Alias of :func:`torch.ger`. -""") - -add_docstr(torch.ger, - r""" -ger(input, vec2, *, out=None) -> Tensor - Outer product of :attr:`input` and :attr:`vec2`. If :attr:`input` is a vector of size :math:`n` and :attr:`vec2` is a vector of size :math:`m`, then :attr:`out` must be a matrix of size :math:`(n \times m)`. @@ -3006,13 +3012,24 @@ def merge_dicts(*dicts): >>> v1 = torch.arange(1., 5.) >>> v2 = torch.arange(1., 4.) - >>> torch.ger(v1, v2) + >>> torch.outer(v1, v2) tensor([[ 1., 2., 3.], [ 2., 4., 6.], [ 3., 6., 9.], [ 4., 8., 12.]]) """) +add_docstr(torch.ger, + r""" +ger(input, vec2, *, out=None) -> Tensor + +Alias of :func:`torch.outer`. + +.. warning:: + This function is deprecated and will be removed in a future PyTorch release. + Use :func:`torch.outer` instead. +""") + add_docstr(torch.solve, r""" torch.solve(input, A, *, out=None) -> (Tensor, Tensor) @@ -4947,8 +4964,14 @@ def merge_dicts(*dicts): 1 is appended to its dimension for the purpose of the batched matrix multiple and removed after. The non-matrix (i.e. batch) dimensions are :ref:`broadcasted ` (and thus must be broadcastable). For example, if :attr:`input` is a + :math:`(j \times 1 \times n \times n)` tensor and :attr:`other` is a :math:`(k \times n \times n)` + tensor, :attr:`out` will be a :math:`(j \times k \times n \times n)` tensor. + + Note that the broadcasting logic only looks at the batch dimensions when determining if the inputs + are broadcastable, and not the matrix dimensions. For example, if :attr:`input` is a :math:`(j \times 1 \times n \times m)` tensor and :attr:`other` is a :math:`(k \times m \times p)` - tensor, :attr:`out` will be an :math:`(j \times k \times n \times p)` tensor. + tensor, these inputs are valid for broadcasting even though the final two dimensions (i.e. the + matrix dimensions) are different. :attr:`out` will be a :math:`(j \times k \times n \times p)` tensor. {tf32_note} @@ -5278,6 +5301,41 @@ def merge_dicts(*dicts): [ 8, 9]]) """) +add_docstr(torch.nan_to_num, + r""" +nan_to_num(input, nan=0.0, posinf=None, neginf=None, *, out=None) -> Tensor + +Replaces :literal:`NaN`, positive infinity, and negative infinity values in :attr:`input` +with the values specified by :attr:`nan`, :attr:`posinf`, and :attr:`neginf`, respectively. +By default, :literal:`NaN`s are replaced with zero, positive infinity is replaced with the +greatest finite value representable by :attr:`input`'s dtype, and negative infinity +is replaced with the least finite value representable by :attr:`input`'s dtype. + +Args: + {input} + nan (Number, optional): the value to replace :literal:`NaN`\s with. Default is zero. + posinf (Number, optional): if a Number, the value to replace positive infinity values with. + If None, positive infinity values are replaced with the greatest finite value representable by :attr:`input`'s dtype. + Default is None. + neginf (Number, optional): if a Number, the value to replace negative infinity values with. + If None, negative infinity values are replaced with the lowest finite value representable by :attr:`input`'s dtype. + Default is None. 
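The truncation behaviour called out in the ``floor_divide`` warning only differs from a true floor for negative quotients; a minimal illustration::

    import torch

    a = torch.tensor([7.0, -7.0])
    b = torch.tensor([2.0, 2.0])

    print(torch.floor_divide(a, b))   # tensor([ 3., -3.])  -- rounds toward zero
    print(torch.floor(a / b))         # tensor([ 3., -4.])  -- an actual floor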
+ +Keyword args: + {out} + +Example:: + + >>> x = torch.tensor([float('nan'), float('inf'), -float('inf'), 3.14]) + >>> torch.nan_to_num(x) + tensor([ 0.0000e+00, 3.4028e+38, -3.4028e+38, 3.1400e+00]) + >>> torch.nan_to_num(x, nan=2.0) + tensor([ 2.0000e+00, 3.4028e+38, -3.4028e+38, 3.1400e+00]) + >>> torch.nan_to_num(x, nan=2.0, posinf=1.0) + tensor([ 2.0000e+00, 1.0000e+00, -3.4028e+38, 3.1400e+00]) + +""".format(**common_args)) + add_docstr(torch.ne, r""" ne(input, other, *, out=None) -> Tensor @@ -5648,7 +5706,7 @@ def merge_dicts(*dicts): add_docstr(torch.poisson, r""" -poisson(input *, generator=None) -> Tensor +poisson(input, generator=None) -> Tensor Returns a tensor of the same size as :attr:`input` with each element sampled from a Poisson distribution with rate parameter given by the corresponding @@ -5847,7 +5905,7 @@ def merge_dicts(*dicts): add_docstr(torch.qr, r""" -qr(input, some=True, out=None) -> (Tensor, Tensor) +qr(input, some=True, *, out=None) -> (Tensor, Tensor) Computes the QR decomposition of a matrix or a batch of matrices :attr:`input`, and returns a namedtuple (Q, R) of tensors such that :math:`\text{input} = Q R` @@ -5875,6 +5933,8 @@ def merge_dicts(*dicts): batch dimensions consisting of matrices of dimension :math:`m \times n`. some (bool, optional): Set to ``True`` for reduced QR decomposition and ``False`` for complete QR decomposition. + +Keyword args: out (tuple, optional): tuple of `Q` and `R` tensors satisfying :code:`input = torch.matmul(Q, R)`. The dimensions of `Q` and `R` are :math:`(*, m, k)` and :math:`(*, k, n)` @@ -5911,7 +5971,7 @@ def merge_dicts(*dicts): add_docstr(torch.rad2deg, r""" -rad2deg(input, out=None) -> Tensor +rad2deg(input, *, out=None) -> Tensor Returns a new tensor with each of the elements of :attr:`input` converted from angles in radians to degrees. @@ -5934,7 +5994,7 @@ def merge_dicts(*dicts): add_docstr(torch.deg2rad, r""" -deg2rad(input, out=None) -> Tensor +deg2rad(input, *, out=None) -> Tensor Returns a new tensor with each of the elements of :attr:`input` converted from angles in degrees to radians. @@ -5991,7 +6051,7 @@ def merge_dicts(*dicts): add_docstr(torch.rand, r""" -rand(*size, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor +rand(*size, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor Returns a tensor filled with random numbers from a uniform distribution on the interval :math:`[0, 1)` @@ -6001,6 +6061,8 @@ def merge_dicts(*dicts): Args: size (int...): a sequence of integers defining the shape of the output tensor. Can be a variable number of arguments or a collection like a list or tuple. + +Keyword args: {out} {dtype} {layout} @@ -6018,7 +6080,7 @@ def merge_dicts(*dicts): add_docstr(torch.rand_like, r""" -rand_like(input, dtype=None, layout=None, device=None, requires_grad=False, memory_format=torch.preserve_format) -> Tensor +rand_like(input, *, dtype=None, layout=None, device=None, requires_grad=False, memory_format=torch.preserve_format) -> Tensor Returns a tensor with the same size as :attr:`input` that is filled with random numbers from a uniform distribution on the interval :math:`[0, 1)`. @@ -6027,6 +6089,8 @@ def merge_dicts(*dicts): Args: {input} + +Keyword args: {dtype} {layout} {device} @@ -6053,6 +6117,8 @@ def merge_dicts(*dicts): low (int, optional): Lowest integer to be drawn from the distribution. Default: 0. high (int): One above the highest integer to be drawn from the distribution. 
size (tuple): a tuple defining the shape of the output tensor. + +Keyword args: {generator} {out} {dtype} @@ -6080,7 +6146,7 @@ def merge_dicts(*dicts): add_docstr(torch.randint_like, """ -randint_like(input, low=0, high, dtype=None, layout=torch.strided, device=None, requires_grad=False, \ +randint_like(input, low=0, high, \\*, dtype=None, layout=torch.strided, device=None, requires_grad=False, \ memory_format=torch.preserve_format) -> Tensor Returns a tensor with the same shape as Tensor :attr:`input` filled with @@ -6095,6 +6161,8 @@ def merge_dicts(*dicts): {input} low (int, optional): Lowest integer to be drawn from the distribution. Default: 0. high (int): One above the highest integer to be drawn from the distribution. + +Keyword args: {dtype} {layout} {device} @@ -6105,7 +6173,7 @@ def merge_dicts(*dicts): add_docstr(torch.randn, r""" -randn(*size, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor +randn(*size, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor Returns a tensor filled with random numbers from a normal distribution with mean `0` and variance `1` (also called the standard normal @@ -6119,6 +6187,8 @@ def merge_dicts(*dicts): Args: size (int...): a sequence of integers defining the shape of the output tensor. Can be a variable number of arguments or a collection like a list or tuple. + +Keyword args: {out} {dtype} {layout} @@ -6136,7 +6206,7 @@ def merge_dicts(*dicts): add_docstr(torch.randn_like, r""" -randn_like(input, dtype=None, layout=None, device=None, requires_grad=False, memory_format=torch.preserve_format) -> Tensor +randn_like(input, *, dtype=None, layout=None, device=None, requires_grad=False, memory_format=torch.preserve_format) -> Tensor Returns a tensor with the same size as :attr:`input` that is filled with random numbers from a normal distribution with mean 0 and variance 1. @@ -6145,6 +6215,8 @@ def merge_dicts(*dicts): Args: {input} + +Keyword args: {dtype} {layout} {device} @@ -6155,12 +6227,14 @@ def merge_dicts(*dicts): add_docstr(torch.randperm, r""" -randperm(n, out=None, dtype=torch.int64, layout=torch.strided, device=None, requires_grad=False) -> LongTensor +randperm(n, *, out=None, dtype=torch.int64, layout=torch.strided, device=None, requires_grad=False) -> LongTensor Returns a random permutation of integers from ``0`` to ``n - 1``. Args: n (int): the upper bound (exclusive) + +Keyword args: {out} dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. Default: ``torch.int64``. @@ -6176,7 +6250,7 @@ def merge_dicts(*dicts): add_docstr(torch.tensor, r""" -tensor(data, dtype=None, device=None, requires_grad=False, pin_memory=False) -> Tensor +tensor(data, *, dtype=None, device=None, requires_grad=False, pin_memory=False) -> Tensor Constructs a tensor with :attr:`data`. @@ -6197,6 +6271,8 @@ def merge_dicts(*dicts): Args: {data} + +Keyword args: {dtype} {device} {requires_grad} @@ -6227,7 +6303,7 @@ def merge_dicts(*dicts): add_docstr(torch.range, r""" -range(start=0, end, step=1, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor +range(start=0, end, step=1, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor Returns a 1-D tensor of size :math:`\left\lfloor \frac{\text{end} - \text{start}}{\text{step}} \right\rfloor + 1` with values from :attr:`start` to :attr:`end` with step :attr:`step`. 
Step is @@ -6244,6 +6320,8 @@ def merge_dicts(*dicts): start (float): the starting value for the set of points. Default: ``0``. end (float): the ending value for the set of points step (float): the gap between each pair of adjacent points. Default: ``1``. + +Keyword args: {out} {dtype} If `dtype` is not given, infer the data type from the other input arguments. If any of `start`, `end`, or `stop` are floating-point, the @@ -6264,7 +6342,7 @@ def merge_dicts(*dicts): add_docstr(torch.arange, r""" -arange(start=0, end, step=1, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor +arange(start=0, end, step=1, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor Returns a 1-D tensor of size :math:`\left\lceil \frac{\text{end} - \text{start}}{\text{step}} \right\rceil` with values from the interval ``[start, end)`` taken with common difference @@ -6281,6 +6359,8 @@ def merge_dicts(*dicts): start (Number): the starting value for the set of points. Default: ``0``. end (Number): the ending value for the set of points step (Number): the gap between each pair of adjacent points. Default: ``1``. + +Keyword args: {out} {dtype} If `dtype` is not given, infer the data type from the other input arguments. If any of `start`, `end`, or `stop` are floating-point, the @@ -6303,7 +6383,7 @@ def merge_dicts(*dicts): add_docstr(torch.remainder, r""" -remainder(input, other, out=None) -> Tensor +remainder(input, other, *, out=None) -> Tensor Computes the element-wise remainder of division. @@ -6317,6 +6397,8 @@ def merge_dicts(*dicts): input (Tensor): the dividend other (Tensor or float): the divisor that may be either a number or a Tensor of the same shape as the dividend + +Keyword args: {out} Example:: @@ -6334,7 +6416,7 @@ def merge_dicts(*dicts): add_docstr(torch.renorm, r""" -renorm(input, p, dim, maxnorm, out=None) -> Tensor +renorm(input, p, dim, maxnorm, *, out=None) -> Tensor Returns a tensor where each sub-tensor of :attr:`input` along dimension :attr:`dim` is normalized such that the `p`-norm of the sub-tensor is lower @@ -6347,6 +6429,8 @@ def merge_dicts(*dicts): p (float): the power for the norm computation dim (int): the dimension to slice over to get the sub-tensors maxnorm (float): the maximum norm to keep each sub-tensor under + +Keyword args: {out} Example:: @@ -6420,13 +6504,15 @@ def merge_dicts(*dicts): add_docstr(torch.round, r""" -round(input, out=None) -> Tensor +round(input, *, out=None) -> Tensor Returns a new tensor with each of the elements of :attr:`input` rounded to the closest integer. Args: {input} + +Keyword args: {out} Example:: @@ -6440,7 +6526,7 @@ def merge_dicts(*dicts): add_docstr(torch.rsqrt, r""" -rsqrt(input, out=None) -> Tensor +rsqrt(input, *, out=None) -> Tensor Returns a new tensor with the reciprocal of the square-root of each of the elements of :attr:`input`. @@ -6450,6 +6536,8 @@ def merge_dicts(*dicts): """ + r""" Args: {input} + +Keyword args: {out} Example:: @@ -6533,7 +6621,7 @@ def merge_dicts(*dicts): add_docstr(torch.logit, r""" -logit(input, eps=None, out=None) -> Tensor +logit(input, eps=None, *, out=None) -> Tensor Returns a new tensor with the logit of the elements of :attr:`input`. :attr:`input` is clamped to [eps, 1 - eps] when eps is not None. @@ -6551,6 +6639,8 @@ def merge_dicts(*dicts): Args: {input} eps (float, optional): the epsilon for input clamp bound. 
Default: ``None`` + +Keyword args: {out} Example:: @@ -6564,7 +6654,7 @@ def merge_dicts(*dicts): add_docstr(torch.sign, r""" -sign(input, out=None) -> Tensor +sign(input, *, out=None) -> Tensor Returns a new tensor with the signs of the elements of :attr:`input`. @@ -6573,6 +6663,8 @@ def merge_dicts(*dicts): """ + r""" Args: {input} + +Keyword args: {out} Example:: @@ -6603,9 +6695,34 @@ def merge_dicts(*dicts): tensor([ False, True, False, False]) """.format(**common_args)) +add_docstr(torch.sgn, + r""" +sgn(input, *, out=None) -> Tensor + +For complex tensors, this function returns a new tensor whose elements have the same angle as that of the +elements of :attr:`input` and absolute value 1. For a non-complex tensor, this function +returns the signs of the elements of :attr:`input` (see :func:`torch.sign`). + +:math:`\text{out}_{i} = 0`, if :math:`|{\text{{input}}_i}| == 0` +:math:`\text{out}_{i} = \frac{{\text{{input}}_i}}{|{\text{{input}}_i}|}`, otherwise + +""" + r""" +Args: + {input} + +Keyword args: + {out} + +Example:: + + >>> x=torch.tensor([3+4j, 7-24j, 0, 1+2j]) + >>> x.sgn() + tensor([0.6000+0.8000j, 0.2800-0.9600j, 0.0000+0.0000j, 0.4472+0.8944j]) +""".format(**common_args)) + add_docstr(torch.sin, r""" -sin(input, out=None) -> Tensor +sin(input, *, out=None) -> Tensor Returns a new tensor with the sine of the elements of :attr:`input`. @@ -6614,6 +6731,8 @@ def merge_dicts(*dicts): """ + r""" Args: {input} + +Keyword args: {out} Example:: @@ -6627,7 +6746,7 @@ def merge_dicts(*dicts): add_docstr(torch.sinh, r""" -sinh(input, out=None) -> Tensor +sinh(input, *, out=None) -> Tensor Returns a new tensor with the hyperbolic sine of the elements of :attr:`input`. @@ -6637,6 +6756,8 @@ def merge_dicts(*dicts): """ + r""" Args: {input} + +Keyword args: {out} Example:: @@ -6650,7 +6771,7 @@ def merge_dicts(*dicts): add_docstr(torch.sort, r""" -sort(input, dim=-1, descending=False, out=None) -> (Tensor, LongTensor) +sort(input, dim=-1, descending=False, *, out=None) -> (Tensor, LongTensor) Sorts the elements of the :attr:`input` tensor along a given dimension in ascending order by value. @@ -6668,6 +6789,8 @@ def merge_dicts(*dicts): {input} dim (int, optional): the dimension to sort along descending (bool, optional): controls the sorting order (ascending or descending) + +Keyword args: out (tuple, optional): the output tuple of (`Tensor`, `LongTensor`) that can be optionally given to be used as output buffers @@ -6729,7 +6852,7 @@ def merge_dicts(*dicts): add_docstr(torch.sparse_coo_tensor, r""" -sparse_coo_tensor(indices, values, size=None, dtype=None, device=None, requires_grad=False) -> Tensor +sparse_coo_tensor(indices, values, size=None, *, dtype=None, device=None, requires_grad=False) -> Tensor Constructs a sparse tensors in COO(rdinate) format with non-zero elements at the given :attr:`indices` with the given :attr:`values`. A sparse tensor can be `uncoalesced`, in that case, there are duplicate @@ -6747,6 +6870,8 @@ def merge_dicts(*dicts): size (list, tuple, or :class:`torch.Size`, optional): Size of the sparse tensor. If not provided the size will be inferred as the minimum size big enough to hold all non-zero elements. + +Keyword args: dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. Default: if None, infers data type from :attr:`values`. device (:class:`torch.device`, optional): the desired device of returned tensor.
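The ``sgn`` definition above can be reproduced by hand for complex inputs; a short hedged check against the documented example values (this assumes a build where ``torch.sgn`` is available, as added elsewhere in this patch)::

    import torch

    x = torch.tensor([3 + 4j, 7 - 24j, 0j, 1 + 2j])
    mag = x.abs()
    safe_mag = torch.where(mag == 0, torch.ones_like(mag), mag)  # avoid 0/0; zeros stay zero
    manual = x / safe_mag                                        # input / |input|
    assert (manual - torch.sgn(x)).abs().max() < 1e-6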
@@ -6806,7 +6931,7 @@ def merge_dicts(*dicts): add_docstr(torch.sqrt, r""" -sqrt(input, out=None) -> Tensor +sqrt(input, *, out=None) -> Tensor Returns a new tensor with the square-root of the elements of :attr:`input`. @@ -6815,6 +6940,8 @@ def merge_dicts(*dicts): """ + r""" Args: {input} + +Keyword args: {out} Example:: @@ -6828,12 +6955,14 @@ def merge_dicts(*dicts): add_docstr(torch.square, r""" -square(input, out=None) -> Tensor +square(input, *, out=None) -> Tensor Returns a new tensor with the square of the elements of :attr:`input`. Args: {input} + +Keyword args: {out} Example:: @@ -6847,7 +6976,7 @@ def merge_dicts(*dicts): add_docstr(torch.squeeze, r""" -squeeze(input, dim=None, out=None) -> Tensor +squeeze(input, dim=None, *, out=None) -> Tensor Returns a tensor with all the dimensions of :attr:`input` of size `1` removed. @@ -6871,6 +7000,8 @@ def merge_dicts(*dicts): {input} dim (int, optional): if given, the input will be squeezed only in this dimension + +Keyword args: {out} Example:: @@ -7027,12 +7158,14 @@ def merge_dicts(*dicts): add_docstr(torch.sum, r""" -sum(input, dtype=None) -> Tensor +sum(input, *, dtype=None) -> Tensor Returns the sum of all elements in the :attr:`input` tensor. Args: {input} + +Keyword args: {dtype} Example:: @@ -7043,7 +7176,7 @@ def merge_dicts(*dicts): >>> torch.sum(a) tensor(-0.5475) -.. function:: sum(input, dim, keepdim=False, dtype=None) -> Tensor +.. function:: sum(input, dim, keepdim=False, *, dtype=None) -> Tensor Returns the sum of each row of the :attr:`input` tensor in the given dimension :attr:`dim`. If :attr:`dim` is a list of dimensions, @@ -7055,6 +7188,8 @@ def merge_dicts(*dicts): {input} {dim} {keepdim} + +Keyword args: {dtype} Example:: @@ -7074,7 +7209,7 @@ def merge_dicts(*dicts): add_docstr(torch.nansum, r""" -nansum(input, dtype=None) -> Tensor +nansum(input, *, dtype=None) -> Tensor Returns the sum of all elements, treating Not a Numbers (NaNs) as zero. @@ -7090,7 +7225,7 @@ def merge_dicts(*dicts): >>> torch.nansum(a) tensor(7.) -.. function:: nansum(input, dim, keepdim=False, dtype=None) -> Tensor +.. function:: nansum(input, dim, keepdim=False, *, dtype=None) -> Tensor Returns the sum of each row of the :attr:`input` tensor in the given dimension :attr:`dim`, treating Not a Numbers (NaNs) as zero. @@ -7121,7 +7256,7 @@ def merge_dicts(*dicts): add_docstr(torch.svd, r""" -svd(input, some=True, compute_uv=True, out=None) -> (Tensor, Tensor, Tensor) +svd(input, some=True, compute_uv=True, *, out=None) -> (Tensor, Tensor, Tensor) This function returns a namedtuple ``(U, S, V)`` which is the singular value decomposition of a input real matrix or batches of real matrices :attr:`input` such that @@ -7163,6 +7298,8 @@ def merge_dicts(*dicts): batch dimensions consisting of :math:`m \times n` matrices. some (bool, optional): controls the shape of returned `U` and `V` compute_uv (bool, optional): option whether to compute `U` and `V` or not + +Keyword args: out (tuple, optional): the output tuple of tensors Example:: @@ -7197,7 +7334,7 @@ def merge_dicts(*dicts): add_docstr(torch.symeig, r""" -symeig(input, eigenvectors=False, upper=True, out=None) -> (Tensor, Tensor) +symeig(input, eigenvectors=False, upper=True, *, out=None) -> (Tensor, Tensor) This function returns eigenvalues and eigenvectors of a real symmetric matrix :attr:`input` or a batch of real symmetric matrices, @@ -7232,6 +7369,8 @@ def merge_dicts(*dicts): batch dimensions consisting of symmetric matrices. 
eigenvectors(boolean, optional): controls whether eigenvectors have to be computed upper(boolean, optional): controls whether to consider upper-triangular or lower-triangular region + +Keyword args: out (tuple, optional): the output tuple of (Tensor, Tensor) Returns: @@ -7484,7 +7623,7 @@ def merge_dicts(*dicts): add_docstr(torch.tan, r""" -tan(input, out=None) -> Tensor +tan(input, *, out=None) -> Tensor Returns a new tensor with the tangent of the elements of :attr:`input`. @@ -7493,6 +7632,8 @@ def merge_dicts(*dicts): """ + r""" Args: {input} + +Keyword args: {out} Example:: @@ -7506,7 +7647,7 @@ def merge_dicts(*dicts): add_docstr(torch.tanh, r""" -tanh(input, out=None) -> Tensor +tanh(input, *, out=None) -> Tensor Returns a new tensor with the hyperbolic tangent of the elements of :attr:`input`. @@ -7516,6 +7657,8 @@ def merge_dicts(*dicts): """ + r""" Args: {input} + +Keyword args: {out} Example:: @@ -7529,7 +7672,7 @@ def merge_dicts(*dicts): add_docstr(torch.topk, r""" -topk(input, k, dim=None, largest=True, sorted=True, out=None) -> (Tensor, LongTensor) +topk(input, k, dim=None, largest=True, sorted=True, *, out=None) -> (Tensor, LongTensor) Returns the :attr:`k` largest elements of the given :attr:`input` tensor along a given dimension. @@ -7552,6 +7695,8 @@ def merge_dicts(*dicts): smallest elements sorted (bool, optional): controls whether to return the elements in sorted order + +Keyword args: out (tuple, optional): the output tuple of (Tensor, LongTensor) that can be optionally given to be used as output buffers @@ -7661,7 +7806,7 @@ def merge_dicts(*dicts): add_docstr(torch.tril, r""" -tril(input, diagonal=0, out=None) -> Tensor +tril(input, diagonal=0, *, out=None) -> Tensor Returns the lower triangular part of the matrix (2-D tensor) or batch of matrices :attr:`input`, the other elements of the result tensor :attr:`out` are set to 0. @@ -7680,6 +7825,8 @@ def merge_dicts(*dicts): Args: {input} diagonal (int, optional): the diagonal to consider + +Keyword args: {out} Example:: @@ -7716,7 +7863,7 @@ def merge_dicts(*dicts): # as common args. add_docstr(torch.tril_indices, r""" -tril_indices(row, col, offset=0, dtype=torch.long, device='cpu', layout=torch.strided) -> Tensor +tril_indices(row, col, offset=0, *, dtype=torch.long, device='cpu', layout=torch.strided) -> Tensor Returns the indices of the lower triangular part of a :attr:`row`-by- :attr:`col` matrix in a 2-by-N Tensor, where the first row contains row @@ -7743,6 +7890,8 @@ def merge_dicts(*dicts): col (``int``): number of columns in the 2-D matrix. offset (``int``): diagonal offset from the main diagonal. Default: if not provided, 0. + +Keyword args: dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. Default: if ``None``, ``torch.long``. {device} @@ -7767,7 +7916,7 @@ def merge_dicts(*dicts): add_docstr(torch.triu, r""" -triu(input, diagonal=0, out=None) -> Tensor +triu(input, diagonal=0, *, out=None) -> Tensor Returns the upper triangular part of a matrix (2-D tensor) or batch of matrices :attr:`input`, the other elements of the result tensor :attr:`out` are set to 0. @@ -7786,6 +7935,8 @@ def merge_dicts(*dicts): Args: {input} diagonal (int, optional): the diagonal to consider + +Keyword args: {out} Example:: @@ -7830,7 +7981,7 @@ def merge_dicts(*dicts): # as common args. 
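Many of the signature tweaks in this stretch only mark ``out`` (and similar arguments) as keyword-only in the rendered docs; a short usage sketch passing a preallocated output pair to ``topk`` by keyword (shapes arbitrary)::

    import torch

    x = torch.randn(10)
    values = torch.empty(3)
    indices = torch.empty(3, dtype=torch.long)

    torch.topk(x, 3, out=(values, indices))   # `out` given by keyword, per the updated docs
    print(values, indices)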
add_docstr(torch.triu_indices, r""" -triu_indices(row, col, offset=0, dtype=torch.long, device='cpu', layout=torch.strided) -> Tensor +triu_indices(row, col, offset=0, *, dtype=torch.long, device='cpu', layout=torch.strided) -> Tensor Returns the indices of the upper triangular part of a :attr:`row` by :attr:`col` matrix in a 2-by-N Tensor, where the first row contains row @@ -7857,6 +8008,8 @@ def merge_dicts(*dicts): col (``int``): number of columns in the 2-D matrix. offset (``int``): diagonal offset from the main diagonal. Default: if not provided, 0. + +Keyword args: dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. Default: if ``None``, ``torch.long``. {device} @@ -7887,13 +8040,15 @@ def merge_dicts(*dicts): add_docstr(torch.trunc, r""" -trunc(input, out=None) -> Tensor +trunc(input, *, out=None) -> Tensor Returns a new tensor with the truncated integer values of the elements of :attr:`input`. Args: {input} + +Keyword args: {out} Example:: @@ -8044,7 +8199,7 @@ def merge_dicts(*dicts): add_docstr(torch.zeros, r""" -zeros(*size, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor +zeros(*size, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor Returns a tensor filled with the scalar value `0`, with the shape defined by the variable argument :attr:`size`. @@ -8052,6 +8207,8 @@ def merge_dicts(*dicts): Args: size (int...): a sequence of integers defining the shape of the output tensor. Can be a variable number of arguments or a collection like a list or tuple. + +Keyword args: {out} {dtype} {layout} @@ -8070,7 +8227,7 @@ def merge_dicts(*dicts): add_docstr(torch.zeros_like, r""" -zeros_like(input, dtype=None, layout=None, device=None, requires_grad=False, memory_format=torch.preserve_format) -> Tensor +zeros_like(input, *, dtype=None, layout=None, device=None, requires_grad=False, memory_format=torch.preserve_format) -> Tensor Returns a tensor filled with the scalar value `0`, with the same size as :attr:`input`. ``torch.zeros_like(input)`` is equivalent to @@ -8083,6 +8240,8 @@ def merge_dicts(*dicts): Args: {input} + +Keyword args: {dtype} {layout} {device} @@ -8099,7 +8258,7 @@ def merge_dicts(*dicts): add_docstr(torch.empty, r""" -empty(*size, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor +empty(*size, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, pin_memory=False) -> Tensor Returns a tensor filled with uninitialized data. The shape of the tensor is defined by the variable argument :attr:`size`. @@ -8107,6 +8266,8 @@ def merge_dicts(*dicts): Args: size (int...): a sequence of integers defining the shape of the output tensor. Can be a variable number of arguments or a collection like a list or tuple. + +Keyword args: {out} {dtype} {layout} @@ -8126,7 +8287,7 @@ def merge_dicts(*dicts): add_docstr(torch.empty_like, r""" -empty_like(input, dtype=None, layout=None, device=None, requires_grad=False, memory_format=torch.preserve_format) -> Tensor +empty_like(input, *, dtype=None, layout=None, device=None, requires_grad=False, memory_format=torch.preserve_format) -> Tensor Returns an uninitialized tensor with the same size as :attr:`input`. 
``torch.empty_like(input)`` is equivalent to @@ -8134,6 +8295,8 @@ def merge_dicts(*dicts): Args: {input} + +Keyword args: {dtype} {layout} {device} @@ -8149,7 +8312,7 @@ def merge_dicts(*dicts): add_docstr(torch.empty_strided, r""" -empty_strided(size, stride, dtype=None, layout=None, device=None, requires_grad=False, pin_memory=False) -> Tensor +empty_strided(size, stride, *, dtype=None, layout=None, device=None, requires_grad=False, pin_memory=False) -> Tensor Returns a tensor filled with uninitialized data. The shape and strides of the tensor is defined by the variable argument :attr:`size` and :attr:`stride` respectively. @@ -8165,6 +8328,8 @@ def merge_dicts(*dicts): Args: size (tuple of ints): the shape of the output tensor stride (tuple of ints): the strides of the output tensor + +Keyword args: {dtype} {layout} {device} @@ -8193,6 +8358,8 @@ def merge_dicts(*dicts): size (int...): a list, tuple, or :class:`torch.Size` of integers defining the shape of the output tensor. fill_value (Scalar): the value to fill the output tensor with. + +Keyword args: {out} {dtype} {layout} @@ -8208,7 +8375,7 @@ def merge_dicts(*dicts): add_docstr(torch.full_like, """ -full_like(input, fill_value, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, \ +full_like(input, fill_value, \\*, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, \ memory_format=torch.preserve_format) -> Tensor Returns a tensor with the same size as :attr:`input` filled with :attr:`fill_value`. @@ -8218,6 +8385,8 @@ def merge_dicts(*dicts): Args: {input} fill_value: the number to fill the output tensor with. + +Keyword args: {dtype} {layout} {device} @@ -8463,7 +8632,13 @@ def merge_dicts(*dicts): add_docstr(torch.fft, r""" fft(input, signal_ndim, normalized=False) -> Tensor -Complex-to-complex Discrete Fourier Transform +Complex-to-complex Discrete Fourier Transform. + +.. warning:: + The function :func:`torch.fft` is deprecated and will be removed in + PyTorch 1.8. Use the new :ref:`torch.fft ` module + functions, instead, by importing :ref:`torch.fft ` and + calling :func:`torch.fft.fft` or :func:`torch.fft.fftn`. This method computes the complex-to-complex discrete Fourier transform. Ignoring the batch dimensions, it computes the following expression: @@ -8563,11 +8738,16 @@ def merge_dicts(*dicts): """) -add_docstr(torch.ifft, - r""" +add_docstr(torch.ifft, r""" ifft(input, signal_ndim, normalized=False) -> Tensor -Complex-to-complex Inverse Discrete Fourier Transform +Complex-to-complex Inverse Discrete Fourier Transform. + +.. warning:: + The function :func:`torch.ifft` is deprecated and will be removed in a + future PyTorch release. Use the new :ref:`torch.fft ` + module functions, instead, by importing :ref:`torch.fft ` + and calling :func:`torch.fft.ifft` or :func:`torch.fft.ifftn`. This method computes the complex-to-complex inverse discrete Fourier transform. Ignoring the batch dimensions, it computes the following @@ -8648,11 +8828,17 @@ def merge_dicts(*dicts): """) -add_docstr(torch.rfft, - r""" +add_docstr(torch.rfft, r""" rfft(input, signal_ndim, normalized=False, onesided=True) -> Tensor -Real-to-complex Discrete Fourier Transform +Real-to-complex Discrete Fourier Transform. + +.. warning:: + The function :func:`torch.rfft` is deprecated and will be removed in a + future PyTorch release. 
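The new warning above deprecates the function form `torch.fft(input, signal_ndim)` in favour of the `torch.fft` module. A minimal sketch of the usage the warning points to (per its wording, the module is imported explicitly and its functions are called directly):

```python
import torch
import torch.fft  # import the module and call its functions, as the warning suggests

t = torch.randn(16, 16, dtype=torch.complex64)
y1 = torch.fft.fft(t[0])   # 1-D complex-to-complex transform
yn = torch.fft.fftn(t)     # N-D transform, replacing torch.fft(input, signal_ndim=2)
```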
Use the new :ref:`torch.fft ` + module functions, instead, by importing :ref:`torch.fft ` + and calling :func:`torch.fft.rfft` for one-sided output, or + :func:`torch.fft.fft` for two-sided output. This method computes the real-to-complex discrete Fourier transform. It is mathematically equivalent with :func:`~torch.fft` with differences only in @@ -8717,11 +8903,17 @@ def merge_dicts(*dicts): """) -add_docstr(torch.irfft, - r""" +add_docstr(torch.irfft, r""" irfft(input, signal_ndim, normalized=False, onesided=True, signal_sizes=None) -> Tensor -Complex-to-real Inverse Discrete Fourier Transform +Complex-to-real Inverse Discrete Fourier Transform. + +.. warning:: + The function :func:`torch.irfft` is deprecated and will be removed in a + future PyTorch release. Use the new :ref:`torch.fft ` + module functions, instead, by importing :ref:`torch.fft ` + and calling :func:`torch.fft.irfft` for one-sided input, or + :func:`torch.fft.ifft` for two-sided input. This method computes the complex-to-real inverse discrete Fourier transform. It is mathematically equivalent with :func:`ifft` with differences only in @@ -8820,7 +9012,7 @@ def merge_dicts(*dicts): add_docstr(torch.hann_window, """ -hann_window(window_length, periodic=True, dtype=None, \ +hann_window(window_length, periodic=True, *, dtype=None, \ layout=torch.strided, device=None, requires_grad=False) -> Tensor """ + r""" Hann window function. @@ -8847,6 +9039,8 @@ def merge_dicts(*dicts): window_length (int): the size of returned window periodic (bool, optional): If True, returns a window to be used as periodic function. If False, return a symmetric window. + +Keyword args: {dtype} Only floating point types are supported. layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only ``torch.strided`` (dense layout) is supported. @@ -8861,7 +9055,7 @@ def merge_dicts(*dicts): add_docstr(torch.hamming_window, """ -hamming_window(window_length, periodic=True, alpha=0.54, beta=0.46, dtype=None, \ +hamming_window(window_length, periodic=True, alpha=0.54, beta=0.46, *, dtype=None, \ layout=torch.strided, device=None, requires_grad=False) -> Tensor """ + r""" Hamming window function. @@ -8892,6 +9086,8 @@ def merge_dicts(*dicts): function. If False, return a symmetric window. alpha (float, optional): The coefficient :math:`\alpha` in the equation above beta (float, optional): The coefficient :math:`\beta` in the equation above + +Keyword args: {dtype} Only floating point types are supported. layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only ``torch.strided`` (dense layout) is supported. @@ -8906,7 +9102,7 @@ def merge_dicts(*dicts): add_docstr(torch.bartlett_window, """ -bartlett_window(window_length, periodic=True, dtype=None, \ +bartlett_window(window_length, periodic=True, *, dtype=None, \ layout=torch.strided, device=None, requires_grad=False) -> Tensor """ + r""" Bartlett window function. @@ -8935,6 +9131,8 @@ def merge_dicts(*dicts): window_length (int): the size of returned window periodic (bool, optional): If True, returns a window to be used as periodic function. If False, return a symmetric window. + +Keyword args: {dtype} Only floating point types are supported. layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only ``torch.strided`` (dense layout) is supported. 
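Same migration pattern for the real-input transforms deprecated above: `torch.rfft`/`torch.irfft` point users at `torch.fft.rfft`/`torch.fft.irfft` for one-sided data, or `torch.fft.fft`/`torch.fft.ifft` for two-sided data. The window functions below also gain keyword-only `dtype`/`layout`/`device`. A short sketch:

```python
import torch
import torch.fft

x = torch.randn(400)                                            # real signal
w = torch.hann_window(400, periodic=True, dtype=torch.float64)  # dtype is keyword-only now

X = torch.fft.rfft(x)                     # one-sided spectrum (replaces torch.rfft(x, 1))
x_back = torch.fft.irfft(X, n=x.numel())  # back to the length-400 real signal (replaces torch.irfft)
```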
@@ -8949,7 +9147,7 @@ def merge_dicts(*dicts): add_docstr(torch.blackman_window, """ -blackman_window(window_length, periodic=True, dtype=None, \ +blackman_window(window_length, periodic=True, *, dtype=None, \ layout=torch.strided, device=None, requires_grad=False) -> Tensor """ + r""" Blackman window function. @@ -8975,6 +9173,8 @@ def merge_dicts(*dicts): window_length (int): the size of returned window periodic (bool, optional): If True, returns a window to be used as periodic function. If False, return a symmetric window. + +Keyword args: {dtype} Only floating point types are supported. layout (:class:`torch.layout`, optional): the desired layout of returned window tensor. Only ``torch.strided`` (dense layout) is supported. @@ -9001,7 +9201,7 @@ def merge_dicts(*dicts): out_i = I_0 \left( \beta \sqrt{1 - \left( {\frac{i - N/2}{N/2}} \right) ^2 } \right) / I_0( \beta ) Calling ``torch.kaiser_window(L, B, periodic=True)`` is equivalent to calling -``torch.kaiser_window(L + 1, B, periodic=False)[:-1])``. +``torch.kaiser_window(L + 1, B, periodic=False)[:-1])``. The :attr:`periodic` argument is intended as a helpful shorthand to produce a periodic window as input to functions like :func:`torch.stft`. @@ -9404,7 +9604,7 @@ def merge_dicts(*dicts): add_docstr(torch.searchsorted, r""" -searchsorted(sorted_sequence, values, out_int32=False, right=False, out=None) -> Tensor +searchsorted(sorted_sequence, values, *, out_int32=False, right=False, out=None) -> Tensor Find the indices from the *innermost* dimension of :attr:`sorted_sequence` such that, if the corresponding values in :attr:`values` were inserted before the indices, the order of the @@ -9422,21 +9622,23 @@ def merge_dicts(*dicts): - *returned index satisfies* * - 1-D - False - - ``sorted_sequence[i-1] <= values[m][n]...[l][x] < sorted_sequence[i]`` + - ``sorted_sequence[i-1] < values[m][n]...[l][x] <= sorted_sequence[i]`` * - 1-D - True - - ``sorted_sequence[i-1] < values[m][n]...[l][x] <= sorted_sequence[i]`` + - ``sorted_sequence[i-1] <= values[m][n]...[l][x] < sorted_sequence[i]`` * - N-D - False - - ``sorted_sequence[m][n]...[l][i-1] <= values[m][n]...[l][x] < sorted_sequence[m][n]...[l][i]`` + - ``sorted_sequence[m][n]...[l][i-1] < values[m][n]...[l][x] <= sorted_sequence[m][n]...[l][i]`` * - N-D - True - - ``sorted_sequence[m][n]...[l][i-1] < values[m][n]...[l][x] <= sorted_sequence[m][n]...[l][i]`` + - ``sorted_sequence[m][n]...[l][i-1] <= values[m][n]...[l][x] < sorted_sequence[m][n]...[l][i]`` Args: sorted_sequence (Tensor): N-D or 1-D tensor, containing monotonically increasing sequence on the *innermost* dimension. values (Tensor or Scalar): N-D tensor or a Scalar containing the search value(s). + +Keyword args: out_int32 (bool, optional): indicate the output data type. torch.int32 if True, torch.int64 otherwise. Default value is False, i.e. default output data type is torch.int64. right (bool, optional): if False, return the first suitable location that is found. If True, return the @@ -9479,7 +9681,7 @@ def merge_dicts(*dicts): add_docstr(torch.bucketize, r""" -bucketize(input, boundaries, out_int32=False, right=False, out=None) -> Tensor +bucketize(input, boundaries, *, out_int32=False, right=False, out=None) -> Tensor Returns the indices of the buckets to which each value in the :attr:`input` belongs, where the boundaries of the buckets are set by :attr:`boundaries`. 
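The corrected `searchsorted` table above now reads: `right=False` returns the first index `i` with `sorted_sequence[i-1] < v <= sorted_sequence[i]`, and `right=True` the half-open variant. A quick sketch with expected outputs worked out by hand from the corrected conditions:

```python
import torch

seq = torch.tensor([1, 3, 5, 7, 9])
vals = torch.tensor([3, 6, 9])

torch.searchsorted(seq, vals)               # right=False -> tensor([1, 3, 4])
torch.searchsorted(seq, vals, right=True)   # right=True  -> tensor([2, 3, 5])
```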
Return a new tensor with the same size @@ -9493,13 +9695,15 @@ def merge_dicts(*dicts): * - :attr:`right` - *returned index satisfies* * - False - - ``boundaries[i-1] <= input[m][n]...[l][x] < boundaries[i]`` - * - True - ``boundaries[i-1] < input[m][n]...[l][x] <= boundaries[i]`` + * - True + - ``boundaries[i-1] <= input[m][n]...[l][x] < boundaries[i]`` Args: input (Tensor or Scalar): N-D tensor or a Scalar containing the search value(s). boundaries (Tensor): 1-D tensor, must contain a monotonically increasing sequence. + +Keyword args: out_int32 (bool, optional): indicate the output data type. torch.int32 if True, torch.int64 otherwise. Default value is False, i.e. default output data type is torch.int64. right (bool, optional): if False, return the first suitable location that is found. If True, return the diff --git a/torch/autograd/functional.py b/torch/autograd/functional.py index 2a1d0ef55fd9..58e780c87d1b 100644 --- a/torch/autograd/functional.py +++ b/torch/autograd/functional.py @@ -381,15 +381,15 @@ def jacobian(func, inputs, create_graph=False, strict=False): Defaults to ``False``. Returns: - Jacobian (Tensor or nested tuple of Tensors): if there are a single - input and output, this will be a single Tensor containing the - Jacobian for the linearized inputs and output. If one of the two is - a tuple, then the Jacobian will be a tuple of Tensors. If both of - them are tuples, then the Jacobian will be a tuple of tuple of - Tensors where ``Jacobian[i][j]`` will contain the Jacobian of the - ``i``\th output and ``j``\th input and will have as size the - concatenation of the sizes of the corresponding output and the - corresponding input. + Jacobian (Tensor or nested tuple of Tensors): if there is a single + input and output, this will be a single Tensor containing the + Jacobian for the linearized inputs and output. If one of the two is + a tuple, then the Jacobian will be a tuple of Tensors. If both of + them are tuples, then the Jacobian will be a tuple of tuple of + Tensors where ``Jacobian[i][j]`` will contain the Jacobian of the + ``i``\th output and ``j``\th input and will have as size the + concatenation of the sizes of the corresponding output and the + corresponding input. Example: @@ -476,12 +476,12 @@ def hessian(func, inputs, create_graph=False, strict=False): Defaults to ``False``. Returns: - Hessian (Tensor or a tuple of tuple of Tensors) if there are a single input, - this will be a single Tensor containing the Hessian for the input. - If it is a tuple, then the Hessian will be a tuple of tuples where - ``Hessian[i][j]`` will contain the Hessian of the ``i``\th input - and ``j``\th input with size the sum of the size of the ``i``\th input plus - the size of the ``j``\th input. + Hessian (Tensor or a tuple of tuple of Tensors): if there is a single input, + this will be a single Tensor containing the Hessian for the input. + If it is a tuple, then the Hessian will be a tuple of tuples where + ``Hessian[i][j]`` will contain the Hessian of the ``i``\th input + and ``j``\th input with size the sum of the size of the ``i``\th input plus + the size of the ``j``\th input. Example: @@ -660,7 +660,9 @@ def hvp(func, inputs, v=None, create_graph=False, strict=False): hvp for said inputs, which is the expected mathematical value. Defaults to ``False``. 
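The `grad_mode` change above makes the decorator form build a fresh context manager per call (`self.__class__()`) instead of re-entering the single shared instance, so decorated functions and generators behave correctly when used re-entrantly. A minimal sketch of the decorator usage the change targets:

```python
import torch

x = torch.ones(1, requires_grad=True)

@torch.no_grad()              # each call now gets its own no_grad instance
def infer(t):
    return t * 2

print(infer(x).requires_grad)  # False: computed with grad disabled
print((x * 2).requires_grad)   # True: unaffected outside the decorated call
```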
Returns: - func_output (tuple of Tensors or Tensor): output of ``func(inputs)`` + output (tuple): tuple with: + func_output (tuple of Tensors or Tensor): output of ``func(inputs)`` + hvp (tuple of Tensors or Tensor): result of the dot product with the same shape as the inputs. diff --git a/torch/autograd/grad_mode.py b/torch/autograd/grad_mode.py index 4bcc3be1d85b..bbd96e941a54 100644 --- a/torch/autograd/grad_mode.py +++ b/torch/autograd/grad_mode.py @@ -22,7 +22,7 @@ def __call__(self, func: F) -> F: @functools.wraps(func) def decorate_context(*args, **kwargs): - with self: + with self.__class__(): return func(*args, **kwargs) return cast(F, decorate_context) @@ -33,7 +33,7 @@ def generator_context(*args, **kwargs): gen = func(*args, **kwargs) while True: try: - with self: + with self.__class__(): x = next(gen) yield x except StopIteration: diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py index 94b1aae844f1..8d33be090b27 100644 --- a/torch/autograd/profiler.py +++ b/torch/autograd/profiler.py @@ -105,6 +105,35 @@ def populate_cpu_children(self): self._cpu_children_populated = True + def set_backward_stacktraces(self): + self.populate_cpu_children() + + def bw_parent(evt): + if evt is None: + return None + elif evt.scope == 1: + return evt + else: + return bw_parent(evt.cpu_parent) + + fwd_stacks = {} + for evt in self: + if bw_parent(evt) is None: + t = (evt.sequence_nr, evt.thread) + if t not in fwd_stacks: + fwd_stacks[t] = evt.stack + + for evt in self: + p = bw_parent(evt) + if p is not None: + assert p.fwd_thread is not None + t = (p.sequence_nr, p.fwd_thread) + if t in fwd_stacks: + evt.stack = fwd_stacks[t] + else: + evt.stack = [] + + @property def self_cpu_time_total(self): return sum([event.self_cpu_time_total for event in self]) @@ -208,14 +237,17 @@ def export_chrome_trace(self, path): f.truncate() f.write("]") - def key_averages(self, group_by_input_shapes=False): + def key_averages(self, group_by_input_shapes=False, group_by_stack_n=0): """Averages all function events over their keys. - @param group_by_input_shapes The key would become - (event name, input dimensions) rather than just event name. - This is useful to see which dimensionality contributes to the runtime - the most and may help with dimension specific optimizations or - choosing best candidates for quantization (aka fitting a roof line) + Arguments: + group_by_input_shapes: group entries by + (event name, input shapes) rather than just event name. + This is useful to see which input shapes contribute to the runtime + the most and may help with size-specific optimizations or + choosing the best candidates for quantization (aka fitting a roof line) + + group_by_stack_n: group by top n stack trace entries Returns: An EventList containing FunctionEventAvg objects. 
@@ -223,14 +255,22 @@ def key_averages(self, group_by_input_shapes=False): self.populate_cpu_children() stats = defaultdict(FunctionEventAvg) - def get_key(event, group_by_input_shapes): - if not group_by_input_shapes: - return (event.key, event.node_id) - return (event.key, str(event.input_shapes), event.node_id) + def get_key(event, group_by_input_shapes, group_by_stack_n): + key = [str(event.key), str(event.node_id)] + if group_by_input_shapes: + key.append(str(event.input_shapes)) + if group_by_stack_n > 0: + key += event.stack[:group_by_stack_n] + return tuple(key) for evt in self: - stats[get_key(evt, group_by_input_shapes)].add( - evt, group_by_input_shapes) - return EventList(stats.values(), use_cuda=self._use_cuda, profile_memory=self._profile_memory) + stats[get_key(evt, group_by_input_shapes, group_by_stack_n)].add(evt) + + avg_list = EventList(stats.values(), use_cuda=self._use_cuda, profile_memory=self._profile_memory) + for evt in avg_list: + evt.stack = evt.stack[:group_by_stack_n] + if not group_by_input_shapes: + evt.input_shapes = "" + return avg_list def total_average(self): """Averages all events. @@ -274,8 +314,11 @@ class profile(object): profile_memory (bool, optional): Whether to report memory usage, default: ``False`` + with_stack (bool, optional): record source information (file and line number) for the ops + .. warning: - Enabling memory profiling incurs additional profiler overhead + Enabling memory profiling or source attribution incurs additional profiler + overhead .. warning: This context managers should not be called recursively, i.e. no nested @@ -311,7 +354,8 @@ def __init__( enabled=True, use_cuda=False, record_shapes=False, - profile_memory=False): + profile_memory=False, + with_stack=False): self.enabled = enabled self.use_cuda = use_cuda self.function_events = None @@ -320,6 +364,7 @@ def __init__( self.entered = False self.record_shapes = record_shapes self.profile_memory = profile_memory + self.with_stack = with_stack def __enter__(self): if not self.enabled: @@ -330,7 +375,11 @@ def __enter__(self): profiler_kind = torch.autograd.ProfilerState.CUDA if self.use_cuda \ else torch.autograd.ProfilerState.CPU - config = torch.autograd.ProfilerConfig(profiler_kind, self.record_shapes, self.profile_memory) + config = torch.autograd.ProfilerConfig( + profiler_kind, + self.record_shapes, + self.profile_memory, + self.with_stack) torch.autograd._enable_profiler(config) return self @@ -339,9 +388,11 @@ def __exit__(self, exc_type, exc_val, exc_tb): return records = torch.autograd._disable_profiler() self.function_events = EventList( - parse_cpu_trace(records), + parse_event_records(records), use_cuda=self.use_cuda, profile_memory=self.profile_memory) + if self.with_stack: + self.function_events.set_backward_stacktraces() return False def __repr__(self): @@ -373,9 +424,9 @@ def export_chrome_trace(self, path): return self.function_events.export_chrome_trace(path) export_chrome_trace.__doc__ = EventList.export_chrome_trace.__doc__ - def key_averages(self, group_by_input_shape=False): + def key_averages(self, group_by_input_shape=False, group_by_stack_n=0): self._check_finish() - return self.function_events.key_averages(group_by_input_shape) + return self.function_events.key_averages(group_by_input_shape, group_by_stack_n) key_averages.__doc__ = EventList.key_averages.__doc__ def total_average(self): @@ -568,8 +619,8 @@ def __enter__(self): torch.autograd.ProfilerConfig( torch.autograd.ProfilerState.NVTX, self.record_shapes, - False - ) + False, + False) ) 
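Putting the new profiler options together: `with_stack=True` records source locations for each op (with the documented extra overhead), and `key_averages` can now also group by the top-N stack entries. A usage sketch; the exact table layout depends on the build:

```python
import torch
from torch.autograd import profiler

x = torch.randn(32, 64, requires_grad=True)
w = torch.randn(64, 64, requires_grad=True)

with profiler.profile(record_shapes=True, with_stack=True) as prof:
    loss = (x @ w).relu().sum()
    loss.backward()

# group by (name, input shapes) as before, or by the top 5 stack frames
print(prof.key_averages(group_by_input_shape=True).table(sort_by="self_cpu_time_total"))
print(prof.key_averages(group_by_stack_n=5).table(sort_by="self_cpu_time_total"))
```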
return self @@ -639,6 +690,7 @@ class FormattedTimesMixin(object): cpu_time_total_str = attr_formatter('cpu_time_total') cuda_time_total_str = attr_formatter('cuda_time_total') self_cpu_time_total_str = attr_formatter('self_cpu_time_total') + self_cuda_time_total_str = attr_formatter('self_cuda_time_total') @property def cpu_time(self): @@ -664,19 +716,22 @@ def elapsed_us(self): class FunctionEvent(FormattedTimesMixin): """Profiling information about a single function.""" def __init__( - self, id, node_id, name, thread, cpu_start, cpu_end, input_shapes=None, - cpu_memory_usage=0, cuda_memory_usage=0, is_async=False, is_remote=True, - sequence_nr=-1): + self, id, node_id, name, thread, cpu_start, cpu_end, fwd_thread=None, input_shapes=None, + stack=None, scope=0, cpu_memory_usage=0, cuda_memory_usage=0, is_async=False, + is_remote=True, sequence_nr=-1): self.id = id self.node_id = node_id self.name = name self.cpu_interval = Interval(cpu_start, cpu_end) self.thread = thread + self.fwd_thread = fwd_thread self.kernels = [] self.count = 1 self.cpu_children = [] self.cpu_parent = None self.input_shapes = input_shapes + self.stack = stack + self.scope = scope self.cpu_memory_usage = cpu_memory_usage self.cuda_memory_usage = cuda_memory_usage self.is_async = is_async @@ -735,6 +790,11 @@ def self_cpu_time_total(self): def cuda_time_total(self): return sum(kinfo.interval.elapsed_us() for kinfo in self.kernels) + @property + def self_cuda_time_total(self): + return sum(kinfo.interval.elapsed_us() for kinfo in self.kernels) - \ + sum([child.cuda_time_total for child in self.cpu_children]) + @property def cpu_time_total(self): return self.cpu_interval.elapsed_us() @@ -778,7 +838,10 @@ def __init__(self): self.cpu_time_total = 0 self.cuda_time_total = 0 self.self_cpu_time_total = 0 + self.self_cuda_time_total = 0 self.input_shapes = None + self.stack = None + self.scope = None self.cpu_memory_usage = 0 self.cuda_memory_usage = 0 self.self_cpu_memory_usage = 0 @@ -786,7 +849,7 @@ def __init__(self): self.cpu_children = None self.cpu_parent = None - def add(self, other, group_by_input_shapes=False): + def add(self, other): if self.key is None: # First function being recorded as part of FunctionEventAvg, propagate # fields. 
@@ -796,18 +859,17 @@ def add(self, other, group_by_input_shapes=False): self.is_remote = other.is_remote self.cpu_parent = other.cpu_parent self.cpu_children = other.cpu_children - if group_by_input_shapes: - self.input_shapes = other.input_shapes - assert ( - not group_by_input_shapes or - other.input_shapes == self.input_shapes - ) + self.input_shapes = other.input_shapes + self.stack = other.stack + self.scope = other.scope + assert isinstance(other, (FunctionEvent, FunctionEventAvg)) assert other.key == self.key self.cpu_time_total += other.cpu_time_total self.cuda_time_total += other.cuda_time_total self.self_cpu_time_total += other.self_cpu_time_total + self.self_cuda_time_total += other.self_cuda_time_total self.cpu_memory_usage += other.cpu_memory_usage self.cuda_memory_usage += other.cuda_memory_usage self.self_cpu_memory_usage += other.self_cpu_memory_usage @@ -821,11 +883,12 @@ def __iadd__(self, other): def __repr__(self): return ( ' ' - 'cpu_memory_usage={} cuda_memory_usage={}'.format( + ' self_cuda_time={} cuda_time={} input_shapes={} ' + 'cpu_memory_usage={} cuda_memory_usage={}>'.format( self.key, self.self_cpu_time_total_str, self.cpu_time_str, + self.self_cuda_time_total_str, self.cuda_time_str, str(self.input_shapes), self.cpu_memory_usage, @@ -845,14 +908,10 @@ def __missing__(self, key): self[key] = torch._C._demangle(key) if len(key) > 1 else key return self[key] - -################################################################################ -# CPU checkpoints - -def parse_cpu_trace(thread_records): +def parse_event_records(thread_records): def get_record_key(record): """ - Returns a tuple to be used by parse_cpu_trace for correlating start and + Returns a tuple to be used by parse_event_records for correlating start and end records. """ return (record.handle(), record.node_id()) @@ -873,6 +932,17 @@ def get_record_key(record): "aten::_version", ] + def filter_stack_entry(entry): + filtered_entries = [ + ("autograd/__init__", "_make_grads"), + ("autograd/__init__", "backward"), + ("torch/tensor", "backward"), + ("_internal/common_utils", "prof_callable"), + ("_internal/common_utils", "prof_func_call"), + ("_internal/common_utils", "prof_meth_call"), + ] + return all([not (f[0] in entry and f[1] in entry) for f in filtered_entries]) + # cuda start events and the overall profiler start event don't happen # at exactly the same time because we need to record an event on each device # and each record takes ~4us. 
So we adjust here by the difference @@ -951,7 +1021,10 @@ def adjusted_time(cuda_record, cuda_records_map): thread=start.thread_id(), cpu_start=start_record.cpu_elapsed_us(start), cpu_end=start_record.cpu_elapsed_us(record), + fwd_thread=start.fwd_thread_id(), input_shapes=start.shapes(), + stack=[entry for entry in start.stack() if filter_stack_entry(entry)], + scope=start.scope(), cpu_memory_usage=cpu_memory_usage, cuda_memory_usage=cuda_memory_usage, is_async=is_async, @@ -1088,22 +1161,37 @@ def build_table( ), use_cuda=use_cuda, profile_memory=profile_memory) has_input_shapes = any( - [event.input_shapes is not None for event in events]) + [(event.input_shapes is not None and len(event.input_shapes) > 0) for event in events]) + name_column_width = max([len(evt.key) for evt in events]) + 4 - DEFAULT_COLUMN_WIDTH = 15 - SHAPES_COLUMN_WIDTH = 45 + + DEFAULT_COLUMN_WIDTH = 12 + + shapes_column_width = max([len(str(evt.input_shapes)) for evt in events]) + 4 + shapes_column_width = min(shapes_column_width, 45) + + src_column_width = None + stacks = [] + for evt in events: + if evt.stack is not None and len(evt.stack) > 0: + stacks.append(evt.stack) + has_stack = len(stacks) > 0 + if has_stack: + src_column_width = max([max([len(entry) for entry in stack]) for stack in stacks]) + 4 + src_column_width = min(src_column_width, 75) headers = [ 'Name', - 'Self CPU total %', - 'Self CPU total', + 'Self CPU %', + 'Self CPU', 'CPU total %', 'CPU total', 'CPU time avg', ] if use_cuda: headers.extend([ - 'CUDA total %', + 'Self CUDA', + 'Self CUDA %', 'CUDA total', 'CUDA time avg', ]) @@ -1118,7 +1206,7 @@ def build_table( 'Self CUDA Mem', ]) headers.append( - 'Number of Calls' + '# of Calls' ) # Only append Node ID if any event has a valid (>= 0) Node ID append_node_id = any([evt.node_id != -1 for evt in events]) @@ -1130,10 +1218,11 @@ def build_table( row_format = [""] header_sep = [""] line_length = [-SPACING_SIZE] + MAX_STACK_ENTRY = 5 - def add_column(padding): - row_format[0] += '{: <' + str(padding) + '} ' - header_sep[0] += '-' * padding + ' ' + def add_column(padding, text_dir='>'): + row_format[0] += '{: ' + text_dir + str(padding) + '}' + (' ' * SPACING_SIZE) + header_sep[0] += '-' * padding + (' ' * SPACING_SIZE) line_length[0] += padding + SPACING_SIZE add_column(name_column_width) @@ -1142,7 +1231,11 @@ def add_column(padding): if has_input_shapes: headers.append('Input Shapes') - add_column(SHAPES_COLUMN_WIDTH) + add_column(shapes_column_width) + + if has_stack: + headers.append('Source Location') + add_column(src_column_width, text_dir='<') row_format = row_format[0] header_sep = header_sep[0] @@ -1157,7 +1250,7 @@ def append(s): result.append('\n') # Yes, newline after the end as well self_cpu_time_total = sum([event.self_cpu_time_total for event in events]) - cuda_time_total = sum([evt.cuda_time_total for evt in events]) + cuda_time_total = sum([evt.self_cuda_time_total for evt in events]) # Actual printing if header is not None: append('=' * line_length) @@ -1191,8 +1284,9 @@ def append(s): ] if use_cuda: row_values.extend([ + evt.self_cuda_time_total_str, # CUDA time total % - format_time_share(evt.cuda_time_total, cuda_time_total), + format_time_share(evt.self_cuda_time_total, cuda_time_total), evt.cuda_time_total_str, evt.cuda_time_str, # Cuda time avg ]) @@ -1217,9 +1311,21 @@ def append(s): if append_node_id: row_values.append(evt.node_id) if has_input_shapes: - row_values.append(str(evt.input_shapes)[:SHAPES_COLUMN_WIDTH]) + 
row_values.append(str(evt.input_shapes)[:shapes_column_width]) + if has_stack: + src_field = "" + if len(evt.stack) > 0: + src_field = evt.stack[0][:src_column_width] + row_values.append(src_field) append(row_format.format(*row_values)) + if has_stack: + empty_headers = [""] * (len(headers) - 1) + for entry in evt.stack[1:MAX_STACK_ENTRY]: + append(row_format.format(*(empty_headers + [entry[:src_column_width]]))) + empty_headers.append("") + append(row_format.format(*empty_headers)) + append(header_sep) append("Self CPU time total: {}".format(format_time(self_cpu_time_total))) if use_cuda: diff --git a/torch/csrc/DynamicTypes.cpp b/torch/csrc/DynamicTypes.cpp index 6969ac0449c0..f7e48c3b682d 100644 --- a/torch/csrc/DynamicTypes.cpp +++ b/torch/csrc/DynamicTypes.cpp @@ -61,9 +61,8 @@ PyTypeObject* getPyTypeObject( const at::Storage& storage, const caffe2::TypeMeta& dtype) { at::ScalarType scalarType = at::typeMetaToScalarType(dtype); - at::TensorOptions options = at::TensorOptions(storage.device_type()).dtype(scalarType); auto attype = &at::getDeprecatedTypeProperties( - at::dispatchKeyToBackend(at::computeDispatchKey(options)), + at::dispatchKeyToBackend(c10::computeDispatchKey(scalarType, c10::nullopt, storage.device_type())), scalarType); auto it = attype_to_py_storage_type.find(attype); if (it != attype_to_py_storage_type.end()) { diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp index ed4aa21a8f76..6f61b5e0a2d9 100644 --- a/torch/csrc/Module.cpp +++ b/torch/csrc/Module.cpp @@ -61,6 +61,12 @@ #endif #endif +#if (defined(_WIN32) || defined(_WIN64) || defined(FBCODE_CAFFE2) || defined(C10_MOBILE)) +#define NVALGRIND +#else +#include +#endif + #define WITH_NUMPY_IMPORT_ARRAY #include @@ -127,6 +133,7 @@ static PyObject * THPModule_initExtension(PyObject *_unused, PyObject *shm_manag THPByteStorage_postInit(module); THPBoolStorage_postInit(module); THPQUInt8Storage_postInit(module); + THPQUInt4x2Storage_postInit(module); THPQInt8Storage_postInit(module); THPQInt32Storage_postInit(module); THPBFloat16Storage_postInit(module); @@ -527,12 +534,12 @@ PyObject *THPModule_setQEngine(PyObject */* unused */, PyObject *arg) Py_RETURN_NONE; } -PyObject *THPModule_qEngine(PyObject */* unused */) +PyObject *THPModule_qEngine(PyObject *_unused, PyObject *noargs) { return THPUtils_packInt64(static_cast(at::globalContext().qEngine())); } -PyObject *THPModule_supportedQEngines(PyObject */* unused */) +PyObject *THPModule_supportedQEngines(PyObject *_unused, PyObject *noargs) { auto qengines = at::globalContext().supportedQEngines(); auto list = THPObjectPtr(PyList_New(qengines.size())); @@ -546,7 +553,7 @@ PyObject *THPModule_supportedQEngines(PyObject */* unused */) return list.release(); } -PyObject *THPModule_isEnabledXNNPACK(PyObject * /* unused */) +PyObject *THPModule_isEnabledXNNPACK(PyObject *_unused, PyObject *noargs) { if (at::globalContext().isXNNPACKAvailable()) Py_RETURN_TRUE; else Py_RETURN_FALSE; @@ -567,52 +574,52 @@ static PyObject * THPModule_vmapmode_decrement_nesting(PyObject* _unused, PyObje //NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays, modernize-avoid-c-arrays) static PyMethodDef TorchMethods[] = { {"_initExtension", (PyCFunction)THPModule_initExtension, METH_O, nullptr}, - {"_autograd_init", (PyCFunction)THPAutograd_initExtension, METH_NOARGS, nullptr}, + {"_autograd_init", THPAutograd_initExtension, METH_NOARGS, nullptr}, {"_add_docstr", (PyCFunction)THPModule_addDocStr, METH_VARARGS, nullptr}, {"_init_names", (PyCFunction)THPModule_initNames, METH_O, nullptr}, - 
{"_has_distributed",(PyCFunction)THPModule_hasDistributed, METH_NOARGS, nullptr}, + {"_has_distributed",THPModule_hasDistributed, METH_NOARGS, nullptr}, {"_set_default_tensor_type", (PyCFunction)THPModule_setDefaultTensorType, METH_O, nullptr}, {"_set_default_dtype", (PyCFunction)THPModule_setDefaultDtype, METH_O, nullptr}, {"_infer_size", (PyCFunction)THPModule_inferSize, METH_VARARGS, nullptr}, {"_crash_if_csrc_asan", (PyCFunction)THPModule_crashIfCsrcASAN, METH_O, nullptr}, {"_crash_if_csrc_ubsan", (PyCFunction)THPModule_crashIfCsrcUBSAN, METH_O, nullptr}, {"_crash_if_aten_asan", (PyCFunction)THPModule_crashIfATenASAN, METH_O, nullptr}, - {"_show_config", (PyCFunction)THPModule_showConfig, METH_NOARGS, nullptr}, - {"_parallel_info", (PyCFunction)THPModule_parallelInfo, METH_NOARGS, nullptr}, + {"_show_config", THPModule_showConfig, METH_NOARGS, nullptr}, + {"_parallel_info", THPModule_parallelInfo, METH_NOARGS, nullptr}, {"_set_backcompat_broadcast_warn", (PyCFunction)THPModule_setBackcompatBroadcastWarn, METH_O, nullptr}, - {"_get_backcompat_broadcast_warn", (PyCFunction)THPModule_getBackcompatBroadcastWarn, METH_NOARGS, nullptr}, + {"_get_backcompat_broadcast_warn", THPModule_getBackcompatBroadcastWarn, METH_NOARGS, nullptr}, {"_set_backcompat_keepdim_warn", (PyCFunction)THPModule_setBackcompatKeepdimWarn, METH_O, nullptr}, - {"_get_backcompat_keepdim_warn", (PyCFunction)THPModule_getBackcompatKeepdimWarn, METH_NOARGS, nullptr}, - {"get_num_threads", (PyCFunction)THPModule_getNumThreads, METH_NOARGS, nullptr}, + {"_get_backcompat_keepdim_warn", THPModule_getBackcompatKeepdimWarn, METH_NOARGS, nullptr}, + {"get_num_threads", THPModule_getNumThreads, METH_NOARGS, nullptr}, {"set_num_threads", (PyCFunction)THPModule_setNumThreads, METH_O, nullptr}, - {"get_num_interop_threads", (PyCFunction)THPModule_getNumInteropThreads, METH_NOARGS, nullptr}, + {"get_num_interop_threads", THPModule_getNumInteropThreads, METH_NOARGS, nullptr}, {"set_num_interop_threads", (PyCFunction)THPModule_setNumInteropThreads, METH_O, nullptr}, - {"_get_cudnn_enabled", (PyCFunction)THPModule_userEnabledCuDNN, METH_NOARGS, nullptr}, + {"_get_cudnn_enabled", THPModule_userEnabledCuDNN, METH_NOARGS, nullptr}, {"_set_cudnn_enabled", (PyCFunction)THPModule_setUserEnabledCuDNN, METH_O, nullptr}, - {"_get_mkldnn_enabled", (PyCFunction)THPModule_userEnabledMkldnn, METH_NOARGS, nullptr}, + {"_get_mkldnn_enabled", THPModule_userEnabledMkldnn, METH_NOARGS, nullptr}, {"_set_mkldnn_enabled", (PyCFunction)THPModule_setUserEnabledMkldnn, METH_O, nullptr}, - {"_get_cudnn_allow_tf32", (PyCFunction)THPModule_allowTF32CuDNN, METH_NOARGS, nullptr}, + {"_get_cudnn_allow_tf32", THPModule_allowTF32CuDNN, METH_NOARGS, nullptr}, {"_set_cudnn_allow_tf32", (PyCFunction)THPModule_setAllowTF32CuDNN, METH_O, nullptr}, - {"_get_cudnn_benchmark", (PyCFunction)THPModule_benchmarkCuDNN, METH_NOARGS, nullptr}, + {"_get_cudnn_benchmark", THPModule_benchmarkCuDNN, METH_NOARGS, nullptr}, {"_set_cudnn_benchmark", (PyCFunction)THPModule_setBenchmarkCuDNN, METH_O, nullptr}, - {"_get_cudnn_deterministic", (PyCFunction)THPModule_deterministicCuDNN, METH_NOARGS, nullptr}, + {"_get_cudnn_deterministic", THPModule_deterministicCuDNN, METH_NOARGS, nullptr}, {"_set_cudnn_deterministic", (PyCFunction)THPModule_setDeterministicCuDNN, METH_O, nullptr}, - {"_get_deterministic", (PyCFunction)THPModule_deterministic, METH_NOARGS, nullptr}, + {"_get_deterministic", THPModule_deterministic, METH_NOARGS, nullptr}, {"_set_deterministic", 
(PyCFunction)THPModule_setDeterministic, METH_O, nullptr}, - {"_get_cublas_allow_tf32", (PyCFunction)THPModule_allowTF32CuBLAS, METH_NOARGS, nullptr}, + {"_get_cublas_allow_tf32", THPModule_allowTF32CuBLAS, METH_NOARGS, nullptr}, {"_set_cublas_allow_tf32", (PyCFunction)THPModule_setAllowTF32CuBLAS, METH_O, nullptr}, - {"_vmapmode_increment_nesting", (PyCFunction)THPModule_vmapmode_increment_nesting, METH_NOARGS, nullptr}, - {"_vmapmode_decrement_nesting", (PyCFunction)THPModule_vmapmode_decrement_nesting, METH_NOARGS, nullptr}, + {"_vmapmode_increment_nesting", THPModule_vmapmode_increment_nesting, METH_NOARGS, nullptr}, + {"_vmapmode_decrement_nesting", THPModule_vmapmode_decrement_nesting, METH_NOARGS, nullptr}, {"_to_dlpack", (PyCFunction)THPModule_toDLPack, METH_O, nullptr}, {"_from_dlpack", (PyCFunction)THPModule_fromDLPack, METH_O, nullptr}, {"set_flush_denormal", (PyCFunction)THPModule_setFlushDenormal, METH_O, nullptr}, - {"get_default_dtype", (PyCFunction)THPModule_getDefaultDtype, METH_NOARGS, nullptr}, - {"_get_default_device", (PyCFunction)THPModule_getDefaultDevice, METH_NOARGS, nullptr}, - {"_get_qengine", (PyCFunction)THPModule_qEngine, METH_NOARGS, nullptr}, + {"get_default_dtype", THPModule_getDefaultDtype, METH_NOARGS, nullptr}, + {"_get_default_device", THPModule_getDefaultDevice, METH_NOARGS, nullptr}, + {"_get_qengine", THPModule_qEngine, METH_NOARGS, nullptr}, {"_set_qengine", (PyCFunction)THPModule_setQEngine, METH_O, nullptr}, - {"_supported_qengines", (PyCFunction)THPModule_supportedQEngines, METH_NOARGS, nullptr}, - {"_is_xnnpack_enabled", (PyCFunction)THPModule_isEnabledXNNPACK, METH_NOARGS, nullptr}, - {"_is_torch_function_enabled", (PyCFunction)THPModule_isEnabledTorchFunction, METH_NOARGS, nullptr}, + {"_supported_qengines", THPModule_supportedQEngines, METH_NOARGS, nullptr}, + {"_is_xnnpack_enabled", THPModule_isEnabledXNNPACK, METH_NOARGS, nullptr}, + {"_is_torch_function_enabled", THPModule_isEnabledTorchFunction, METH_NOARGS, nullptr}, {"_disabled_torch_function_impl", (PyCFunction)THPModule_disable_torch_function, METH_VARARGS, nullptr}, {nullptr, nullptr, 0, nullptr} }; @@ -688,9 +695,9 @@ PyObject* initModule() { #ifdef USE_CUDA THPUtils_addPyMethodDefs(methods, THCPModule_methods()); #endif -#ifdef USE_DISTRIBUTED -#ifdef USE_C10D +#if defined(USE_DISTRIBUTED) && defined(USE_C10D) THPUtils_addPyMethodDefs(methods, torch::distributed::c10d::python_functions()); +#ifndef _WIN32 THPUtils_addPyMethodDefs(methods, torch::distributed::rpc::python_functions()); THPUtils_addPyMethodDefs( methods, torch::distributed::autograd::python_functions()); @@ -746,6 +753,7 @@ PyObject* initModule() { ASSERT_TRUE(THPQUInt8Storage_init(module)); ASSERT_TRUE(THPQInt8Storage_init(module)); ASSERT_TRUE(THPQInt32Storage_init(module)); + ASSERT_TRUE(THPQUInt4x2Storage_init(module)); ASSERT_TRUE(THPBFloat16Storage_init(module)); ASSERT_TRUE(THPComplexDoubleStorage_init(module)); ASSERT_TRUE(THPComplexFloatStorage_init(module)); @@ -821,6 +829,26 @@ Call this whenever a new thread is created in order to propagate values from ASSERT_TRUE(set_module_attr("has_mkl", at::hasMKL() ? Py_True : Py_False)); ASSERT_TRUE(set_module_attr("has_lapack", at::hasLAPACK() ? 
Py_True : Py_False)); + py_module.def( + "valgrind_supported_platform", [](){ + #if defined(NVALGRIND) + return false; + #else + return true; + #endif + } + ); + + py_module.def( + "valgrind_toggle", [](){ + #if defined(NVALGRIND) + TORCH_CHECK(false, "Valgrind is not supported."); + #else + CALLGRIND_TOGGLE_COLLECT; + #endif + } + ); + #ifdef USE_CUDA PyObject *has_cuda = Py_True; #else diff --git a/torch/csrc/Storage.h b/torch/csrc/Storage.h index 5e708f2b4f2d..e7c8bfdbe4f2 100644 --- a/torch/csrc/Storage.h +++ b/torch/csrc/Storage.h @@ -35,7 +35,8 @@ PyObject_IsInstance(obj, THPComplexDoubleStorageClass) #define THPComplexFloatStorage_Check(obj) \ PyObject_IsInstance(obj, THPComplexFloatStorageClass) - +#define THPQUInt4x2Storage_Check(obj) \ + PyObject_IsInstance(obj, THPQUInt8StorageClass) #define THPDoubleStorage_CData(obj) (obj)->cdata #define THPFloatStorage_CData(obj) (obj)->cdata @@ -52,6 +53,7 @@ #define THPBFloat16Storage_CData(obj) (obj)->cdata #define THPComplexDoubleStorage_CData(obj) (obj)->cdata #define THPComplexFloatStorage_CData(obj) (obj)->cdata +#define THPQUInt4x2Storage_CData(obj) (obj)->cdata #define THPStorageType TH_CONCAT_3(THP,Real,StorageType) #define THPStorageBaseStr TH_CONCAT_STRING_2(Real,StorageBase) diff --git a/torch/csrc/WindowsTorchApiMacro.h b/torch/csrc/WindowsTorchApiMacro.h index 7f8ef4e01677..7f44db0baba9 100644 --- a/torch/csrc/WindowsTorchApiMacro.h +++ b/torch/csrc/WindowsTorchApiMacro.h @@ -5,3 +5,9 @@ // There's no difference between aten, torch and caffe2 libs any more // TODO: clean up the naming for consistency #define TORCH_API CAFFE2_API + +#ifdef _WIN32 +#define TORCH_PYTHON_API +#else +#define TORCH_PYTHON_API CAFFE2_API +#endif diff --git a/torch/csrc/api/include/torch/fft.h b/torch/csrc/api/include/torch/fft.h index 9622f668214f..8a094ec9e235 100644 --- a/torch/csrc/api/include/torch/fft.h +++ b/torch/csrc/api/include/torch/fft.h @@ -35,6 +35,36 @@ inline Tensor ifft(const Tensor& self, return torch::fft_ifft(self, n, dim, norm); } +/// Computes the N dimensional fast Fourier transform over given dimensions. +/// See https://pytorch.org/docs/master/fft.html#torch.fft.fftn. +/// +/// Example: +/// ``` +/// auto t = torch::randn({128, 128}, dtype=kComplexDouble); +/// torch::fft::fftn(t); +/// ``` +inline Tensor fftn(const Tensor& self, + c10::optional s=c10::nullopt, + c10::optional dim=c10::nullopt, + c10::optional norm=c10::nullopt) { + return torch::fft_fftn(self, s, dim, norm); +} + +/// Computes the N dimensional fast Fourier transform over given dimensions. +/// See https://pytorch.org/docs/master/fft.html#torch.fft.ifftn. +/// +/// Example: +/// ``` +/// auto t = torch::randn({128, 128}, dtype=kComplexDouble); +/// torch::fft::ifftn(t); +/// ``` +inline Tensor ifftn(const Tensor& self, + c10::optional s=c10::nullopt, + c10::optional dim=c10::nullopt, + c10::optional norm=c10::nullopt) { + return torch::fft_ifftn(self, s, dim, norm); +} + /// Computes the 1 dimensional FFT of real input with onesided Hermitian output. /// See https://pytorch.org/docs/master/fft.html#torch.fft.rfft. /// @@ -69,6 +99,36 @@ inline Tensor irfft(const Tensor& self, return torch::fft_irfft(self, n, dim, norm); } +/// Computes the N dimensional FFT of real input with onesided Hermitian output. 
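The two `py_module.def` bindings above expose a Callgrind toggle to Python. A hedged sketch of how they could be driven; the `torch._C.valgrind_*` attribute names are assumed from the binding registration, and the calls are only meaningful when the process runs under `valgrind --tool=callgrind --collect-atstart=no`:

```python
import torch

# Hypothetical driver for the new bindings; names assume they surface on torch._C.
if torch._C.valgrind_supported_platform():
    torch._C.valgrind_toggle()                       # start collecting Callgrind counts
    torch.randn(128, 128) @ torch.randn(128, 128)    # region of interest
    torch._C.valgrind_toggle()                       # stop collecting
```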
+/// See https://pytorch.org/docs/master/fft.html#torch.fft.rfftn +/// +/// Example: +/// ``` +/// auto t = torch::randn({128, 128}, dtype=kDouble); +/// torch::fft::rfftn(t); +/// ``` +inline Tensor rfftn(const Tensor& self, + c10::optional s=c10::nullopt, + c10::optional dim=c10::nullopt, + c10::optional norm=c10::nullopt) { + return torch::fft_rfftn(self, s, dim, norm); +} + +/// Computes the inverse of torch.fft.rfftn. +/// See https://pytorch.org/docs/master/fft.html#torch.fft.irfftn. +/// +/// Example: +/// ``` +/// auto t = torch::randn({128, 128}, dtype=kComplexDouble); +/// torch::fft::irfftn(t); +/// ``` +inline Tensor irfftn(const Tensor& self, + c10::optional s=c10::nullopt, + c10::optional dim=c10::nullopt, + c10::optional norm=c10::nullopt) { + return torch::fft_irfftn(self, s, dim, norm); +} + /// Computes the 1 dimensional FFT of a onesided Hermitian signal /// /// The input represents a Hermitian symmetric time domain signal. The returned @@ -106,4 +166,66 @@ inline Tensor ihfft(const Tensor& self, return torch::fft_ihfft(self, n, dim, norm); } +/// Computes the discrete Fourier Transform sample frequencies for a signal of size n. +/// +/// See https://pytorch.org/docs/master/fft.html#torch.fft.fftfreq +/// +/// Example: +/// ``` +/// auto frequencies = torch::fft::fftfreq(128, torch::kDouble); +/// ``` +inline Tensor fftfreq(int64_t n, double d, const TensorOptions& options={}) { + return torch::fft_fftfreq(n, d, options); +} + +inline Tensor fftfreq(int64_t n, const TensorOptions& options={}) { + return torch::fft_fftfreq(n, /*d=*/1.0, options); +} + +/// Computes the sample frequencies for torch.fft.rfft with a signal of size n. +/// +/// Like torch.fft.rfft, only the positive frequencies are included. +/// See https://pytorch.org/docs/master/fft.html#torch.fft.rfftfreq +/// +/// Example: +/// ``` +/// auto frequencies = torch::fft::rfftfreq(128, torch::kDouble); +/// ``` +inline Tensor rfftfreq(int64_t n, double d, const TensorOptions& options) { + return torch::fft_rfftfreq(n, d, options); +} + +inline Tensor rfftfreq(int64_t n, const TensorOptions& options) { + return torch::fft_rfftfreq(n, /*d=*/1.0, options); +} + +/// Reorders n-dimensional FFT output to have negative frequency terms first, by +/// a torch.roll operation. 
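The inline wrappers above forward to the `fft_*` ATen ops that also back the Python `torch.fft` module, so the C++ and Python surfaces stay in sync. For reference, a sketch of the equivalent Python calls, assuming this build exposes the matching `torch.fft` functions:

```python
import torch
import torch.fft

t = torch.randn(128, 128, dtype=torch.complex128)
spec = torch.fft.fftn(t)                 # N-D complex transform
roundtrip = torch.fft.ifftn(spec)        # ~t up to floating-point error

real = torch.randn(128, 128, dtype=torch.float64)
half = torch.fft.rfftn(real)             # one-sided Hermitian output
back = torch.fft.irfftn(half, real.shape)

freqs = torch.fft.fftfreq(128, d=0.5)    # DFT sample frequencies for spacing d
pos_freqs = torch.fft.rfftfreq(128)      # positive frequencies only
```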
+/// +/// See https://pytorch.org/docs/master/fft.html#torch.fft.fftshift +/// +/// Example: +/// ``` +/// auto x = torch::randn({127, 4}); +/// auto centred_fft = torch::fft::fftshift(torch::fft::fftn(x)); +/// ``` +inline Tensor fftshift(const Tensor& x, c10::optional dim=c10::nullopt) { + return torch::fft_fftshift(x, dim); +} + +/// Inverse of torch.fft.fftshift +/// +/// See https://pytorch.org/docs/master/fft.html#torch.fft.ifftshift +/// +/// Example: +/// ``` +/// auto x = torch::randn({127, 4}); +/// auto shift = torch::fft::fftshift(x) +/// auto unshift = torch::fft::ifftshift(shift); +/// assert(torch::allclose(x, unshift)); +/// ``` +inline Tensor ifftshift(const Tensor& x, c10::optional dim=c10::nullopt) { + return torch::fft_ifftshift(x, dim); +} + }} // torch::fft diff --git a/torch/csrc/api/include/torch/nn/functional/loss.h b/torch/csrc/api/include/torch/nn/functional/loss.h index b5a06f4cfb14..6ed3c37311c0 100644 --- a/torch/csrc/api/include/torch/nn/functional/loss.h +++ b/torch/csrc/api/include/torch/nn/functional/loss.h @@ -307,9 +307,9 @@ inline Tensor cosine_embedding_loss( // ============================================================================ -inline Tensor _smooth_l1_loss(const Tensor& input, const Tensor& target) { +inline Tensor _smooth_l1_loss(const Tensor& input, const Tensor& target, double beta = 1.) { auto t = torch::abs(input - target); - return torch::where(t < 1, 0.5 * torch::pow(t, 2), t - 0.5); + return torch::where(t < beta, 0.5 * torch::pow(t, 2) / beta, t - 0.5 * beta); } #ifndef DOXYGEN_SHOULD_SKIP_THIS @@ -317,7 +317,8 @@ namespace detail { inline Tensor smooth_l1_loss( const Tensor& input, const Tensor& target, - SmoothL1LossFuncOptions::reduction_t reduction) { + SmoothL1LossFuncOptions::reduction_t reduction, + double beta = 1.) { if (target.sizes() != input.sizes()) { TORCH_WARN("Using a target size (", target.sizes(), ") that is different to the input size (", input.sizes(), "). ", "This will likely lead to incorrect results due to broadcasting. ", @@ -325,7 +326,7 @@ inline Tensor smooth_l1_loss( } std::vector expanded_tensors = torch::broadcast_tensors({input, target}); - return torch::smooth_l1_loss(expanded_tensors[0], expanded_tensors[1], enumtype::reduction_get_enum(reduction)); + return torch::smooth_l1_loss(expanded_tensors[0], expanded_tensors[1], enumtype::reduction_get_enum(reduction), beta); } } // namespace detail #endif /* DOXYGEN_SHOULD_SKIP_THIS */ @@ -344,8 +345,9 @@ inline Tensor smooth_l1_loss( inline Tensor smooth_l1_loss( const Tensor& input, const Tensor& target, - const SmoothL1LossFuncOptions& options = {}) { - return detail::smooth_l1_loss(input, target, options.reduction()); + const SmoothL1LossFuncOptions& options = {}, + double beta = 1.) 
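The updated `_smooth_l1_loss` helper generalizes the loss to an arbitrary transition point `beta`: quadratic for `|x - y| < beta`, linear beyond, with the two pieces joining smoothly at `beta`. A small sketch of the same piecewise formula in Python (beta > 0 assumed, matching the helper's default of 1):

```python
import torch

def smooth_l1(input, target, beta=1.0):
    # quadratic region scaled by 1/beta so the pieces meet at |x - y| == beta
    t = torch.abs(input - target)
    return torch.where(t < beta, 0.5 * t ** 2 / beta, t - 0.5 * beta)

x, y = torch.randn(10), torch.randn(10)
print(smooth_l1(x, y, beta=0.5).mean())   # 'mean' reduction by hand
```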
{ + return detail::smooth_l1_loss(input, target, options.reduction(), beta); } // ============================================================================ @@ -525,6 +527,85 @@ inline Tensor triplet_margin_loss( // ============================================================================ +#ifndef DOXYGEN_SHOULD_SKIP_THIS +namespace detail { +inline Tensor triplet_margin_with_distance_loss( + const Tensor& anchor, + const Tensor& positive, + const Tensor& negative, + c10::optional distance_function, + double margin, + bool swap, + TripletMarginWithDistanceLossFuncOptions::reduction_t reduction) { + Tensor dist_pos, dist_neg; + if (distance_function.has_value()) { + auto distance_function_impl = distance_function.value(); + dist_pos = distance_function_impl(anchor, positive); + dist_neg = distance_function_impl(anchor, negative); + } else { + dist_pos = pairwise_distance(anchor, positive); + dist_neg = pairwise_distance(anchor, negative); + } + + if (swap) { + Tensor dist_swap; + if (distance_function.has_value()) { + dist_swap = distance_function.value()(positive, negative); + } else { + dist_swap = pairwise_distance(positive, negative); + } + dist_neg = torch::min(dist_neg, dist_swap); + } + + auto loss = torch::clamp_min(dist_pos - dist_neg + margin, 0); + + Tensor ret; + if (c10::get_if(&reduction)) { + ret = loss; + } else if (c10::get_if(&reduction)) { + ret = loss.mean(); + } else if (c10::get_if(&reduction)) { + ret = loss.sum(); + } else { + ret = anchor; + TORCH_INTERNAL_ASSERT( + false, + enumtype::get_enum_name(reduction), + " is not valid"); + } + return ret; +} +} // namespace detail +#endif /* DOXYGEN_SHOULD_SKIP_THIS */ + +/// See https://pytorch.org/docs/master/nn.functional.html#torch.nn.functional.triplet_margin_with_distance_loss +/// about the exact behavior of this functional. +/// +/// See the documentation for `torch::nn::functional::TripletMarginWithDistanceLossFuncOptions` class to learn what +/// optional arguments are supported for this functional. +/// +/// Example: +/// ``` +/// namespace F = torch::nn::functional; +/// F::triplet_margin_with_distance_loss(anchor, positive, negative, F::TripletMarginWithDistanceLossFuncOptions().margin(1.0)); +/// ``` +inline Tensor triplet_margin_with_distance_loss( + const Tensor& anchor, + const Tensor& positive, + const Tensor& negative, + const TripletMarginWithDistanceLossFuncOptions& options = {}) { + return detail::triplet_margin_with_distance_loss( + anchor, + positive, + negative, + options.distance_function(), + options.margin(), + options.swap(), + options.reduction()); +} + +// ============================================================================ + #ifndef DOXYGEN_SHOULD_SKIP_THIS namespace detail { inline Tensor ctc_loss(const Tensor& log_probs, diff --git a/torch/csrc/api/include/torch/nn/modules/loss.h b/torch/csrc/api/include/torch/nn/modules/loss.h index d136f9cb7ee9..8c9308864842 100644 --- a/torch/csrc/api/include/torch/nn/modules/loss.h +++ b/torch/csrc/api/include/torch/nn/modules/loss.h @@ -309,7 +309,7 @@ struct TORCH_API SmoothL1LossImpl : public Cloneable { TORCH_MODULE(SmoothL1Loss); // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MultiLabelMarginLoss ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - + /// Creates a criterion that optimizes a multi-class multi-classification /// hinge loss (margin-based loss) between input :math:`x` (a 2D mini-batch `Tensor`) /// and output :math:`y` (which is a 2D `Tensor` of target class indices). 
@@ -421,9 +421,9 @@ TORCH_MODULE(MultiLabelSoftMarginLoss); // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ TripletMarginLoss ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ /// Creates a criterion that measures the triplet loss given an input -/// tensors :math:`x1`, :math:`x2`, :math:`x3` and a margin with a value greater +/// tensors :math:`x1`, :math:`x2`, :math:`x3` and a margin with a value greater /// than :math:`0`. This is used for measuring a relative similarity between -/// samples. A triplet is composed by `a`, `p` and `n` (i.e., `anchor`, +/// samples. A triplet is composed by `a`, `p` and `n` (i.e., `anchor`, /// `positive examples` and `negative examples` respectively). The /// shapes of all input tensors should be :math:`(N, D)`. /// See https://pytorch.org/docs/master/nn.html#torch.nn.TripletMarginLoss to learn @@ -461,6 +461,50 @@ struct TORCH_API TripletMarginLossImpl : public Cloneable /// module storage semantics. TORCH_MODULE(TripletMarginLoss); +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ TripletMarginWithDistanceLoss ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +/// Creates a criterion that measures the triplet loss given input +/// tensors :math:`a`, :math:`p`, and :math:`n` (representing anchor, +/// positive, and negative examples, respectively); and a nonnegative, real-valued function +/// ("distance function") used to compute the relationships between the anchor +/// and positive example ("positive distance") and the anchor and negative +/// example ("negative distance"). +/// See https://pytorch.org/docs/master/nn.html#torch.nn.TripletMarginWithDistanceLoss to learn +/// about the exact behavior of this module. +/// +/// See the documentation for `torch::nn::TripletMarginWithDistanceLossOptions` class to learn what +/// constructor arguments are supported for this module. +/// +/// Example: +/// ``` +/// TripletMarginWithDistanceLoss model(TripletMarginWithDistanceLossOptions().margin(3).swap(false)); +/// ``` +struct TORCH_API TripletMarginWithDistanceLossImpl : public Cloneable { + explicit TripletMarginWithDistanceLossImpl( + TripletMarginWithDistanceLossOptions options_ = {}); + + void reset() override; + + /// Pretty prints the `TripletMarginWithDistanceLoss` module into the given `stream`. + void pretty_print(std::ostream& stream) const override; + + Tensor forward( + const Tensor& anchor, + const Tensor& positive, + const Tensor& negative); + + /// The options with which this `Module` was constructed. + TripletMarginWithDistanceLossOptions options; +}; + +/// A `ModuleHolder` subclass for `TripletMarginWithDistanceLossImpl`. +/// See the documentation for `TripletMarginWithDistanceLossImpl` class to learn what methods it +/// provides, and examples of how to use `TripletMarginWithDistanceLoss` with +/// `torch::nn::TripletMarginWithDistanceLossOptions`. +/// See the documentation for `ModuleHolder` to learn about PyTorch's +/// module storage semantics. +TORCH_MODULE(TripletMarginWithDistanceLoss); + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CTCLoss ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ /// The Connectionist Temporal Classification loss. @@ -626,9 +670,9 @@ TORCH_MODULE(NLLLoss); struct TORCH_API CrossEntropyLossImpl : public Cloneable { explicit CrossEntropyLossImpl( const CrossEntropyLossOptions& options_ = {}); - + void reset() override; - + /// Pretty prints the `CrossEntropyLoss` module into the given `stream`. 
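For readers following the new `TripletMarginWithDistanceLoss` pieces, here is the computation performed by the `detail::triplet_margin_with_distance_loss` body above, sketched in Python with `F.pairwise_distance` as the default distance, as the C++ code uses:

```python
import torch
import torch.nn.functional as F

def triplet_margin_with_distance_loss(anchor, positive, negative,
                                      distance_function=None, margin=1.0,
                                      swap=False, reduction="mean"):
    d = distance_function if distance_function is not None else F.pairwise_distance
    dist_pos = d(anchor, positive)
    dist_neg = d(anchor, negative)
    if swap:
        # use the smaller of d(anchor, negative) and d(positive, negative)
        dist_neg = torch.min(dist_neg, d(positive, negative))
    loss = torch.clamp_min(dist_pos - dist_neg + margin, 0)
    if reduction == "mean":
        return loss.mean()
    if reduction == "sum":
        return loss.sum()
    return loss   # 'none'

a, p, n = (torch.randn(8, 16) for _ in range(3))
print(triplet_margin_with_distance_loss(a, p, n, margin=1.0, swap=True))
```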
void pretty_print(std::ostream& stream) const override; diff --git a/torch/csrc/api/include/torch/nn/modules/transformercoder.h b/torch/csrc/api/include/torch/nn/modules/transformercoder.h index 04518a177333..6b69f53ecf33 100644 --- a/torch/csrc/api/include/torch/nn/modules/transformercoder.h +++ b/torch/csrc/api/include/torch/nn/modules/transformercoder.h @@ -32,6 +32,8 @@ namespace nn { class TORCH_API TransformerEncoderImpl : public Cloneable { public: + TransformerEncoderImpl(TransformerEncoderLayer encoder_layer, int64_t num_layers) + : TransformerEncoderImpl(TransformerEncoderOptions(encoder_layer, num_layers)) {} explicit TransformerEncoderImpl(TransformerEncoderOptions options_); Tensor forward( diff --git a/torch/csrc/api/include/torch/nn/options/loss.h b/torch/csrc/api/include/torch/nn/options/loss.h index 16cdd02aa562..e175aa02294a 100644 --- a/torch/csrc/api/include/torch/nn/options/loss.h +++ b/torch/csrc/api/include/torch/nn/options/loss.h @@ -388,6 +388,51 @@ using TripletMarginLossFuncOptions = TripletMarginLossOptions; // ============================================================================ +/// Options for the `TripletMarginWithDistanceLoss` module. +/// +/// Example: +/// ``` +/// TripletMarginWithDistanceLoss model(TripletMarginWithDistanceLossOptions().margin(3).swap(false)); +/// ``` +struct TORCH_API TripletMarginWithDistanceLossOptions { + typedef c10::variant reduction_t; + typedef std::function distance_function_t; + + /// Specifies a nonnegative, real-valued function that quantifies the + /// closeness of two tensors. If not specified, `F::pairwise_distance` will + /// be used. Default: nullopt + TORCH_ARG(c10::optional, distance_function) = c10::nullopt; + /// Specifies a nonnegative margin representing the minimum difference + /// between the positive and negative distances required for the loss to be 0. + /// Larger margins penalize cases where the negative examples are not distance + /// enough from the anchors, relative to the positives. Default: 1 + TORCH_ARG(double, margin) = 1.0; + /// Whether to use the distance swap described in the paper Learning shallow + /// convolutional feature descriptors with triplet losses by V. Balntas, + /// E. Riba et al. If True, and if the positive example is closer to the + /// negative example than the anchor is, swaps the positive example and the + /// anchor in the loss computation. Default: False + TORCH_ARG(bool, swap) = false; + /// Specifies the reduction to apply to the output. Default: Mean + TORCH_ARG(reduction_t, reduction) = torch::kMean; +}; + +namespace functional { +/// Options for `torch::nn::functional::triplet_margin_with_distance_loss`. +/// +/// See the documentation for `torch::nn::TripletMarginWithDistanceLossOptions` class to learn what +/// arguments are supported. +/// +/// Example: +/// ``` +/// namespace F = torch::nn::functional; +/// F::triplet_margin_with_distance_loss(anchor, positive, negative, F::TripletMarginWithDistanceLossFuncOptions().margin(1.0)); +/// ``` +using TripletMarginWithDistanceLossFuncOptions = TripletMarginWithDistanceLossOptions; +} // namespace functional + +// ============================================================================ + /// Options for the `CTCLoss` module. 
/// /// Example: diff --git a/torch/csrc/api/src/nn/modules/loss.cpp b/torch/csrc/api/src/nn/modules/loss.cpp index 43ab1119def9..4b41b88c420c 100644 --- a/torch/csrc/api/src/nn/modules/loss.cpp +++ b/torch/csrc/api/src/nn/modules/loss.cpp @@ -180,6 +180,33 @@ Tensor TripletMarginLossImpl::forward( // ============================================================================ +TripletMarginWithDistanceLossImpl::TripletMarginWithDistanceLossImpl( + TripletMarginWithDistanceLossOptions options_) + : options(std::move(options_)) {} + +void TripletMarginWithDistanceLossImpl::reset() {} + +void TripletMarginWithDistanceLossImpl::pretty_print(std::ostream& stream) const { + stream << "torch::nn::TripletMarginWithDistanceLoss(margin=" << options.margin() + << std::boolalpha << ", swap=" << options.swap() << ")"; +} + +Tensor TripletMarginWithDistanceLossImpl::forward( + const Tensor& anchor, + const Tensor& positive, + const Tensor& negative) { + return F::detail::triplet_margin_with_distance_loss( + anchor, + positive, + negative, + options.distance_function(), + options.margin(), + options.swap(), + options.reduction()); +} + +// ============================================================================ + MultiLabelMarginLossImpl::MultiLabelMarginLossImpl( const torch::nn::MultiLabelMarginLossOptions& options_) : options(options_) {} @@ -223,9 +250,9 @@ void SmoothL1LossImpl::pretty_print(std::ostream& stream) const { Tensor SmoothL1LossImpl::forward(const Tensor& input, const Tensor& target) { return F::detail::smooth_l1_loss(input, target, options.reduction()); } - + // ============================================================================ - + CTCLossImpl::CTCLossImpl(const CTCLossOptions& options_) : options(options_) {} void CTCLossImpl::reset() {} diff --git a/torch/csrc/autograd/FunctionsManual.cpp b/torch/csrc/autograd/FunctionsManual.cpp index 29f0720fb3c7..1314a98e9562 100644 --- a/torch/csrc/autograd/FunctionsManual.cpp +++ b/torch/csrc/autograd/FunctionsManual.cpp @@ -87,6 +87,14 @@ static Tensor wrapped_scalar_tensor(Scalar scalar) { return tensor; } +Tensor handle_r_to_c(ScalarType self_st, Tensor gradient_result) { + if (!at::isComplexType(self_st) && gradient_result.is_complex()) { + // R -> C + return at::real(gradient_result); + } + return gradient_result; +} + Tensor restore_reduced_dims(const Tensor &output, IntArrayRef dims, bool keepdim) { if (keepdim) { return output; @@ -211,6 +219,17 @@ Tensor mvlgamma_backward(Tensor grad, const Tensor & self, int64_t p) { return grad * args.digamma_().sum(-1); } +Tensor sgn_backward(Tensor result, Tensor grad, Tensor self) { + if (self.is_complex()) { + auto abs = at::abs(self); + // C -> C + // https://arxiv.org/pdf/1701.00392.pdf Section 4.20 + return at::where(abs == 0.0, at::zeros({}, grad.options()), (grad/abs - (at::real(grad/self) * result))); + } else { + return at::zeros_like(grad, at::MemoryFormat::Preserve); + } +} + Tensor mul_tensor_backward(Tensor grad, Tensor other, ScalarType self_st) { auto result = grad * other.conj(); if (!at::isComplexType(self_st) && result.is_complex()) { @@ -220,6 +239,24 @@ Tensor mul_tensor_backward(Tensor grad, Tensor other, ScalarType self_st) { return result; } +Tensor div_tensor_self_backward(Tensor grad, Tensor other, ScalarType self_st) { + auto result = grad / other.conj(); + if (!at::isComplexType(self_st) && result.is_complex()) { + // R -> C + result = at::real(result); + } + return result; +} + +Tensor div_tensor_other_backward(Tensor grad, Tensor self, Tensor other) { + 
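Written out, the complex branch of the new `sgn_backward` above computes, for `s = sgn(z) = z/|z|` and incoming gradient `g`:

```latex
\bar{z} =
\begin{cases}
  \dfrac{g}{|z|} - \operatorname{Re}\!\left(\dfrac{g}{z}\right)\operatorname{sgn}(z), & z \neq 0,\\[4pt]
  0, & z = 0,
\end{cases}
```

which is exactly `grad/abs - real(grad/self) * result` guarded by the `abs == 0` check; for real inputs the gradient is identically zero, as the `else` branch returns `zeros_like(grad)`.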
auto result = -grad * ((self / other) / other).conj(); + if (!other.is_complex() && result.is_complex()) { + // R -> C + result = at::real(result); + } + return result; +} + Tensor permute_backwards(const Tensor & grad, IntArrayRef fwd_dims) { // invert the permutation auto ndims = fwd_dims.size(); @@ -563,7 +600,12 @@ Tensor sum_tensorlist(TensorList tl) { return sum; } -Tensor repeat_backward(Tensor grad, int64_t input_dims, IntArrayRef repeats) { +Tensor repeat_backward(Tensor grad, IntArrayRef repeats, IntArrayRef input_shape) { + auto find_iter = std::find(repeats.cbegin(), repeats.cend(), 0); + if (find_iter != repeats.cend()) { + return at::zeros(input_shape, grad.options()); + } + const auto input_dims = input_shape.size(); int64_t num_unsqueezed = grad.dim() - input_dims; for (int64_t i = 0; i < num_unsqueezed; ++i) { grad = grad.sum(0, false); @@ -590,13 +632,12 @@ Tensor _fused_dropout_backward(Tensor grad, Tensor mask, double p1m) { } Tensor evenly_distribute_backward(Tensor grad, const Tensor & input, const Tensor & value) { - auto mask = (input == value); - auto count = mask.sum(); - auto grad_input = grad / count; if (input.is_cuda()) { - return mask * grad_input; + auto mask = (input == value).logical_or_(input.isnan().logical_and_(value.isnan())); + return mask * (grad / mask.sum()); } else { - return at::zeros_like(input).masked_fill_(mask, grad_input); + auto mask = value.isnan().item() ? input.isnan() : input == value; + return at::zeros_like(input).masked_fill_(mask, grad / mask.sum()); } } @@ -615,11 +656,11 @@ Tensor var_backward(Tensor grad, const Tensor & self, IntArrayRef dim, bool unbi } Tensor std_backward(const Tensor & result, const Tensor & grad, const Tensor & self, bool unbiased) { - return var_backward(grad / (result * 2), self, unbiased); + return var_backward((grad / (result * 2)).masked_fill_(result == 0, 0), self, unbiased); } Tensor std_backward(const Tensor & result, Tensor grad, const Tensor & self, IntArrayRef dim, bool unbiased, bool keepdim) { - return var_backward(grad / (result * 2), self, dim, unbiased, keepdim); + return var_backward((grad / (result * 2)).masked_fill_(result == 0, 0), self, dim, unbiased, keepdim); } Tensor mean_backward(Tensor grad, const IntArrayRef sizes, IntArrayRef dim, bool keepdim) { @@ -682,15 +723,15 @@ Tensor cholesky_backward(Tensor grad, bool upper, Tensor L) { // leads to stable gradient updates, and retains symmetry of the updated matrix if it // were updated by a gradient based algorithm. 
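The zero-repeat special case added to `repeat_backward` above makes the gradient of an empty repeat well-defined; a minimal sketch, assuming a standard libtorch build:

```
#include <iostream>
#include <torch/torch.h>

int main() {
  auto x = torch::randn({2, 3}, torch::requires_grad());
  // A zero in `repeats` yields an empty output; with the early return above,
  // the gradient w.r.t. `x` is simply zeros with `x`'s shape.
  auto y = x.repeat({0, 1});
  y.sum().backward();
  std::cout << x.grad().sizes() << "\n";  // [2, 3]
  return 0;
}
```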
if (upper) { - L = L.transpose(-1, -2); - grad = grad.transpose(-1, -2); + L = L.transpose(-1, -2).conj(); + grad = grad.transpose(-1, -2).conj(); } auto L_inverse = std::get<0>(at::triangular_solve(at::eye(L.size(-1), L.options()), L, /*upper=*/false)); - auto phi = at::matmul(L.transpose(-1, -2), grad); + auto phi = at::matmul(L.transpose(-1, -2).conj(), grad); phi.tril_().diagonal(/*offset=*/0, /*dim1=*/-2, /*dim2=*/-1).mul_(0.5); - auto grad_input = at::matmul(at::matmul(L_inverse.transpose(-1, -2), phi), L_inverse); - return grad_input.add(grad_input.transpose(-1, -2)).mul_(0.5); // Symmetrizing the gradient + auto grad_input = at::matmul(at::matmul(L_inverse.transpose(-1, -2).conj(), phi), L_inverse); + return grad_input.add(grad_input.transpose(-1, -2).conj()).mul_(0.5); // Symmetrizing the gradient } Tensor cholesky_inverse_backward(Tensor grad, Tensor L, bool upper, Tensor inverse) { @@ -923,20 +964,24 @@ Tensor l1_loss_double_backward_grad_output(const Tensor & grad, const Tensor & i return output; } -Tensor smooth_l1_loss_double_backward(const Tensor & grad, const Tensor & input, const Tensor & target, int64_t reduction) { +Tensor smooth_l1_loss_double_backward(const Tensor & grad, const Tensor & input, const Tensor & target, int64_t reduction, double beta) { + // special case to protect against a divide-by-zero. + if (beta == 0) { + return at::zeros(grad.sizes(), grad.options()); + } auto d = (input - target).abs(); - auto grad_input = grad * (d < 1).type_as(grad); + auto grad_input = grad * (d < beta).type_as(grad) / beta; if (reduction == at::Reduction::Mean) { grad_input /= input.numel(); } return grad_input; } -Tensor smooth_l1_loss_double_backward_grad_output(const Tensor & grad, const Tensor & grad_output, const Tensor & input, const Tensor & target, int64_t reduction) { +Tensor smooth_l1_loss_double_backward_grad_output(const Tensor & grad, const Tensor & grad_output, const Tensor & input, const Tensor & target, int64_t reduction, double beta) { if (reduction == at::Reduction::None) { - return smooth_l1_loss_backward(grad, input, target, reduction); + return smooth_l1_loss_backward(grad, input, target, reduction, beta); } - auto r = smooth_l1_loss_backward(ones_like(grad_output), input, target, reduction); + auto r = smooth_l1_loss_backward(ones_like(grad_output), input, target, reduction, beta); return (r * grad).sum(); } diff --git a/torch/csrc/autograd/FunctionsManual.h b/torch/csrc/autograd/FunctionsManual.h index b4e7d1667f88..00171cbbf656 100644 --- a/torch/csrc/autograd/FunctionsManual.h +++ b/torch/csrc/autograd/FunctionsManual.h @@ -33,6 +33,7 @@ bool any_variable_defined(variable_list& variables); void copy_range(variable_list& out, IndexRange range, const at::Tensor & t); void copy_range(variable_list& out, IndexRange range, at::ArrayRef t); at::Tensor not_implemented(const char* name); +at::Tensor handle_r_to_c(ScalarType self_st, Tensor gradient_result); at::Tensor maybe_multiply(const at::Tensor & t, const at::Scalar & s); int64_t _safe_size(IntArrayRef sizes, IntArrayRef dim); Tensor restore_reduced_dims(const Tensor &output, IntArrayRef dims, bool keepdim); @@ -44,6 +45,8 @@ at::Tensor pow_backward_self(at::Tensor grad, const at::Tensor & self, const at: at::Tensor pow_backward_exponent(at::Tensor grad, const at::Tensor& self, const at::Tensor& exponent, at::Tensor result); at::Tensor pow_backward_exponent(at::Tensor grad, const at::Scalar & base, const at::Tensor& exponent, at::Tensor result); at::Tensor mul_tensor_backward(Tensor grad, Tensor other, 
ScalarType self_st); +at::Tensor div_tensor_self_backward(Tensor grad, Tensor other, ScalarType self_st); +at::Tensor div_tensor_other_backward(Tensor grad, Tensor self, Tensor other); at::Tensor mvlgamma_backward(at::Tensor grad, const at::Tensor & self, int64_t p); at::Tensor permute_backwards(const at::Tensor & grad, at::IntArrayRef fwd_dims); at::Tensor rad2deg_backward(const at::Tensor& grad); @@ -71,9 +74,10 @@ at::Tensor mm_mat2_backward(const at::Tensor & grad, const at::Tensor & mat1, at at::Tensor _sparse_addmm_sparse_backward(const at::Tensor& grad, const at::Tensor& sparse_, const at::Tensor& dense, const at::Scalar& alpha); at::Tensor renorm_backward(const at::Tensor & grad, const at::Tensor & self, at::Scalar p, int64_t dim, at::Scalar maxnorm); at::Tensor sum_tensorlist(at::TensorList tl); -at::Tensor repeat_backward(at::Tensor grad, int64_t input_dims, at::IntArrayRef repeats); +at::Tensor repeat_backward(at::Tensor grad, at::IntArrayRef repeats, at::IntArrayRef input_shape); at::Tensor _fused_dropout_backward(at::Tensor grad, at::Tensor mask, double p1m); at::Tensor evenly_distribute_backward(at::Tensor grad, const at::Tensor & input, const at::Tensor & value); +at::Tensor sgn_backward(Tensor result, Tensor grad, Tensor self); at::Tensor var_backward(const at::Tensor & grad, const at::Tensor & self, bool unbiased); at::Tensor var_backward(at::Tensor grad, const at::Tensor & self, at::IntArrayRef dim, bool unbiased, bool keepdim); at::Tensor std_backward(const at::Tensor & result, const at::Tensor & grad, const at::Tensor & self, bool unbiased); @@ -101,8 +105,8 @@ at::Tensor log_softmax_double_backward(const at::Tensor & grad, const at::Tensor at::Tensor binary_cross_entropy_double_backward(const at::Tensor & grad_output, const at::Tensor & grad, const at::Tensor & input, const at::Tensor & target, const c10::optional& weight, int64_t reduction); at::Tensor binary_cross_entropy_double_backward_grad_output(const at::Tensor & grad, const at::Tensor & input, const at::Tensor & target, const c10::optional& weight, int64_t reduction); at::Tensor l1_loss_double_backward_grad_output(const at::Tensor & grad, const at::Tensor & input, const at::Tensor & target, int64_t reduction); -at::Tensor smooth_l1_loss_double_backward(const at::Tensor & grad, const at::Tensor & input, const at::Tensor & target, int64_t reduction); -at::Tensor smooth_l1_loss_double_backward_grad_output(const at::Tensor & grad, const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & target, int64_t reduction); +at::Tensor smooth_l1_loss_double_backward(const at::Tensor & grad, const at::Tensor & input, const at::Tensor & target, int64_t reduction, double beta); +at::Tensor smooth_l1_loss_double_backward_grad_output(const at::Tensor & grad, const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & target, int64_t reduction, double beta); at::Tensor mse_loss_double_backward(const at::Tensor & grad, const at::Tensor & input, int64_t reduction); at::Tensor mse_loss_double_backward_grad_output(const at::Tensor & grad, const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & target, int64_t reduction); at::Tensor soft_margin_loss_double_backward(const at::Tensor & grad, const at::Tensor & input, const at::Tensor & target, int64_t reduction); diff --git a/torch/csrc/autograd/VariableTypeManual.cpp b/torch/csrc/autograd/VariableTypeManual.cpp index c72c67eb5230..9dfc4573188a 100644 --- a/torch/csrc/autograd/VariableTypeManual.cpp +++ 
b/torch/csrc/autograd/VariableTypeManual.cpp @@ -93,7 +93,7 @@ void backward( torch::autograd::backward({self}, {_gradient}, std::move(keep_graph), create_graph); } -void set_data(const Tensor & self, const Tensor & new_data) { +void set_data(Tensor & self, const Tensor & new_data) { // `var.set_data(new_data)` shallow-copies all non-autograd TensorImpl fields // from `new_data` to `var`. It requires that `new_data` and `var` have compatible // tensor type. @@ -160,7 +160,7 @@ Tensor& requires_grad_(Tensor& self, bool _requires_grad) { return self.set_requires_grad(_requires_grad); } -void retain_grad(const Tensor & self) { +void retain_grad(Tensor & self) { TORCH_CHECK(self.requires_grad(), "can't retain_grad on Tensor that has requires_grad=False"); if (self.is_leaf()) { // no-op for leaves return; @@ -269,7 +269,12 @@ Tensor & detach_(Tensor & self) { "of detach_(). Alternatively, create this view with an " "`unsafe_` version of the function that produced it."); } else { - AT_ERROR("Can't detach views in-place. Use detach() instead"); + AT_ERROR("If you are using DistributedDataParallel (DDP) for training, " + "and gradient_as_bucket_view is set as True, gradients are " + "views of DDP buckets, and hence detach_() cannot be called " + "on these gradients. To fix this error, please refer to the " + "Optimizer.zero_grad() function in torch/optim/optimizer.py " + "as the solution."); } } // I think the choice here is conservative. In principle, doing diff --git a/torch/csrc/autograd/VariableTypeUtils.h b/torch/csrc/autograd/VariableTypeUtils.h index 692972533adc..2ef1415cc937 100644 --- a/torch/csrc/autograd/VariableTypeUtils.h +++ b/torch/csrc/autograd/VariableTypeUtils.h @@ -67,6 +67,19 @@ inline void throw_error_out_requires_grad(const char* name) { "but one of the arguments requires grad."); } +inline void throw_error_for_complex_autograd(const Tensor& tensor, const char* name) { + if (tensor.requires_grad()) { + TORCH_CHECK(!tensor.is_complex(), name, + " does not support automatic differentiation for outputs with complex dtype."); + } +} + +inline void throw_error_for_complex_autograd(const TensorList& tensorlist, const char* name) { + for (auto tensor: tensorlist) { + throw_error_for_complex_autograd(tensor, name); + } +} + // TODO: Blegh, bare references inline void rebase_history(Variable& var, std::shared_ptr grad_fn) { diff --git a/torch/csrc/autograd/autograd.cpp b/torch/csrc/autograd/autograd.cpp index ab02a03279a1..b8756ff1c7b4 100644 --- a/torch/csrc/autograd/autograd.cpp +++ b/torch/csrc/autograd/autograd.cpp @@ -75,10 +75,6 @@ variable_list run_backward( for (size_t i = 0; i < num_tensors; i++) { const Variable& output = outputs[i]; auto gradient_edge = impl::gradient_edge(output); - if(output.is_complex()) { - TORCH_WARN_ONCE("Complex backward is not fully supported yet and could lead to wrong ", - "gradients for functions we have not fixed yet"); - } TORCH_CHECK( gradient_edge.function, "element ", i, " of tensors does not require grad and does not have a grad_fn"); diff --git a/torch/csrc/autograd/engine.cpp b/torch/csrc/autograd/engine.cpp index 62ca26e46939..e952b0afc772 100644 --- a/torch/csrc/autograd/engine.cpp +++ b/torch/csrc/autograd/engine.cpp @@ -227,7 +227,7 @@ Engine::~Engine() { // Do not wait for termination of global threads on Windows // Because CRT terminates DLL threads before calling // global object destructors -#if !defined(_WIN32) || !defined(C10_BUILD_SHARED_LIBS) +#if !defined(_WIN32) || defined(C10_USE_MSVC_STATIC_RUNTIME) std::unique_lock 
lk(non_reentrant_device_thread_mutex_); while(non_reentrant_device_thread_count_.load() != 0) { non_reentrant_device_thread_condvar_.wait(lk); } @@ -513,12 +513,10 @@ void GraphTask::exec_post_processing() { } void GraphTask::set_exception_without_signal(const std::shared_ptr<Node>& fn) { - std::unique_lock<std::mutex> lock(mutex_); - if (!has_error_.load()) { + if (!has_error_.exchange(true)) { if (AnomalyMode::is_enabled() && fn) { fn->metadata()->print_stack(fn->name()); } - has_error_ = true; } } diff --git a/torch/csrc/autograd/function.h b/torch/csrc/autograd/function.h index 17d4f5473880..65d94717a84b 100644 --- a/torch/csrc/autograd/function.h +++ b/torch/csrc/autograd/function.h @@ -114,6 +114,10 @@ struct TORCH_API Node : std::enable_shared_from_this<Node> { // We are tracking the parents to track multiple backward operations. assign_parent(); } + + if (profiler::profilerEnabled()) { + thread_id_ = at::RecordFunction::currentThreadId(); + } } explicit Node(edge_list&& next_edges = edge_list()) @@ -129,8 +133,21 @@ struct TORCH_API Node : std::enable_shared_from_this<Node> { /// Evaluates the function on the given inputs and returns the result of the /// function call. variable_list operator()(variable_list&& inputs) { - RECORD_FUNCTION( - name(), std::vector<c10::IValue>(inputs.begin(), inputs.end()), sequence_nr()); + // Using RecordFunction to trigger observers in the backward pass + at::RecordFunction guard(at::RecordScope::BACKWARD_FUNCTION); + if (guard.active) { + // Using sequence number and thread id to correlate with + // the forward pass function + guard.setForwardThreadId(thread_id_); + if (guard.needs_inputs) { + guard.before( + name(), + std::vector<c10::IValue>(inputs.begin(), inputs.end()), + sequence_nr()); + } else { + guard.before(name(), sequence_nr()); + } + } // In the first iteration of named tensors, autograd ignores names and // operates on unnamed tensors. In the long term, autograd should // probably operate with names. @@ -241,6 +258,11 @@ struct TORCH_API Node : std::enable_shared_from_this<Node> { // assigning a node as a parent to this node void assign_parent(); + /// Id of the thread that created Node + uint64_t thread_id() const noexcept { + return thread_id_; + } + /// Returns the name of the dynamic type of the function, for debugging. virtual std::string name() const; @@ -362,6 +384,9 @@ struct TORCH_API Node : std::enable_shared_from_this<Node> { // fields. const uint64_t sequence_nr_; + // Id of the thread that created the instance + uint64_t thread_id_ = 0; + // Note [Thread Safety on Autograd Node] // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // Autograd Engine let the owning thread which calls Engine::execute to drive the diff --git a/torch/csrc/autograd/functions/accumulate_grad.h b/torch/csrc/autograd/functions/accumulate_grad.h index e1a02dc19fd8..dafd07f64b84 100644 --- a/torch/csrc/autograd/functions/accumulate_grad.h +++ b/torch/csrc/autograd/functions/accumulate_grad.h @@ -161,6 +161,11 @@ struct TORCH_API AccumulateGrad : public Node { // valid operation which adds `new_grad` to `variable_grad` in // place. `variable_grad` is thus still referring to the same tensor // after the operation. + // Also DistributedDataParallel(DDP) package relies on grad being + // mutated in place for saving peak memory usage. DDP will still + // work correctly if it is mutated out of place here, but DDP will + // maintain one extra copy of grad tensors in buffer and thus + // increase peak memory usage.
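The `set_exception_without_signal` change above replaces a mutex-protected check-then-set with a single atomic exchange; a standalone sketch of that pattern (plain C++, independent of the autograd engine):

```
#include <atomic>
#include <iostream>

std::atomic<bool> has_error{false};

// exchange(true) returns the previous value, so exactly one caller sees
// `false` and performs the one-time error handling; no lock is needed.
void record_error_once() {
  if (!has_error.exchange(true)) {
    std::cout << "first error recorded\n";  // e.g. print the anomaly stack here
  }
}

int main() {
  record_error_once();
  record_error_once();  // no-op: the flag was already set
  return 0;
}
```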
variable_grad += new_grad; CHECK_RESULT(variable_grad, variable); // ^ We could enforce the contract more aggressively here by writing: diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp index c92654cf7815..045a732a2016 100644 --- a/torch/csrc/autograd/init.cpp +++ b/torch/csrc/autograd/init.cpp @@ -42,26 +42,35 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) { .value("NVTX", ProfilerState::NVTX); py::class_(m, "ProfilerConfig") - .def(py::init()); + .def(py::init()); py::class_(m, "ProfilerEvent") .def("kind", &Event::kind) .def("name", [](const Event& e) { return e.name(); }) - .def("thread_id", &Event::thread_id) + .def("thread_id", &Event::threadId) + .def("fwd_thread_id", &Event::fwdThreadId) .def("device", &Event::device) - .def("cpu_elapsed_us", &Event::cpu_elapsed_us) - .def("cuda_elapsed_us", &Event::cuda_elapsed_us) - .def("has_cuda", &Event::has_cuda) + .def("cpu_elapsed_us", &Event::cpuElapsedUs) + .def("cuda_elapsed_us", &Event::cudaElapsedUs) + .def("has_cuda", &Event::hasCuda) .def("shapes", &Event::shapes) - .def("cpu_memory_usage", &Event::cpu_memory_usage) - .def("cuda_memory_usage", &Event::cuda_memory_usage) + .def("cpu_memory_usage", &Event::cpuMemoryUsage) + .def("cuda_memory_usage", &Event::cudaMemoryUsage) .def("handle", &Event::handle) - .def("node_id", &Event::node_id) + .def("node_id", &Event::nodeId) .def("is_remote", &Event::isRemote) - .def("sequence_nr", &Event::sequence_nr); + .def("sequence_nr", &Event::sequenceNr) + .def("stack", &Event::stack) + .def("scope", &Event::scope); + + py::class_(m, "_ProfilerDisableOptions") + .def(py::init()); m.def("_enable_profiler", enableProfiler); - m.def("_disable_profiler", disableProfiler); + m.def( + "_disable_profiler", + disableProfiler, + py::arg("profiler_disable_options") = ProfilerDisableOptions()); m.def("_profiler_enabled", profilerEnabled); m.def("_enable_record_function", [](bool enable) { at::enableRecordFunction(enable); diff --git a/torch/csrc/autograd/profiler.cpp b/torch/csrc/autograd/profiler.cpp index 049f857f8bbf..5cbb7606e579 100644 --- a/torch/csrc/autograd/profiler.cpp +++ b/torch/csrc/autograd/profiler.cpp @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -24,27 +25,46 @@ namespace torch { namespace autograd { namespace profiler { namespace { - enum EventIValueIdx { - KIND = 0, - NAME, - THREAD_ID, - HANDLE, - NODE_ID, - CPU_MEM_USAGE, - CPU_NS, - CUDA_RECORDED, - CUDA_MEM_USAGE, - CUDA_DEVICE, - CUDA_US, - NUM_EVENT_IVALUE_IDX // must be last in list - }; +enum EventIValueIdx { + KIND = 0, + NAME, + THREAD_ID, + HANDLE, + NODE_ID, + CPU_MEM_USAGE, + CPU_NS, + CUDA_RECORDED, + CUDA_MEM_USAGE, + CUDA_DEVICE, + CUDA_US, + SHAPES, + NUM_EVENT_IVALUE_IDX // must be last in list +}; - enum ProfilerIValueIdx { - STATE = 0, - REPORT_INPUT_SHAPES, - PROFILE_MEMORY, - NUM_PROFILER_CFG_IVALUE_IDX // must be last in list - }; +enum ProfilerIValueIdx { + STATE = 0, + REPORT_INPUT_SHAPES, + PROFILE_MEMORY, + NUM_PROFILER_CFG_IVALUE_IDX // must be last in list +}; + + const std::unordered_set disable_cuda_profiling = { + "aten::view", + "aten::t", + "aten::transpose", + "aten::stride", + "aten::empty", + "aten::empty_like", + "aten::empty_strided", + "aten::as_strided", + "aten::expand", + "aten::resize_", + "aten::squeeze", + "aten::unsqueeze", + "aten::slice", + "aten::_unsafe_view", + "aten::size" + }; CUDAStubs default_stubs; constexpr CUDAStubs* default_stubs_addr = &default_stubs; @@ -116,8 +136,9 @@ static CUDAStubs* cuda_stubs = 
default_stubs_addr; // - TorchScript functions/methods // - user defined named ranges (see `record_function` python context manager) // -// Profiler setups a pair of callbacks that record profiling events and save them -// into the thread local profiler struct (ThreadLocalDebugInfo, PROFILER_STATE slot) +// Profiler setups a pair of callbacks that record profiling events and save +// them into the thread local profiler struct (ThreadLocalDebugInfo, +// PROFILER_STATE slot) // // // Thus, the overall logic is: @@ -142,12 +163,16 @@ static CUDAStubs* cuda_stubs = default_stubs_addr; // - save profiling events into the profiling state // +struct FileLineFunc { + std::string filename; + size_t line; + std::string funcname; +}; + // Profiler state -struct ProfilerThreadLocalState - : public c10::MemoryReportingInfoBase { - explicit ProfilerThreadLocalState( - const ProfilerConfig& config) - : config_(config), remoteProfiledEvents_{c10::nullopt} {} +struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { + explicit ProfilerThreadLocalState(const ProfilerConfig& config) + : config_(config), remoteProfiledEvents_{c10::nullopt} {} ~ProfilerThreadLocalState() override = default; inline const ProfilerConfig& config() const { @@ -171,9 +196,7 @@ struct ProfilerThreadLocalState return result; } - void mark( - std::string name, - bool include_cuda = true) { + void mark(std::string name, bool include_cuda = true) { if (config_.state == ProfilerState::Disabled) { return; } @@ -181,17 +204,17 @@ struct ProfilerThreadLocalState cuda_stubs->nvtxMarkA(name.c_str()); } else { Event evt( - EventKind::Mark, - at::StringView(std::move(name)), - at::RecordFunction::currentThreadId(), - include_cuda && config_.state == ProfilerState::CUDA - ); + EventKind::Mark, + at::StringView(std::move(name)), + at::RecordFunction::currentThreadId(), + include_cuda && config_.state == ProfilerState::CUDA); evt.setNodeId(at::RecordFunction::getDefaultNodeId()); getEventList().record(std::move(evt)); } } - void setOrAddRemoteProfiledEvents(std::vector&& remoteProfiledEvents) { + void setOrAddRemoteProfiledEvents( + std::vector&& remoteProfiledEvents) { // Lock to serialize access from multiple callback threads. 
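A reduced sketch of the deny-list check the profiling callbacks below apply before recording CUDA events, so the cheap view/metadata ops in the `disable_cuda_profiling` set above are timed on CPU only; the helper name here is illustrative:

```
#include <string>
#include <unordered_set>

static const std::unordered_set<std::string> kSkipCudaTiming = {
    "aten::view", "aten::t", "aten::size"};  // abbreviated list

// CUDA event timing is skipped for deny-listed ops even in CUDA profiling mode.
bool should_record_cuda(bool cuda_mode, const std::string& op_name) {
  return cuda_mode && kSkipCudaTiming.count(op_name) == 0;
}
```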
std::lock_guard guard(state_mutex_); if (remoteProfiledEvents_) { @@ -202,31 +225,44 @@ struct ProfilerThreadLocalState } void pushRange( - const at::StringView& name, + const at::RecordFunction& fn, + const bool record_cuda, const char* msg = "", - int64_t sequence_nr = -1, - std::vector>&& shapes = {}, - at::RecordFunctionHandle handle = 0) { + std::vector>&& shapes = {}) { if (config_.state == ProfilerState::Disabled) { return; } if (config_.state == ProfilerState::NVTX) { cuda_stubs->nvtxRangePushA(getNvtxStr( - name, msg, sequence_nr, shapes).c_str()); + fn.name(), msg, fn.seqNr(), shapes).c_str()); } else { - Event evt(EventKind::PushRange, - name, + Event evt( + EventKind::PushRange, + fn.name(), at::RecordFunction::currentThreadId(), - config_.state == ProfilerState::CUDA, - handle, + record_cuda, + fn.handle(), std::move(shapes), at::RecordFunction::getDefaultNodeId()); - evt.setSequenceNr(sequence_nr); + evt.setSequenceNr(fn.seqNr()); + evt.setFwdThreadId(fn.forwardThreadId()); + evt.setScope((uint8_t)fn.scope()); +#ifndef C10_MOBILE + // backward nodes source range corresponds to the forward node + // TODO: consider using C++ stack trace + if (config_.with_stack && fn.scope() != at::RecordScope::BACKWARD_FUNCTION) { + auto cs = prepareCallstack(jit::currentCallstack()); + if (cs.empty()) { + cs = prepareCallstack(jit::tracer::pythonCallstack()); + } + evt.setStack(callstackStr(cs)); + } +#endif getEventList().record(std::move(evt)); } } - void popRange(uint64_t thread_id, at::RecordFunctionHandle handle) { + void popRange(const at::RecordFunction& fn, const bool record_cuda) { if (config_.state == ProfilerState::Disabled) { return; } @@ -237,13 +273,14 @@ struct ProfilerThreadLocalState // called on a different thread than pushRange // As a convention, we put the async pop on the original // thread and save current thread id in pop event - Event evt(EventKind::PopRange, + Event evt( + EventKind::PopRange, at::StringView(""), at::RecordFunction::currentThreadId(), - config_.state == ProfilerState::CUDA, - handle); + record_cuda, + fn.handle()); evt.setNodeId(at::RecordFunction::getDefaultNodeId()); - getEventList(thread_id).record(std::move(evt)); + getEventList(fn.threadId()).record(std::move(evt)); } } @@ -256,7 +293,9 @@ struct ProfilerThreadLocalState } void reportMemoryUsage( - void* /* unused */, int64_t alloc_size, c10::Device device) override { + void* /* unused */, + int64_t alloc_size, + c10::Device device) override { if (config_.profile_memory && config_.state != ProfilerState::Disabled) { uint64_t thread_id = at::RecordFunction::currentThreadId(); Event evt( @@ -274,6 +313,34 @@ struct ProfilerThreadLocalState } private: + std::vector prepareCallstack(const std::vector& cs) { + std::vector entries; + entries.reserve(cs.size()); + for (const auto& entry : cs) { + auto& range = entry.range; + if (range.source()) { + auto& src = range.source(); + if (src && src->filename()) { + auto line = src->starting_line_no() + + src->lineno_for_offset(range.start()); + entries.emplace_back(FileLineFunc{*(src->filename()), line, entry.filename}); + } + } + } + return entries; + } + + std::vector callstackStr(const std::vector& cs) { + std::vector cs_str; + cs_str.reserve(cs.size()); + for (const auto& entry : cs) { + std::stringstream loc; + loc << entry.filename << "(" << entry.line << "): " << entry.funcname; + cs_str.push_back(loc.str()); + } + return cs_str; + } + std::string getNvtxStr( const at::StringView& name, const char* msg, @@ -281,8 +348,15 @@ struct 
ProfilerThreadLocalState const std::vector>& shapes) const { if (sequence_nr >= 0 || shapes.size() > 0) { std::stringstream s; +#ifdef __HIP_PLATFORM_HCC__ + s << name.str(); +#endif if (sequence_nr >= 0) { +#ifdef __HIP_PLATFORM_HCC__ + s << msg << sequence_nr; +#else s << name.str() << msg << sequence_nr; +#endif } if (shapes.size() > 0) { s << ", sizes = ["; @@ -332,7 +406,7 @@ struct ProfilerThreadLocalState std::unordered_map> event_lists_map_; - ProfilerConfig config_ = ProfilerConfig(ProfilerState::Disabled, false, false); + ProfilerConfig config_ = ProfilerConfig(ProfilerState::Disabled); at::CallbackHandle handle_ = 0; c10::optional>> remoteProfiledEvents_; }; @@ -351,6 +425,11 @@ void pushProfilingCallbacks() { if (!state_ptr || state_ptr->config().state == ProfilerState::Disabled) { return; } + bool record_cuda = + state_ptr->config().state == ProfilerState::CUDA; + if (record_cuda && disable_cuda_profiling.find(fn.name().str()) != disable_cuda_profiling.end()) { + record_cuda = false; + } auto* msg = (fn.seqNr() >= 0) ? ", seq = " : ""; if (state_ptr->config().report_input_shapes) { @@ -368,10 +447,9 @@ void pushProfilingCallbacks() { inputSizes.emplace_back(); } } - state_ptr->pushRange( - fn.name(), msg, fn.seqNr(), std::move(inputSizes), fn.handle()); + state_ptr->pushRange(fn, record_cuda, msg, std::move(inputSizes)); } else { - state_ptr->pushRange(fn.name(), msg, fn.seqNr(), {}, fn.handle()); + state_ptr->pushRange(fn, record_cuda, msg); } }, [](const at::RecordFunction& fn) { @@ -379,7 +457,12 @@ void pushProfilingCallbacks() { if (!state_ptr || state_ptr->config().state == ProfilerState::Disabled) { return; } - state_ptr->popRange(fn.getStartCallbacksThreadId(), fn.handle()); + bool record_cuda = + state_ptr->config().state == ProfilerState::CUDA; + if (record_cuda && disable_cuda_profiling.find(fn.name().str()) != disable_cuda_profiling.end()) { + record_cuda = false; + } + state_ptr->popRange(fn, record_cuda); }) .needsInputs(state_ptr->config().report_input_shapes) .needsIds(true)); @@ -388,9 +471,6 @@ void pushProfilingCallbacks() { const int kCUDAWarmupStart = 5; -// temp. workaround for dispatcher ::Profiler key -thread_local std::vector> g_; - } // namespace void registerCUDAMethods(CUDAStubs* stubs) { @@ -445,12 +525,10 @@ void enableProfiler(const ProfilerConfig& new_config) { auto state_ptr = getProfilerTLSState(); TORCH_CHECK(!state_ptr, "Profiler is already enabled on this thread"); - auto state = std::make_shared(new_config); c10::ThreadLocalDebugInfo::_push(c10::DebugInfoKind::PROFILER_STATE, state); pushProfilingCallbacks(); - g_.emplace_back(std::make_shared()); if (new_config.state == ProfilerState::CUDA) { // event recording appears to have some startup overhead, so we need to @@ -472,22 +550,31 @@ void enableProfiler(const ProfilerConfig& new_config) { state->mark("__start_profile", false); } -thread_event_lists disableProfiler() { +thread_event_lists disableProfiler(c10::optional profilerDisableOptions) { + auto cleanupTLSState = profilerDisableOptions ? profilerDisableOptions->cleanupTLSState : true; + auto consolidate = profilerDisableOptions ? 
profilerDisableOptions->consolidate : true; // all the DebugInfoBase objects are scope based and supposed to use DebugInfoGuard - auto state = c10::ThreadLocalDebugInfo::_pop(c10::DebugInfoKind::PROFILER_STATE); + std::shared_ptr state; + if (cleanupTLSState) { + state = c10::ThreadLocalDebugInfo::_pop(c10::DebugInfoKind::PROFILER_STATE); + } else { + state = c10::ThreadLocalDebugInfo::_peek(c10::DebugInfoKind::PROFILER_STATE); + } + auto state_ptr = static_cast(state.get()); TORCH_CHECK(state_ptr && state_ptr->config().state != ProfilerState::Disabled, "Can't disable profiler when it's not running"); - g_.pop_back(); - at::removeCallback(state_ptr->callbackHandle()); + if (cleanupTLSState) { + at::removeCallback(state_ptr->callbackHandle()); + } - if (state_ptr->config().state == ProfilerState::NVTX) { + if (!consolidate || state_ptr->config().state == ProfilerState::NVTX) { return thread_event_lists(); } state_ptr->mark("__stop_profile"); - + // Note that this will erase the underlying events. return state_ptr->consolidate(); } @@ -516,6 +603,30 @@ void Event::record(bool record_cuda) { NUM_EVENT_IVALUE_IDX, " elements to reconstruct Event."); + // Reconstruct input shapes from ivalues. + auto shapeListIValue = ivalues.get(EventIValueIdx::SHAPES); + TORCH_INTERNAL_ASSERT( + shapeListIValue.isList(), + "Expected profiler shapes IValue to contain type c10::impl::GenericList." + ); + + auto shapeList = shapeListIValue.toList(); + std::vector> shapes; + shapes.reserve(shapeList.size()); + for (size_t i = 0 ; i < shapeList.size(); ++i) { + std::vector s; + auto shapeIValue = shapeList.get(i); + TORCH_INTERNAL_ASSERT( + shapeIValue.isList(), + "Expected each profiler shape element to contain shapes of type c10::impl::GenericList.") + auto curShapesList = shapeIValue.toList(); + s.reserve(curShapesList.size()); + for (size_t j = 0; j < curShapesList.size(); ++j) { + s.emplace_back(curShapesList.get(j).toInt()); + } + shapes.emplace_back(s); + } + Event evt( static_cast( ivalues.get(EventIValueIdx::KIND).toInt()), // EventKind @@ -523,7 +634,7 @@ void Event::record(bool record_cuda) { ivalues.get(EventIValueIdx::THREAD_ID).toInt(), // thread_id static_cast( ivalues.get(EventIValueIdx::HANDLE).toDouble()), // handle - {}, // TODO: record shapes + std::move(shapes), // input shapes ivalues.get(EventIValueIdx::NODE_ID).toInt(), // node id true, // is remote ivalues.get(EventIValueIdx::CPU_MEM_USAGE).toInt(), // cpu_mem_usage @@ -541,22 +652,35 @@ at::IValue Event::toIValue() const { eventIValueList.reserve(NUM_EVENT_IVALUE_IDX); eventIValueList.emplace_back(static_cast(kind_)); eventIValueList.emplace_back(std::string(name_.str())); - eventIValueList.emplace_back(thread_id_); + eventIValueList.emplace_back(static_cast(thread_id_)); eventIValueList.emplace_back(static_cast(handle_)); eventIValueList.emplace_back(node_id_); eventIValueList.emplace_back(cpu_memory_usage_); eventIValueList.emplace_back(cpu_ns_); // CUDA event information - bool cuda_profiling_enabled = has_cuda(); + bool cuda_profiling_enabled = hasCuda(); eventIValueList.emplace_back(cuda_profiling_enabled); eventIValueList.emplace_back(static_cast(cuda_memory_usage_)); eventIValueList.emplace_back(device_); eventIValueList.emplace_back(cuda_us_); + // Shapes + c10::impl::GenericList shapesList = + c10::impl::GenericList(at::ListType::create(at::IntType::get())); + shapesList.reserve(shapes_.size()); + for (const auto& shape : shapes_) { + c10::impl::GenericList s = c10::impl::GenericList(at::IntType::get()); + 
s.reserve(shape.size()); + for (const auto& k : shape) { + s.emplace_back(k); + } + shapesList.emplace_back(s); + } + eventIValueList.emplace_back(shapesList); return at::IValue(eventIValueList); } -double Event::cuda_elapsed_us(const Event& e) const { - TORCH_CHECK(e.has_cuda() && has_cuda(), "Events were not recorded for CUDA"); +double Event::cudaElapsedUs(const Event& e) const { + TORCH_CHECK(e.hasCuda() && hasCuda(), "Events were not recorded for CUDA"); TORCH_CHECK( e.device() == device(), c10::str( @@ -605,22 +729,22 @@ void writeProfilerEventsToStream(std::ostream& out, const std::vector& e bool first = true; for (Event* evt : events) { if (evt->kind() == "push") { - events_map[std::make_pair(evt->handle(), evt->node_id())] = evt; + events_map[std::make_pair(evt->handle(), evt->nodeId())] = evt; } else if (evt->kind() == "pop") { if (!first) { out << ",\n"; } first = false; - auto it = events_map.find(std::make_pair(evt->handle(), evt->node_id())); + auto it = events_map.find(std::make_pair(evt->handle(), evt->nodeId())); TORCH_CHECK(it != events_map.end(), "Unmatched pop event"); Event* evt_start = it->second; events_map.erase(it); jit::TemplateEnv env; env.s("name", evt_start->name()); - env.d("ts", profiler_start->cpu_elapsed_us(*evt_start)); - env.d("dur", evt_start->cpu_elapsed_us(*evt)); - env.d("tid", evt_start->thread_id()); + env.d("ts", profiler_start->cpuElapsedUs(*evt_start)); + env.d("dur", evt_start->cpuElapsedUs(*evt)); + env.d("tid", evt_start->threadId()); out << event_template.format(env); } } @@ -639,10 +763,7 @@ RecordProfile::RecordProfile(const std::string& filename) } void RecordProfile::init() { - enableProfiler(ProfilerConfig( - ProfilerState::CPU, - /* report_input_shapes */ false, - /* profile_memory */ false)); + enableProfiler(ProfilerConfig(ProfilerState::CPU)); } RecordProfile::~RecordProfile() { diff --git a/torch/csrc/autograd/profiler.h b/torch/csrc/autograd/profiler.h index 3f962eff341d..9cfe9ea1fd6e 100644 --- a/torch/csrc/autograd/profiler.h +++ b/torch/csrc/autograd/profiler.h @@ -88,6 +88,21 @@ inline int64_t getTime() { #endif } +// A struct to control settings of disableProfiler options. +struct TORCH_API ProfilerDisableOptions { + ProfilerDisableOptions() = default; + ProfilerDisableOptions(bool shouldCleanupTLSState, bool shouldConsolidate) + : cleanupTLSState(shouldCleanupTLSState), + consolidate(shouldConsolidate) {} + // Whether we should clean up profiler states that are thread local, such as + // ThreadLocalDebugInfo and thread local RecordFunction callbacks. + bool cleanupTLSState = true; + // Whether we should consolidate all currently recorded profiled events. If + // false, will not consolidate and other threads can continue to write to the + // event lists. + bool consolidate = true; +}; + enum class C10_API_ENUM ProfilerState { Disabled, CPU, // CPU-only profiling @@ -98,15 +113,18 @@ enum class C10_API_ENUM ProfilerState { struct TORCH_API ProfilerConfig { ProfilerConfig( ProfilerState state, - bool report_input_shapes, - bool profile_memory) + bool report_input_shapes = false, + bool profile_memory = false, + bool with_stack = false) : state(state), report_input_shapes(report_input_shapes), - profile_memory(profile_memory) {} + profile_memory(profile_memory), + with_stack(with_stack) {} ~ProfilerConfig(); ProfilerState state; bool report_input_shapes; bool profile_memory; + bool with_stack; // Returns IValues corresponding to ProfilerConfig struct, to be used for // serialization. 
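The new `ProfilerDisableOptions` struct and the defaulted `ProfilerConfig` arguments above allow the pattern sketched below. The type and function names come from this diff; the header path and the scenario of finishing consolidation on another thread are assumptions:

```
#include <torch/csrc/autograd/profiler.h>

using namespace torch::autograd::profiler;

void profile_region() {
  // report_input_shapes / profile_memory / with_stack now default to false,
  // so a state-only config covers the common case.
  enableProfiler(ProfilerConfig(ProfilerState::CPU,
                                /*report_input_shapes=*/true,
                                /*profile_memory=*/false,
                                /*with_stack=*/true));

  // ... run the code to be profiled ...

  // Keep the thread-local state and callbacks alive and skip consolidation,
  // e.g. when another thread is expected to finish the profile later.
  disableProfiler(ProfilerDisableOptions(/*shouldCleanupTLSState=*/false,
                                         /*shouldConsolidate=*/false));
}
```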
@@ -203,24 +221,29 @@ struct TORCH_API Event final { const char* name() const { return name_.str(); } - uint16_t thread_id() const { + + uint64_t threadId() const { return thread_id_; } + std::vector> shapes() const { return shapes_; } - double cpu_elapsed_us(const Event & e) const { + + double cpuElapsedUs(const Event& e) const { return (e.cpu_ns_ - cpu_ns_)/(1000.0); } - double cpu_us() const { + double cpuUs() const { return cpu_ns_ / (1000.0); } - double cuda_elapsed_us(const Event & e) const; - bool has_cuda() const { + double cudaElapsedUs(const Event& e) const; + + bool hasCuda() const { return cuda_event != nullptr || (isRemote() && device_ != -1); } + int device() const { return device_; } @@ -238,11 +261,11 @@ struct TORCH_API Event final { } } - int64_t cpu_memory_usage() const { + int64_t cpuMemoryUsage() const { return cpu_memory_usage_; } - int64_t cuda_memory_usage() const { + int64_t cudaMemoryUsage() const { return cuda_memory_usage_; } @@ -251,7 +274,7 @@ struct TORCH_API Event final { } // Node ID corresponding to this event. - int node_id( ) const { + int nodeId( ) const { return node_id_; } @@ -276,16 +299,41 @@ struct TORCH_API Event final { sequence_nr_ = sequence_nr; } - int64_t sequence_nr() const { + int64_t sequenceNr() const { return sequence_nr_; } + const std::vector& stack() const { + return stack_; + } + + void setStack(const std::vector& stack) { + stack_ = stack; + } + + uint64_t fwdThreadId() const { + return fwd_thread_id_; + } + + void setFwdThreadId(uint64_t fwd_thread_id) { + fwd_thread_id_ = fwd_thread_id; + } + + uint8_t scope() const { + return scope_; + } + + void setScope(uint8_t scope) { + scope_ = scope; + } + private: // signed to allow for negative intervals, initialized for safety. int64_t cpu_ns_ = 0; at::StringView name_; EventKind kind_; - uint16_t thread_id_; + uint64_t thread_id_; + uint64_t fwd_thread_id_; at::RecordFunctionHandle handle_ {0}; std::vector> shapes_; int64_t cpu_memory_usage_ = 0; @@ -296,6 +344,9 @@ struct TORCH_API Event final { bool is_remote_ = false; int64_t cuda_us_ = -1; int64_t sequence_nr_ = -1; + + std::vector stack_; + uint8_t scope_; }; // a linked-list of fixed sized vectors, to avoid @@ -341,7 +392,7 @@ using thread_event_lists = std::vector>; // NOTE: profiler mode is thread local, with automatic propagation // across thread boundary (e.g. at::launch tasks) TORCH_API void enableProfiler(const ProfilerConfig&); -TORCH_API thread_event_lists disableProfiler(); +TORCH_API thread_event_lists disableProfiler(c10::optional profilerDisableOptions = c10::nullopt); // adds profiledEvents to the current thread local recorded events. Each event // will be marked with node ID given by fromNodeId. 
TORCH_API void addEventList(std::vector&& profiledEvents); @@ -383,19 +434,27 @@ struct TORCH_API TLSProfilerGuard { explicit TLSProfilerGuard( const ProfilerConfig& cfg, c10::optional> - resultCallback = c10::nullopt) - : cb_(std::move(resultCallback)) { + resultCallback = c10::nullopt, + c10::optional profilerDisableOptions = + c10::nullopt) + : cb_(std::move(resultCallback)), + profilerDisableOptions_(std::move(profilerDisableOptions)) { enableProfiler(cfg); } ~TLSProfilerGuard() { - thread_event_lists event_lists = disableProfiler(); + thread_event_lists event_lists = disableProfiler(profilerDisableOptions_); if (cb_) { - (*cb_)(event_lists); + try { + (*cb_)(event_lists); + } catch (const std::exception& e) { + LOG(ERROR) << "Got error processing profiler events: " << e.what(); + } } } private: c10::optional> cb_; + const c10::optional profilerDisableOptions_; }; } // namespace profiler diff --git a/torch/csrc/autograd/python_engine.cpp b/torch/csrc/autograd/python_engine.cpp index f4c88225efc8..586e956a8549 100644 --- a/torch/csrc/autograd/python_engine.cpp +++ b/torch/csrc/autograd/python_engine.cpp @@ -167,10 +167,6 @@ PyObject *THPEngine_run_backward(THPEngine *self, PyObject *args, PyObject *kwar "vmapped tensors (output ", i, " is being vmapped over). Please " "call autograd.grad() outside torch.vmap or file a bug report " "with your use case.") - if(variable.is_complex()) { - TORCH_WARN_ONCE("Complex backward is not fully supported yet and could lead to wrong ", - "gradients for functions we have not fixed yet"); - } auto gradient_edge = torch::autograd::impl::gradient_edge(variable); THPUtils_assert(gradient_edge.function, "element %d of tensors does not require grad and does not have a grad_fn", i); diff --git a/torch/csrc/autograd/python_variable.cpp b/torch/csrc/autograd/python_variable.cpp index 81e10a9a1d1b..28f9c3880d88 100644 --- a/torch/csrc/autograd/python_variable.cpp +++ b/torch/csrc/autograd/python_variable.cpp @@ -388,19 +388,19 @@ PyObject *THPVariable_get_ndim(THPVariable *self, void *unused) END_HANDLE_TH_ERRORS } -PyObject *THPVariable_get_names(THPVariable *self, void *unused) +PyObject *THPVariable_get_names(PyObject *self, void *unused) { HANDLE_TH_ERRORS - if (check_has_torch_function((PyObject *)self)) { - return handle_torch_function_getter(self, "names"); + if (check_has_torch_function(self)) { + return handle_torch_function_getter((THPVariable*)self, "names"); } // The long-term plan is to return a list of (python) torch.Dimname. // However, for now, return a list of string. 
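The `TLSProfilerGuard` changes above (optional result callback, optional disable options, and callback errors that are now logged instead of escaping the destructor) support an RAII usage along these lines. This is a sketch; the callback parameter is assumed to be a `std::function` over `const thread_event_lists&`:

```
#include <torch/csrc/autograd/profiler.h>

void guarded_profile() {
  using namespace torch::autograd::profiler;
  TLSProfilerGuard guard(
      ProfilerConfig(ProfilerState::CPU),
      [](const thread_event_lists& lists) {
        // Consume the consolidated events; if this throws, the destructor
        // logs the error rather than letting it propagate during unwinding.
      });
  // ... profiled work; disableProfiler() runs in the guard's destructor ...
}
```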
- size_t size = self->cdata.dim(); + size_t size = ((THPVariable *)self)->cdata.dim(); THPObjectPtr tuple(PyTuple_New(size)); if (!tuple) throw python_error(); - const auto dimnames = self->cdata.names(); + const auto dimnames = ((THPVariable *)self)->cdata.names(); for (size_t i = 0; i < size; ++i) { PyObject* str; if (dimnames[i].type() == at::NameType::WILDCARD) { @@ -423,12 +423,12 @@ PyObject *THPVariable_get_names(THPVariable *self, void *unused) END_HANDLE_TH_ERRORS } -int THPVariable_set_names(THPVariable *self, PyObject *names) { +int THPVariable_set_names(PyObject *self, PyObject *names) { HANDLE_TH_ERRORS - if (check_has_torch_function((PyObject *)self)) { - return handle_torch_function_setter(self, "names", names); + if (check_has_torch_function(self)) { + return handle_torch_function_setter((THPVariable*)self, "names", names); } - auto& var = self->cdata; + auto& var = ((THPVariable *)self)->cdata; if (names == Py_None) { at::internal_set_names_inplace(var, at::nullopt); } else { diff --git a/torch/csrc/cuda/python_nccl.cpp b/torch/csrc/cuda/python_nccl.cpp index 403bcb2b85da..35dbeae3f3aa 100644 --- a/torch/csrc/cuda/python_nccl.cpp +++ b/torch/csrc/cuda/python_nccl.cpp @@ -199,7 +199,9 @@ PyObject* THCPModule_nccl_broadcast(PyObject* self, PyObject* args) { nullptr, "nccl_broadcast", 1, - "(sequence[Tensor] inputs, int root)"); + "(sequence[Tensor] inputs, int root" + " sequence[torch.cuda.Stream] streams," + " sequence[torch.cuda.nccl.Communicator] comms)"); return nullptr; } @@ -228,7 +230,9 @@ PyObject* THCPModule_nccl_all_gather(PyObject* self, PyObject* args) { nullptr, "nccl_all_gather", 1, - "(sequence[Tensor] inputs, sequence[Tensor] outputs"); + "(sequence[Tensor] inputs, sequence[Tensor] outputs" + " sequence[torch.cuda.Stream] streams," + " sequence[torch.cuda.nccl.Communicator] comms)"); return nullptr; } @@ -258,7 +262,9 @@ PyObject* THCPModule_nccl_reduce_scatter(PyObject* self, PyObject* args) { nullptr, "nccl_reduce_scatter", 1, - "(sequence[Tensor] inputs, sequence[Tensor] outputs, int op"); + "(sequence[Tensor] inputs, sequence[Tensor] outputs, int op" + " sequence[torch.cuda.Stream] streams," + " sequence[torch.cuda.nccl.Communicator] comms)"); return nullptr; } diff --git a/torch/csrc/cuda/shared/cudart.cpp b/torch/csrc/cuda/shared/cudart.cpp index efada16a49c8..a8f80a35855d 100644 --- a/torch/csrc/cuda/shared/cudart.cpp +++ b/torch/csrc/cuda/shared/cudart.cpp @@ -29,7 +29,12 @@ void initCudartBindings(PyObject* module) { cudart.def("cuda" "GetErrorString", cudaGetErrorString); cudart.def("cuda" "ProfilerStart", cudaProfilerStart); cudart.def("cuda" "ProfilerStop", cudaProfilerStop); - cudart.def("cuda" "HostRegister", cudaHostRegister); + cudart.def("cuda" "HostRegister", [](uintptr_t ptr, size_t size, unsigned int flags) -> cudaError_t { + return cudaHostRegister((void*)ptr, size, flags); + }); + cudart.def("cuda" "HostUnregister", [](uintptr_t ptr) -> cudaError_t { + return cudaHostUnregister((void*)ptr); + }); #ifndef __HIP_PLATFORM_HCC__ cudart.def("cuda" "ProfilerInitialize", cudaProfilerInitialize); #endif diff --git a/torch/csrc/distributed/autograd/init.cpp b/torch/csrc/distributed/autograd/init.cpp index 9ab16fb6a93c..09de7abb87a5 100644 --- a/torch/csrc/distributed/autograd/init.cpp +++ b/torch/csrc/distributed/autograd/init.cpp @@ -15,7 +15,7 @@ namespace { template using shared_ptr_class_ = py::class_>; -PyObject* dist_autograd_init(PyObject* /* unused */) { +PyObject* dist_autograd_init(PyObject* _unused, PyObject* noargs) { auto 
autograd_module = THPObjectPtr(PyImport_ImportModule("torch.distributed.autograd")); if (!autograd_module) { @@ -196,7 +196,7 @@ Example:: static PyMethodDef methods[] = { // NOLINT {"_dist_autograd_init", - (PyCFunction)dist_autograd_init, + dist_autograd_init, METH_NOARGS, nullptr}, {nullptr, nullptr, 0, nullptr}}; diff --git a/torch/csrc/distributed/autograd/utils.cpp b/torch/csrc/distributed/autograd/utils.cpp index 726cc605a913..464d8248d8a4 100644 --- a/torch/csrc/distributed/autograd/utils.cpp +++ b/torch/csrc/distributed/autograd/utils.cpp @@ -143,7 +143,8 @@ std::shared_ptr sendMessageWithAutograd( const WorkerInfo& dst, torch::distributed::rpc::Message&& wrappedRpcMsg, bool forceGradRecording, - const float rpcTimeoutSeconds) { + const float rpcTimeoutSeconds, + bool forceDisableProfiling) { auto msg = getMessageWithAutograd( dst.id_, std::move(wrappedRpcMsg), @@ -153,7 +154,7 @@ std::shared_ptr sendMessageWithAutograd( std::shared_ptr fut; // If profiler is enabled, wrap this message with profiling metadata that will // tell the remote end to process this request with the profiler enabled. - if (torch::autograd::profiler::profilerEnabled()) { + if (!forceDisableProfiling && torch::autograd::profiler::profilerEnabled()) { auto profilerConfig = torch::autograd::profiler::getProfilerConfig(); auto msgWithProfiling = getMessageWithProfiling( std::move(msg), diff --git a/torch/csrc/distributed/autograd/utils.h b/torch/csrc/distributed/autograd/utils.h index c6316378a146..2a0a066e1a95 100644 --- a/torch/csrc/distributed/autograd/utils.h +++ b/torch/csrc/distributed/autograd/utils.h @@ -51,7 +51,8 @@ sendMessageWithAutograd( const rpc::WorkerInfo& dst, rpc::Message&& wrappedRpcMsg, bool forceGradRecording = false, - const float rpcTimeoutSeconds = torch::distributed::rpc::kUnsetRpcTimeout); + const float rpcTimeoutSeconds = torch::distributed::rpc::kUnsetRpcTimeout, + bool forceDisableProfiling = false); } // namespace autograd } // namespace distributed diff --git a/torch/csrc/distributed/c10d/c10d_frontend.h b/torch/csrc/distributed/c10d/c10d_frontend.h new file mode 100644 index 000000000000..9ff4b69999c7 --- /dev/null +++ b/torch/csrc/distributed/c10d/c10d_frontend.h @@ -0,0 +1,86 @@ +#pragma once + +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace c10d { + +class Backend { + public: + // Maps to Backend.__new__ in Python. + static std::string get(std::string); + + // TODO: How to support registering third_party backend? + static void registerBackend(); + + private: + // TODO: Should this be an enum list instead since this set doesn't + // change at all. 
+ std::unordered_set registered_backends_; +}; + +class DistributedC10d{ + public: + void initProcessGroup( + const std::string& backend, + const std::string& init_method, + const std::chrono::milliseconds& timeout, + int64_t world_size, + int64_t rank, + std::shared_ptr store, + const std::string& group_name); + + void destroyProcessGroup(std::shared_ptr group); + int64_t getRank(std::shared_ptr group); + int64_t getWorldSize(std::shared_ptr group); + + ProcessGroup::Work isend(at::Tensor tensor, int64_t dst, std::shared_ptr group, c10::optional tag); + ProcessGroup::Work irecv(at::Tensor tensor, int64_t src, std::shared_ptr group, c10::optional tag); + + private: + DistributedC10d(){}; + + bool rankNotInGroup(std::shared_ptr group) const; + int64_t getGroupRank( + std::shared_ptr group, + const int64_t rank) const; + int64_t getGlobalRank( + std::shared_ptr group, + const int64_t global_rank) const; + void checkDefaultPg() const; + int64_t getGroupSize(std::shared_ptr group) const; + int64_t getBackend(std::shared_ptr group); + + std::string backend_; + // TODO: Ask Alex what kind of equality we need. It determine whether we + // need to use ProcessGroup or ProcesGroup* as key. + std::unordered_map< + std::shared_ptr, + std::pair, std::shared_ptr>> + pg_map_; + + // Note, this is different mapping relationship than original Python + // implementation. + std::unordered_map, std::string> pg_names_; + + // Value is global_rank:group_rank mapping. + std::unordered_map, std::vector> + pg_group_ranks_; + + std::shared_ptr default_pg_; + + // Default value should be "env://" + std::string default_pg_init_method_; + + int64_t group_count_; +}; + + +} // namespace c10d diff --git a/torch/csrc/distributed/c10d/comm.h b/torch/csrc/distributed/c10d/comm.h index e2b501f08aff..2eb626c40232 100644 --- a/torch/csrc/distributed/c10d/comm.h +++ b/torch/csrc/distributed/c10d/comm.h @@ -38,7 +38,7 @@ class GradBucket { // DDP's c10d reducer allows communication hooks defined as a sub class // of CommHookInterface. CommHookInterface is an abstract class and can // be used to implement both Python and CPP hooks. -struct TORCH_API CommHookInterface { +struct TORCH_PYTHON_API CommHookInterface { public: virtual ~CommHookInterface() {} @@ -59,7 +59,7 @@ struct TORCH_API CommHookInterface { // PythonCommHook enables registering a python hook to c10d reducer and is a // sub class of CommHookInterface. -class TORCH_API PythonCommHook : public CommHookInterface { +class TORCH_PYTHON_API PythonCommHook : public CommHookInterface { public: // The constructor takes a state and a callable hook. Inputs are Python // objects. 
The state is passed to the hook in runHook function can be used to diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index aff2da31c133..d15ea9d23412 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -1,7 +1,11 @@ #include #include +#ifndef _WIN32 #include +#include +#include +#endif #include #ifdef USE_C10D_GLOO @@ -17,8 +21,6 @@ #endif #include -#include -#include #include #include @@ -92,6 +94,14 @@ class PythonStore : public ::c10d::Store { PYBIND11_OVERLOAD_PURE(int64_t, ::c10d::Store, add, key, value); } + int64_t getNumKeys() override { + PYBIND11_OVERLOAD_PURE(int64_t, ::c10d::Store, getNumKeys); + } + + bool deleteKey(const std::string& key) override { + PYBIND11_OVERLOAD_PURE(bool, ::c10d::Store, deleteKey, key); + } + bool check(const std::vector& keys) override { PYBIND11_OVERLOAD_PURE(bool, ::c10d::Store, check, keys); } @@ -121,7 +131,7 @@ void _register_comm_hook( std::move(state), std::move(comm_hook))); }; -PyObject* c10d_init(PyObject* _unused) { +PyObject* c10d_init(PyObject* _unused, PyObject* noargs) { C10_LOG_API_USAGE_ONCE("c10d.python.import"); auto c10d_module = THPObjectPtr(PyImport_ImportModule("torch.distributed")); if (!c10d_module) { @@ -159,6 +169,7 @@ PyObject* c10d_init(PyObject* _unused) { std::shared_ptr<::c10d::ProcessGroup>, std::vector>, int64_t, + bool, bool>(), py::arg("replicas"), py::arg("bucket_indices"), @@ -166,6 +177,7 @@ PyObject* c10d_init(PyObject* _unused) { py::arg("expect_sparse_gradients") = std::vector>(), py::arg("bucket_bytes_cap") = ::c10d::kDefaultBucketBytesCap, py::arg("find_unused_parameters") = false, + py::arg("gradient_as_bucket_view") = false, py::call_guard()) .def( "initialize_buckets", @@ -272,7 +284,12 @@ They are used in specifying strategies for reduction collectives, e.g., auto store = py::class_<::c10d::Store, std::shared_ptr<::c10d::Store>, PythonStore>( - module, "Store") + module, "Store", + R"( +Base class for all store implementations, such as the 3 provided by PyTorch +distributed: (:class:`~torch.distributed.TCPStore`, :class:`~torch.distributed.FileStore`, +and :class:`~torch.distributed.HashStore`). +)") // Default constructor. .def(py::init<>()) // Convert from std::string to std::vector. @@ -284,7 +301,23 @@ They are used in specifying strategies for reduction collectives, e.g., std::vector value_(value.begin(), value.end()); store.set(key, value_); }, - py::call_guard()) + py::call_guard(), + R"( +Inserts the key-value pair into the store based on the supplied ``key`` and +``value``. If ``key`` already exists in the store, it will overwrite the old +value with the new supplied ``value``. + +Arguments: + key (str): The key to be added to the store. + value (str): The value associated with ``key`` to be added to the store. + +Example:: + >>> import torch.distributed as dist + >>> store = dist.TCPStore("127.0.0.1", 0, true, timedelta(seconds=30)) + >>> store.set("first_key", "first_value") + >>> # Should return "first_value" + >>> store.get("first_key") +)") // Convert from std::vector to py::bytes. // The returned value is not guaranteed to be valid UTF-8. .def( @@ -294,21 +327,141 @@ They are used in specifying strategies for reduction collectives, e.g., return py::bytes( reinterpret_cast(value.data()), value.size()); }, - py::call_guard()) + py::call_guard(), + R"( +Retrieves the value associated with the given ``key`` in the store. 
If ``key`` is not +present in the store, the function will wait for ``timeout``, which is defined +when initializing the store, before throwing an exception. + +Arguments: + key (str): The function will return the value associated with this key. + +Returns: + Value associated with ``key`` if ``key`` is in the store. + +Example:: + >>> import torch.distributed as dist + >>> store = dist.TCPStore("127.0.0.1", 0, true, timedelta(seconds=30)) + >>> store.set("first_key", "first_value") + >>> # Should return "first_value" + >>> store.get("first_key") +)") .def( "add", &::c10d::Store::add, - py::call_guard()) + py::call_guard(), + R"( +The first call to add for a given ``key`` creates a counter associated +with ``key`` in the store, initialized to ``amount``. Subsequent calls to add +with the same ``key`` increment the counter by the specified ``amount``. +Calling :meth:`~torch.distributed.store.add` with a key that has already +been set in the store by :meth:`~torch.distributed.store.set` will result +in an exception. + +Arguments: + key (str): The key in the store whose counter will be incremented. + amount (int): The quantity by which the counter will be incremented. + +Example:: + >>> import torch.distributed as dist + >>> # Using TCPStore as an example, other store types can also be used + >>> store = dist.TCPStore("127.0.0.1", 0, true, timedelta(seconds=30)) + >>> store.add("first_key", 1) + >>> store.add("first_key", 6) + >>> # Should return 7 + >>> store.get("first_key") +)") + .def( + "delete_key", + &::c10d::Store::deleteKey, + py::call_guard(), + R"( +Deletes the key-value pair associated with ``key`` from the store. Returns +`true` if the key was successfully deleted, and `false` if it was not. + +.. warning:: + The ``delete_key`` API is only supported by the :class:`~torch.distributed.TCPStore`. Using this API + with the :class:`~torch.distributed.FileStore` or :class:`~torch.distributed.HashStore` will result in an exception. + +Arguments: + key (str): The key to be deleted from the store + +Returns: + `true` if ``key`` was deleted, otherwise `false`. + +Example:: + >>> import torch.distributed as dist + >>> store = dist.TCPStore("127.0.0.1", 0, true, timedelta(seconds=30)) + >>> store.set("first_key") + >>> # This should return true + >>> store.delete_key("first_key") + >>> # This should return false + >>> store.delete_key("bad_key") +)") + .def( + "num_keys", + &::c10d::Store::getNumKeys, + py::call_guard(), + R"( +Returns the number of keys set in the store. Note that this number will typically +be one greater than the number of keys added by :meth:`~torch.distributed.store.set` +and :meth:`~torch.distributed.store.add` since one key is used to coordinate all +the workers using the store. + +.. warning:: + The ``num_keys`` API is only supported by the :class:`~torch.distributed.TCPStore`. Using this API + with the :class:`~torch.distributed.FileStore` or :class:`~torch.distributed.HashStore` will result in an exception. + +Returns: + The number of keys present in the store. + +Example:: + >>> import torch.distributed as dist + >>> store = dist.TCPStore("127.0.0.1", 0, true, timedelta(seconds=30)) + >>> store.set("first_key", "first_value") + >>> # This should return 2 + >>> store.num_keys() +)") .def( "set_timeout", &::c10d::Store::setTimeout, - py::call_guard()) + py::call_guard(), + R"( +Sets the store's default timeout. This timeout is used during initialization and in +:meth:`~torch.distributed.store.wait` and :meth:`~torch.distributed.store.get`. 
+ +Arguments: + timeout (timedelta): timeout to be set in the store. + +Example:: + >>> import torch.distributed as dist + >>> # Using TCPStore as an example, other store types can also be used + >>> store = dist.TCPStore("127.0.0.1", 0, true, timedelta(seconds=30)) + >>> store.set_timeout(timedelta(seconds=10)) + >>> # This will throw an exception after 10 seconds + >>> store.wait(["bad_key"]) +)") .def( "wait", [](::c10d::Store& store, const std::vector& keys) { store.wait(keys); }, - py::call_guard()) + py::call_guard(), + R"( +Waits for each key in ``keys`` to be added to the store. If not all keys are +set before the ``timeout`` (set during store initialization), then ``wait`` +will throw an exception. + +Arguments: + keys (list): List of keys on which to wait until they are set in the store. + +Example:: + >>> import torch.distributed as dist + >>> # Using TCPStore as an example, other store types can also be used + >>> store = dist.TCPStore("127.0.0.1", 0, true, timedelta(seconds=30)) + >>> # This will throw an exception after 30 seconds + >>> store.wait(["bad_key"]) +)") .def( "wait", [](::c10d::Store& store, @@ -316,15 +469,79 @@ They are used in specifying strategies for reduction collectives, e.g., const std::chrono::milliseconds& timeout) { store.wait(keys, timeout); }, - py::call_guard()); - - shared_ptr_class_<::c10d::FileStore>(module, "FileStore", store) + py::call_guard(), + R"( +Waits for each key in ``keys`` to be added to the store, and throws an exception +if the keys have not been set by the supplied ``timeout``. + +Arguments: + keys (list): List of keys on which to wait until they are set in the store. + timeout (timedelta): Time to wait for the keys to be added before throwing an exception. + +Example:: + >>> import torch.distributed as dist + >>> # Using TCPStore as an example, other store types can also be used + >>> store = dist.TCPStore("127.0.0.1", 0, true, timedelta(seconds=30)) + >>> # This will throw an exception after 10 seconds + >>> store.wait(["bad_key"], timedelta(seconds=10)) +)"); + + shared_ptr_class_<::c10d::FileStore>(module, "FileStore", store, + R"( +A store implementation that uses a file to store the underlying key-value pairs. + +Arguments: + file_name (str): path of the file in which to store the key-value pairs + world_size (int): The total number of processes using the store + +Example:: + >>> import torch.distributed as dist + >>> store1 = dist.FileStore("/tmp/filestore", 2) + >>> store2 = dist.FileStore("/tmp/filestore", 2) + >>> # Use any of the store methods from either the client or server after initialization + >>> store1.set("first_key", "first_value") + >>> store2.get("first_key") + + )") .def(py::init()); - shared_ptr_class_<::c10d::HashStore>(module, "HashStore", store) +#ifndef _WIN32 + shared_ptr_class_<::c10d::HashStore>(module, "HashStore", store, + R"( +A thread-safe store implementation based on an underlying hashmap. This store can be used +within the same process (for example, by other threads), but cannot be used across processes. + +Example:: + >>> import torch.distributed as dist + >>> store = dist.HashStore() + >>> # store can be used from other threads + >>> # Use any of the store methods after initialization + >>> store.set("first_key", "first_value") + )") .def(py::init<>()); - shared_ptr_class_<::c10d::TCPStore>(module, "TCPStore", store) + shared_ptr_class_<::c10d::TCPStore>(module, "TCPStore", store, + R"( +A TCP-based distributed key-value store implementation. 
The server store holds +the data, while the client stores can connect to the server store over TCP and +perform actions such as :meth:`~torch.distributed.store.set` to insert a key-value +pair, :meth:`~torch.distributed.store.get` to retrieve a key-value pair, etc. + +Arguments: + host_name (str): The hostname or IP Address the server store should run on. + port (int): The port on which the server store should listen for incoming requests. + world_size (int): The total number of store users (number of clients + 1 for the server). + is_master (bool): True when initializing the server store, False for client stores. + timeout (timedelta): Timeout used by the store during initialization and for methods such as :meth:`~torch.distributed.store.get` and :meth:`~torch.distributed.store.wait`. + +Example:: + >>> import torch.distributed as dist + >>> server_store = dist.TCPStore("127.0.0.1", 0, true, timedelta(seconds=30)) + >>> client_store = dist.TCPStore("127.0.0.1", 0, false) + >>> # Use any of the store methods from either the client or server after initialization + >>> server_store.set("first_key", "first_value") + >>> client_store.get("first_key") + )") .def( py::init< const std::string&, @@ -338,8 +555,18 @@ They are used in specifying strategies for reduction collectives, e.g., py::arg("is_master"), py::arg("timeout") = std::chrono::milliseconds(::c10d::Store::kDefaultTimeout)); +#endif - shared_ptr_class_<::c10d::PrefixStore>(module, "PrefixStore", store) + shared_ptr_class_<::c10d::PrefixStore>(module, "PrefixStore", store, + R"( +A wrapper around any of the 3 key-value stores (:class:`~torch.distributed.TCPStore`, +:class:`~torch.distributed.FileStore`, and :class:`~torch.distributed.HashStore`) +that adds a prefix to each key inserted to the store. + +Arguments: + prefix (str): The prefix string that is prepended to each key before being inserted into the store. + store (torch.distributed.store): A store object that forms the underlying key-value store. 
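+
+Any of the store implementations above can serve as the underlying store; the
+sketch below wraps a :class:`~torch.distributed.FileStore` purely for
+illustration.
+
+Example::
+    >>> import torch.distributed as dist
+    >>> underlying_store = dist.FileStore("/tmp/filestore", 2)
+    >>> prefix_store = dist.PrefixStore("worker0", underlying_store)
+    >>> # The key is prepended with the "worker0" prefix before being inserted
+    >>> # into the underlying FileStore.
+    >>> prefix_store.set("first_key", "first_value")
+    >>> prefix_store.get("first_key")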
+ )") .def(py::init>()); auto processGroup = @@ -605,6 +832,7 @@ They are used in specifying strategies for reduction collectives, e.g., py::arg("opts") = ::c10d::BarrierOptions(), py::call_guard()); +#ifndef _WIN32 module.def( "_round_robin_process_groups", [](std::vector> processGroups) @@ -618,6 +846,7 @@ They are used in specifying strategies for reduction collectives, e.g., }, py::arg("process_groups"), py::call_guard()); +#endif #ifdef USE_C10D_GLOO auto processGroupGloo = shared_ptr_class_<::c10d::ProcessGroupGloo>( @@ -720,6 +949,12 @@ They are used in specifying strategies for reduction collectives, e.g., .def(py::init<>()) .def_readwrite("is_high_priority", &::c10d::ProcessGroupNCCL::Options::isHighPriorityStream) .def_readwrite("op_timeout", &::c10d::ProcessGroupNCCL::Options::opTimeout); + processGroupNCCL.def_static("_group_start", []() { + ::c10d::ProcessGroupNCCL::groupStart(); + }); + processGroupNCCL.def_static("_group_end", []() { + ::c10d::ProcessGroupNCCL::groupEnd(); + }); #endif #ifdef USE_C10D_MPI diff --git a/torch/csrc/distributed/c10d/reducer.cpp b/torch/csrc/distributed/c10d/reducer.cpp index ac4e735af94a..90128e48ee1d 100644 --- a/torch/csrc/distributed/c10d/reducer.cpp +++ b/torch/csrc/distributed/c10d/reducer.cpp @@ -32,7 +32,8 @@ Reducer::Reducer( std::shared_ptr process_group, std::vector> expect_sparse_gradients, int64_t bucket_bytes_cap, - bool find_unused_parameters) + bool find_unused_parameters, + bool gradient_as_bucket_view) : replicas_(std::move(replicas)), process_group_(std::move(process_group)), expect_sparse_gradients_(std::move(expect_sparse_gradients)), @@ -41,6 +42,7 @@ Reducer::Reducer( next_bucket_(0), has_marked_unused_parameters_(false), find_unused_parameters_(find_unused_parameters), + gradient_as_bucket_view_(gradient_as_bucket_view), local_used_maps_reduced_(false), backward_stats_base_(0), has_rebuilt_bucket_(false), @@ -87,10 +89,7 @@ Reducer::Reducer( for (size_t variable_index = 0; variable_index < variable_count; variable_index++) { auto& variable = replicas_[replica_index][variable_index]; - const auto index = VariableIndex{ - .replica_index = replica_index, - .variable_index = variable_index, - }; + const auto index = VariableIndex(replica_index, variable_index); // The gradient accumulator function is lazily initialized once. // Therefore we can use its presence in the autograd graph as @@ -98,15 +97,19 @@ Reducer::Reducer( auto grad_accumulator = torch::autograd::impl::grad_accumulator(variable); +#ifndef _WIN32 using torch::distributed::autograd::ThreadLocalDistAutogradContext; +#endif // Hook to execute after the gradient accumulator has executed. hooks_.emplace_back( grad_accumulator->add_post_hook( torch::make_unique( [=](const torch::autograd::variable_list& outputs, const torch::autograd::variable_list& /* unused */) { +#ifndef _WIN32 this->rpc_context_.set( ThreadLocalDistAutogradContext::getContextPtr()); +#endif this->autograd_hook(index); return outputs; })), @@ -190,7 +193,7 @@ Reducer::Reducer( // used to override how DDP communicates gradients across ranks, this can be // used for algorithms like Gradient Compression/GossipGrad. This hook can be // registered from Python API using `register_comm_hook`. `PythonCommHook` -// enables registering a Python hook and is a sub class of `CommHookInterface`. +// enables registering a Python hook and is a subclass of `CommHookInterface`. // `CommHookInterface` can be used to implement CPP hooks in the future. 
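+// For illustration, an allreduce-style Python hook registered through this path
+// might look like the following sketch. The Python-side names used here (the
+// private `_register_comm_hook` method on the DDP model, the bucket's
+// `get_tensors()` accessor, and `Work.get_future()`) are assumptions for the
+// purpose of the example, not guaranteed API:
+//
+//   import torch.distributed as dist
+//
+//   def allreduce_hook(state, bucket):
+//       tensor = bucket.get_tensors()[0]   # flattened gradients of this bucket
+//       work = dist.all_reduce(tensor, async_op=True)
+//       return work.get_future()           # future the reducer waits on
+//
+//   ddp_model._register_comm_hook(state=None, hook=allreduce_hook)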
Reducer::~Reducer() noexcept(false) { @@ -310,6 +313,56 @@ void Reducer::verify_replica0_across_processes() { } } +void Reducer::check_grad_layout( + const at::Tensor& grad, + const at::Tensor& bucket_view) { + // Ensure that the gradient type matches the bucket type. + TORCH_CHECK( + grad.options().type_equal(bucket_view.options()), + "Expected ", + bucket_view.toString(), + ", got ", + grad.toString()); + TORCH_INTERNAL_ASSERT(grad.device() == bucket_view.device()); + TORCH_INTERNAL_ASSERT(grad.numel() == bucket_view.numel()); + // AccumulateGrad doesn't HAVE to obey the grad layout contract. + // The penalty for disobedience is reduced performance, not numerical + // death. Warnings here help diagnose poor DDP performance. + if (grad.strides() != bucket_view.strides()) { + TORCH_WARN_ONCE( + "Grad strides do not match bucket view strides. " + "This may indicate grad was not created according to the " + "gradient layout contract, or that the param's strides " + "changed since DDP was constructed. This is not an error, " + "but may impair performance.\n" + "grad.sizes() = ", + grad.sizes(), + ", strides() = ", + grad.strides(), + "\n", + "bucket_view.sizes() = ", + bucket_view.sizes(), + ", strides() = ", + bucket_view.strides()); + } + if (!gradient_as_bucket_view_) { + TORCH_INTERNAL_ASSERT(!grad.is_alias_of(bucket_view)); + } +} + +void Reducer::copy_grad_to_bucket(at::Tensor& grad, at::Tensor& bucket_view) { + // See Note [DDP Communication Hook] + if (comm_hook_ == nullptr) { + // imitates wrapped_scalar_tensor in ATen/native/BinaryOps.cpp + auto wrapped = c10::scalar_to_tensor(double(1.) / divFactor_); + wrapped.unsafeGetTensorImpl()->set_wrapped_number(true); + // Divides while copying into the bucket view. + at::native::mul_out(bucket_view, grad, wrapped); + } else { + bucket_view.copy_(grad); + } +} + void Reducer::mark_variable_ready_dense(VariableIndex index) { const auto replica_index = index.replica_index; const auto variable_index = index.variable_index; @@ -327,49 +380,27 @@ void Reducer::mark_variable_ready_dense(VariableIndex index) { // of the bucket it would otherwise hold. runGradCallbackForVariable(variable, [&](auto& grad) { if (grad.defined()) { - // Ensure that the gradient type matches the bucket type. - TORCH_CHECK( - grad.options().type_equal(bucket_view.options()), - "Expected ", - bucket_view.toString(), - ", got ", - grad.toString()); - // Assert that the grad tensor and the bucket don't share storage. - // If they did, we could avoid the copy altogether. - // The reason for not doing this is that existing code calls - // `detach_` from `zero_grad`, which is incompatible with views. - TORCH_INTERNAL_ASSERT(!grad.is_alias_of(bucket_view)); - TORCH_INTERNAL_ASSERT(grad.device() == bucket_view.device()); - TORCH_INTERNAL_ASSERT(grad.numel() == bucket_view.numel()); - // AccumulateGrad doesn't HAVE to obey the grad layout contract. - // The penalty for disobedience is reduced performance, not numerical - // death. Warnings here help diagnose poor DDP performance. - if (grad.strides() != bucket_view.strides()) { - TORCH_WARN_ONCE( - "Grad strides do not match bucket view strides. " - "This may indicate grad was not created according to the " - "gradient layout contract, or that the param's strides " - "changed since DDP was constructed. 
This is not an error, " - "but may impair performance.\n" - "grad.sizes() = ", - grad.sizes(), - ", strides() = ", - grad.strides(), - "\n", - "bucket_view.sizes() = ", - bucket_view.sizes(), - ", strides() = ", - bucket_view.strides()); - } - // See Note [DDP Communication Hook] - if (comm_hook_ == nullptr) { - // imitates wrapped_scalar_tensor in ATen/native/BinaryOps.cpp - auto wrapped = c10::scalar_to_tensor(double(1.) / divFactor_); - wrapped.unsafeGetTensorImpl()->set_wrapped_number(true); - // Divides while copying into the bucket view. - at::native::mul_out(bucket_view, grad, wrapped); + this->check_grad_layout(grad, bucket_view); + // When gradient_as_bucket_view_ is false, or even when + // gradient_as_bucket_view_ is true, in rare cases users may set grad to + // be None after every iteration. In these cases, grad and bucket_view are + // pointing to different storages and thus need to copy grads to + // bucket_view. If gradient_as_bucket_view_ is set as true, let grad point + // to bucket_view. If grad has already been set as views of buckets in + // previous iterations, no copy is needed. + if (!grad.is_alias_of(bucket_view)) { + this->copy_grad_to_bucket(grad, bucket_view); + if (gradient_as_bucket_view_) { + // Let grad point to bucket_view buffer. + grad = bucket_view; + // The grad is modified and need to be written back. + return true; + } } else { - bucket_view.copy_(grad); + // If grad and bucket view point to the same storage, no need to copy + if (comm_hook_ == nullptr) { + bucket_view.div_(divFactor_); + } } } else { bucket_view.zero_(); @@ -425,11 +456,9 @@ std::vector> Reducer::get_bucket_tensors() const { void Reducer::set_forward_pass_work_handle( std::shared_ptr forwardPassWorkHandle, - at::Tensor& tensor, bool useStaticWorldSize) { std::lock_guard lock(mutex_); forwardPassWorkHandle_.workHandle = std::move(forwardPassWorkHandle); - forwardPassWorkHandle_.resultTensor = tensor; forwardPassWorkHandle_.useStaticWorldSize = useStaticWorldSize; } @@ -449,10 +478,7 @@ void Reducer::push_rebuilt_params_for_all_indices() { const auto variable_count = replicas_[replica_index].size(); for (size_t variable_index = 0; variable_index < variable_count; ++variable_index) { - const auto index = VariableIndex{ - .replica_index = replica_index, - .variable_index = variable_index, - }; + const auto index = VariableIndex(replica_index, variable_index); push_rebuilt_params(index); } } @@ -495,7 +521,7 @@ void Reducer::autograd_hook(VariableIndex index) { // rebuilt_param_indices_ based on gradient arriving order, and then at the // end of finalize_backward(), buckets will be rebuilt based on // rebuilt_params_ and rebuilt_param_indices_, and then will be broadcasted - // and initialized. Also we only need to dump tensors and parameter indcies of + // and initialized. Also we only need to dump tensors and parameter indices of // one replica. 
push_rebuilt_params(index); @@ -573,12 +599,13 @@ void Reducer::mark_variable_ready(VariableIndex index) { if (divFactor_ == kUnsetDivFactor) { divFactor_ = process_group_->getSize(); auto& workHandle = forwardPassWorkHandle_.workHandle; - if (workHandle) { - if (!forwardPassWorkHandle_.useStaticWorldSize) { - workHandle->wait(); - at::Tensor& res = forwardPassWorkHandle_.resultTensor; - divFactor_ = res.item().to(); - } + if (workHandle && !forwardPassWorkHandle_.useStaticWorldSize) { + workHandle->wait(); + auto results = workHandle->result(); + // Guard against the results being empty + TORCH_INTERNAL_ASSERT(results.size() > 0); + at::Tensor& res = results.front(); + divFactor_ = res.item().to(); } } @@ -675,6 +702,19 @@ void Reducer::mark_bucket_ready(size_t bucket_index) { void Reducer::initialize_buckets( std::vector> bucket_indices) { + // If initialize_buckets is called inside DDP constructor, then + // it does not matter rpc context ptr is nullptr or not, as grad + // will not be mutated. + // If initialize_buckets is called during training loop, e.g, inside + // rebuild_buckets(), since grad could be mutated and be pointed to + // bucket_view, then it needs to check rpc context ptr is nullptr or not, + // If rpc context ptr is nullptr, mutate variable.grad(); otherwise, + // mutate grad in rpc context. +#ifndef _WIN32 + using torch::distributed::autograd::ThreadLocalDistAutogradContext; + this->rpc_context_.set(ThreadLocalDistAutogradContext::getContextPtr()); +#endif + // This shouldn't be called if we're expecting autograd hooks to fire. TORCH_CHECK( !expect_autograd_hooks_, @@ -810,10 +850,8 @@ void Reducer::initialize_buckets( TORCH_CHECK( variable_index < variable_locators_.size(), "Out of range variable index specified."); - variable_locators_[variable_index] = VariableLocator{ - .bucket_index = bucket_index, - .intra_bucket_index = intra_bucket_index++, - }; + variable_locators_[variable_index] = VariableLocator( + bucket_index, intra_bucket_index++); } bucket.variable_indices = std::move(bucket_indices[bucket_index]); @@ -826,7 +864,7 @@ void Reducer::initialize_bucket_views( Reducer::BucketReplica& replica, at::Tensor& contents) { for (size_t i = 0; i < replica.variables.size(); i++) { - const auto& v = replica.variables[i]; + auto& v = replica.variables[i]; const auto offset = replica.offsets[i]; const auto length = replica.lengths[i]; if (v.is_non_overlapping_and_dense()) { @@ -845,6 +883,29 @@ void Reducer::initialize_bucket_views( // By default `bucket_views_out` and `bucket_views_in` are // essentially the same thing. replica.bucket_views_out = replica.bucket_views_in; + + // If gradient_as_bucket_view_ is set as true, then there are two cases to + // handle: initialize_bucket_views could be called inside initialize_buckets + // when rebuild_buckets, if grad has already been defined/calculated in + // previous iteration, old grad needs to be copied into new bucket_view and + // let grad point to the new bucket_view, initialize_bucket_views could also + // be called inside initialize_buckets during construction. Grads are not + // defined during construction time, in this case, do not let grad point to + // bucket_view, because grads should be kept as being undefined for globally + // unused parameters. 
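+  // On the Python side this behavior is expected to be toggled through the
+  // constructor flag that maps to gradient_as_bucket_view_ here, e.g.
+  // (illustrative sketch; the exact DistributedDataParallel keyword is an
+  // assumption):
+  //
+  //   model = torch.nn.parallel.DistributedDataParallel(
+  //       module, device_ids=[rank], gradient_as_bucket_view=True)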
+ if (gradient_as_bucket_view_) { + auto& bucket_view = replica.bucket_views_in.back(); + runGradCallbackForVariable(v, [&](auto& grad) { + if (grad.defined() && !grad.is_alias_of(bucket_view)) { + bucket_view.copy_(grad); + grad = bucket_view; + // The grad is modefied and needs to be written back. + return true; + } + // The grad is not modified and does not need to be written back. + return false; + }); + } } } @@ -966,6 +1027,31 @@ void Reducer::prepare_for_backward( } } +void Reducer::copy_bucket_to_grad( + torch::autograd::Variable& variable, + Reducer::BucketReplica& replica, + size_t intra_bucket_index, + bool global_unused) { + const auto& bucket_view = replica.bucket_views_out[intra_bucket_index]; + runGradCallbackForVariable(variable, [&](auto& grad) { + // If a parameter is globally unused, we keep its grad untouched. + if (!global_unused) { + if (!grad.defined()) { + // Creates grad according to the "Gradient Layout Contract" + // (see torch/csrc/grad/AccumulateGrad.h) + grad = + torch::autograd::utils::clone_obey_contract(bucket_view, variable); + } else { + grad.copy_(bucket_view); + } + // The grad is modified and needs to be written back. + return true; + } + // The grad is not modified. + return false; + }); +} + // A bucket with one or more dense tensors needs to be unflattened. void Reducer::finalize_bucket_dense(Bucket& bucket) { for (size_t replica_index = 0; replica_index < bucket.replicas.size(); @@ -1016,24 +1102,52 @@ void Reducer::finalize_bucket_dense(Bucket& bucket) { } } - const auto& bucket_view = replica.bucket_views_out[intra_bucket_index]; - runGradCallbackForVariable(variable, [&](auto& grad) { - // If a parameter is globally unused, we keep its grad untouched. - if (!global_unused) { - if (!grad.defined()) { - // Creates grad according to the "Gradient Layout Contract" - // (see torch/csrc/grad/AccumulateGrad.h) - grad = torch::autograd::utils::clone_obey_contract( - bucket_view, variable); - } else { - grad.copy_(bucket_view); - } - // The grad is modified and needs to be written back. - return true; + if (!gradient_as_bucket_view_) { + copy_bucket_to_grad( + variable, replica, intra_bucket_index, global_unused); + } else { + const auto& bucket_view_out = + replica.bucket_views_out[intra_bucket_index]; + auto& bucket_view_in = replica.bucket_views_in[intra_bucket_index]; + // If communication_hook is registered, bucket_view_out stores + // allreduced results in a newly allocated tensor, copy bucket_view_out + // back to bucket_view_in that referring to replica.content tensor and + // grad. + if (!bucket_view_in.is_alias_of(bucket_view_out)) { + bucket_view_in.copy_(bucket_view_out); } - // The grad is not modified. - return false; - }); + runGradCallbackForVariable(variable, [&](auto& grad) { + // If a parameter is globally unused, we keep its grad untouched. + if (!global_unused) { + // If grad is globally used but locally unused, let grad point to + // bucket_view_in + if (!grad.defined()) { + grad = bucket_view_in; + } else { + if (!grad.is_alias_of(bucket_view_in)) { + grad.copy_(bucket_view_in); + TORCH_WARN_ONCE( + "Detected at least one parameter gradient is not the " + "expected DDP bucket view when setting " + "gradient_as_bucket_view=True. This can happen when " + "multiple parameters sharing the same gradient. For " + "example, param0 and param1 share the same gradient " + "grad0. In this case, grad0 would first point to " + "bucket_view_in0 when param0 is ready. 
Later, when " + "param1 is ready, it will override grad0 to point to " + "bucket_view_in1. However, param0 still expects grad0 " + "to point to bucket_view_in0, and hence hit this " + "warning. If you saw this message, please double-check if " + "the above situation is expected for your application."); + } + } + // The grad is modified and needs to be written back. + return true; + } + // The grad is not modified. + return false; + }); + } } } } @@ -1119,7 +1233,9 @@ void Reducer::runGradCallbackForVariable( cb(variable.mutable_grad()); } else { // Under distributed autograd +#ifndef _WIN32 context_ptr->runGradCallbackForVariable(variable, std::move(cb)); +#endif } } @@ -1225,8 +1341,9 @@ bool Reducer::rebuild_buckets() { replicas_[0].size() == rebuilt_param_indices_.size(), c10::str( "rebuilt parameter indices size is not same as original model parameters size.", + "Original model param size is: ", replicas_[0].size(), - " versus ", + " versus rebuilt params size of: ", rebuilt_param_indices_.size())); std::vector> rebuilt_bucket_indices; std::vector bucket_size_limits; diff --git a/torch/csrc/distributed/c10d/reducer.h b/torch/csrc/distributed/c10d/reducer.h index 87ad60330af7..5a17dbe6f1c2 100644 --- a/torch/csrc/distributed/c10d/reducer.h +++ b/torch/csrc/distributed/c10d/reducer.h @@ -30,7 +30,8 @@ class Reducer { std::shared_ptr process_group, std::vector> expect_sparse_gradients, int64_t bucket_bytes_cap, - bool find_unused_parameters); + bool find_unused_parameters, + bool gradient_as_bucket_view); ~Reducer() noexcept(false); @@ -54,7 +55,7 @@ class Reducer { return backward_stats_; } - // Registeres a hook to the reducer. The hook is `CommHookInterface` + // Registers a hook to the reducer. The hook is `CommHookInterface` // type to allow both Python and CPP hooks. This function can only // be called once before calling backward. void register_comm_hook(std::unique_ptr iface); @@ -89,7 +90,6 @@ class Reducer { // corresponding tensor being reduced. void set_forward_pass_work_handle( std::shared_ptr forwardPassWorkHandle, - at::Tensor& tensor, bool useStaticWorldSize); // Retrieve on-device tensors used to track locally unused parameters. For @@ -104,6 +104,13 @@ class Reducer { struct VariableIndex { size_t replica_index; size_t variable_index; + + VariableIndex() = default; + + VariableIndex(size_t replica_index_, size_t variable_index_) { + replica_index = replica_index_; + variable_index = variable_index_; + } }; void push_rebuilt_params(const VariableIndex& index); @@ -125,6 +132,7 @@ class Reducer { bool has_marked_unused_parameters_; const bool find_unused_parameters_; + const bool gradient_as_bucket_view_; std::vector unused_parameters_; // Locally used parameter maps indicating if parameters are used locally // during the current iteration or no_sync session if no_sync is on. One @@ -180,7 +188,7 @@ class Reducer { // and on the same device can be batched. The tensor that represents the // flattened gradient uses the same type and is placed on the same device. // Buckets are filled as the gradients they hold are computed (triggered by - // autograd hooks). Buckets are reduced in a predetemined order that is + // autograd hooks). Buckets are reduced in a predetermined order that is // identical across processes. struct BucketReplica { // Flattened (1 dimensional) contents of bucket. @@ -231,6 +239,19 @@ class Reducer { // with the result of `future_work`. 
void populate_bucket_views_out(BucketReplica& replica, at::Tensor& tensor); + // If gradient_as_bucket_view_ is false, after allreduce buckets, + // copy bucket results back to grads. + void copy_bucket_to_grad( + torch::autograd::Variable& variable, + Reducer::BucketReplica& replica, + size_t intra_bucket_index, + bool global_unused); + // Check layout of grad and bucket_view before calling copy_grad_to_bucket + void check_grad_layout(const at::Tensor& grad, const at::Tensor& bucket_view); + // If gradient_as_bucket_view_ is false, before allreduce buckets, + // copy grads to buckets. + void copy_grad_to_bucket(at::Tensor& grad, at::Tensor& bucket_view); + // A bucket holds N bucket replicas (1 per model replica). // // If every bucket in this struct is ready, the reduction can be kicked off. @@ -267,6 +288,13 @@ class Reducer { size_t bucket_index; // Index of parameter in single bucket replica. size_t intra_bucket_index; + + VariableLocator() = default; + + VariableLocator(size_t bucket_index_, size_t intra_bucket_index_) { + bucket_index = bucket_index_; + intra_bucket_index = intra_bucket_index_; + } }; // Map the index of a variable to its location in the bucket structure. diff --git a/torch/csrc/distributed/rpc/init.cpp b/torch/csrc/distributed/rpc/init.cpp index f85adb88dc09..f0b31b5389d2 100644 --- a/torch/csrc/distributed/rpc/init.cpp +++ b/torch/csrc/distributed/rpc/init.cpp @@ -27,12 +27,11 @@ namespace rpc { namespace { constexpr std::chrono::milliseconds kDeleteAllUsersTimeout(100000); -constexpr float kSecToMsConversion = 1000; template using shared_ptr_class_ = py::class_>; -PyObject* rpc_init(PyObject* /* unused */) { +PyObject* rpc_init(PyObject* _unused, PyObject* noargs) { auto rpc_module = THPObjectPtr(PyImport_ImportModule("torch.distributed.rpc")); if (!rpc_module) { @@ -50,6 +49,11 @@ PyObject* rpc_init(PyObject* /* unused */) { :meth:`~torch.distributed.rpc.init_rpc` in order to initialize RPC with specific configurations, such as the RPC timeout and ``init_method`` to be used. 
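+
+Example::
+    >>> import torch.distributed.rpc as rpc
+    >>> # Minimal sketch: construct options with a 60 second RPC timeout and the
+    >>> # default env:// init method, then hand them to init_rpc via its
+    >>> # rpc_backend_options argument.
+    >>> options = rpc.RpcBackendOptions(rpc_timeout=60.0, init_method="env://")
+    >>> # Should return 60.0
+    >>> options.rpc_timeout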
)") + .def(py::init<>()) + .def( + py::init(), + py::arg("rpc_timeout") = kDefaultRpcTimeoutSeconds, + py::arg("init_method") = kDefaultInitMethod) .def_readwrite( "rpc_timeout", &RpcBackendOptions::rpcTimeoutSeconds, @@ -77,7 +81,7 @@ PyObject* rpc_init(PyObject* /* unused */) { be constructed directly, rather, an instance can be retrieved through :meth:`~torch.distributed.rpc.get_worker_info` and the result can be passed in to functions such as - :meth:`~torch.distributed.rpc.rpc_sync`, :class:`~torch.distributed.rpc.rpc_async`, + :meth:`~torch.distributed.rpc.rpc_sync`, :meth:`~torch.distributed.rpc.rpc_async`, :meth:`~torch.distributed.rpc.remote` to avoid copying a string on every invocation.)") .def( @@ -773,7 +777,7 @@ PyObject* rpc_init(PyObject* /* unused */) { } // namespace static PyMethodDef methods[] = { // NOLINT - {"_rpc_init", (PyCFunction)rpc_init, METH_NOARGS, nullptr}, + {"_rpc_init", rpc_init, METH_NOARGS, nullptr}, {nullptr, nullptr, 0, nullptr}}; PyMethodDef* python_functions() { diff --git a/torch/csrc/distributed/rpc/process_group_agent.cpp b/torch/csrc/distributed/rpc/process_group_agent.cpp index fe93e43d01f3..d97577724a55 100644 --- a/torch/csrc/distributed/rpc/process_group_agent.cpp +++ b/torch/csrc/distributed/rpc/process_group_agent.cpp @@ -8,12 +8,6 @@ namespace torch { namespace distributed { namespace rpc { -const std::string kRPCTimeoutErrorStr = - "RPC ran for more than {} milliseconds and timed out."; - -namespace { -constexpr auto kSecToMsConversion = 1000; -} ////////////////////////// MessageCounter ///////////////////////////////// @@ -802,7 +796,7 @@ void ProcessGroupAgent::pollTimedOutRPCs() { for (const auto& timedOutFuture : timedOutFutures) { auto errStr = - fmt::format(kRPCTimeoutErrorStr, timedOutFuture.timeout_.count()); + fmt::format(kRpcTimeoutErrorStr, timedOutFuture.timeout_.count()); auto err = makeRPCError(errStr, RPCErrorType::TIMEOUT); if (!timedOutFuture.future_->hasError()) { diff --git a/torch/csrc/distributed/rpc/profiler/server_process_global_profiler.h b/torch/csrc/distributed/rpc/profiler/server_process_global_profiler.h index f4baed5218b6..b45026b184fe 100644 --- a/torch/csrc/distributed/rpc/profiler/server_process_global_profiler.h +++ b/torch/csrc/distributed/rpc/profiler/server_process_global_profiler.h @@ -51,8 +51,7 @@ class State { // parse_cpu_trace(result) for results of all profile range. 
std::mutex resultsMutex_; std::vector results_; - const ProfilerConfig config_ = - ProfilerConfig(ProfilerState::Disabled, false, false); + const ProfilerConfig config_ = ProfilerConfig(ProfilerState::Disabled); }; class StateStackEntry; diff --git a/torch/csrc/distributed/rpc/request_callback_impl.cpp b/torch/csrc/distributed/rpc/request_callback_impl.cpp index b68cb4092b67..c429fde123c6 100644 --- a/torch/csrc/distributed/rpc/request_callback_impl.cpp +++ b/torch/csrc/distributed/rpc/request_callback_impl.cpp @@ -502,6 +502,14 @@ void RequestCallbackImpl::processRpcWithErrors( } } +bool RequestCallbackImpl::cudaAvailable() const { + #ifdef USE_CUDA + return true; + #else + return false; + #endif +} + } // namespace rpc } // namespace distributed } // namespace torch diff --git a/torch/csrc/distributed/rpc/request_callback_impl.h b/torch/csrc/distributed/rpc/request_callback_impl.h index 0591cc88c7d0..836e496fb069 100644 --- a/torch/csrc/distributed/rpc/request_callback_impl.h +++ b/torch/csrc/distributed/rpc/request_callback_impl.h @@ -54,6 +54,8 @@ class TORCH_API RequestCallbackImpl : public RequestCallbackNoPython { const MessageType& messageType, const int64_t messageId, const std::shared_ptr& responseFuture) const override; + + bool cudaAvailable() const override; }; } // namespace rpc diff --git a/torch/csrc/distributed/rpc/request_callback_no_python.cpp b/torch/csrc/distributed/rpc/request_callback_no_python.cpp index 9aa1f2b2aa55..d41c8f271104 100644 --- a/torch/csrc/distributed/rpc/request_callback_no_python.cpp +++ b/torch/csrc/distributed/rpc/request_callback_no_python.cpp @@ -482,97 +482,37 @@ void RequestCallbackNoPython::processRpc( case MessageType::RUN_WITH_PROFILING_REQ: { auto& rpcWithProfilingReq = static_cast(rpc); auto wrappedMsgType = rpcWithProfilingReq.wrappedMessageType(); - const auto profilingConfig = rpcWithProfilingReq.getProfilingConfig(); + auto profilingConfig = rpcWithProfilingReq.getProfilingConfig(); + // If requested with CUDA from caller but CUDA is not available on this + // machine, fallback to CPU and log a warning instead of crashing. + if (profilingConfig.state == + torch::autograd::profiler::ProfilerState::CUDA && + !this->cudaAvailable()) { + profilingConfig = torch::autograd::profiler::ProfilerConfig( + torch::autograd::profiler::ProfilerState::CPU, + profilingConfig.report_input_shapes, + profilingConfig.profile_memory); + + LOG(WARNING) + << "Profiler was requested to be enabled with CUDA on this node, but CUDA is not available. " + << "Falling back to CPU profiling only."; + } + TORCH_INTERNAL_ASSERT( + profilingConfig.state != + torch::autograd::profiler::ProfilerState::CUDA || + this->cudaAvailable(), + "Profiler state set to CUDA but CUDA not available."); const auto profilingKeyId = rpcWithProfilingReq.getProfilingId(); auto wrappedRpcResponseFuture = std::make_shared(); // Enable the profiler with the config from the sender. - std::vector profiledEvents; + // When enabling on the main thread, ensure profiler states are cleaned + // up, but defer consolidation of all profiled events to the continuation + // below. 
+ torch::autograd::profiler::ProfilerDisableOptions requestThreadOptions( + true /* cleanup TLS state */, false /* consolidate events */); { torch::autograd::profiler::TLSProfilerGuard g( - profilingConfig, - [&profiledEvents, profilingConfig]( - const std::vector>& event_lists) { - // Gather all events into a vector - for (auto& l : event_lists) { - for (auto& e : l) { - profiledEvents.push_back(e); - } - } - // find __start_profile event and __cuda_start_event. - bool cuda_profiling_enabled = profilingConfig.state == - torch::autograd::profiler::ProfilerState::CUDA; - bool found_cpu_start = false; - const torch::autograd::profiler::Event* profilerStart = nullptr; - // Each device has its own cudaProfilerStart, so we must take - // care to use the correct one depending on the device the - // operation ran on. - std::unordered_map - cudaProfilerStarts; - for (auto& e : profiledEvents) { - if (!found_cpu_start && - 0 == strcmp(e.name(), "__start_profile")) { - profilerStart = &e; - found_cpu_start = true; - } - if (cuda_profiling_enabled && - 0 == strcmp(e.name(), "__cuda_start_event")) { - e.setCudaUs(e.cpu_us()); - auto device = e.device(); - TORCH_CHECK( - device != -1, - "CUDA profiling was enabled but could not find CUDA device."); - TORCH_CHECK( - cudaProfilerStarts.find(device) == - cudaProfilerStarts.end(), - c10::str( - "Duplicate __cuda_start_event found for ", device)); - cudaProfilerStarts[device] = &e; - } - // TODO: determine no. of CUDA devices and break here if we have - // a cudaProfilerStart for all of them, in the case of cuda - // profiling. - if (found_cpu_start && !cuda_profiling_enabled) { - break; - } - } - // We should always find __start_profile. - TORCH_CHECK( - profilerStart != nullptr, - "Expected to find __start_profile event."); - // Should have >= 1 CUDA start event. - // TODO: we can enhance this assert by ensuring we have found a - // start for every available CUDA device. - TORCH_CHECK( - !cuda_profiling_enabled || cudaProfilerStarts.size() > 0, - "Profiler was enabled with CUDA recording, but did not find __cuda_start_event."); - - if (cuda_profiling_enabled) { - // Compute and set global time for when this CUDA kernel was - // launched/ended, since deserialized event will not have a - // corresponding CUDA event. 
- for (auto& e : profiledEvents) { - if (e.has_cuda()) { - auto cuda_device = e.device(); - TORCH_CHECK( - cuda_device != -1, - "CUDA profiling was enabled but could not find CUDA device."); - auto it = cudaProfilerStarts.find(cuda_device); - TORCH_CHECK( - it != cudaProfilerStarts.end(), - c10::str( - "Failed to find __cuda_start_event for device ", - cuda_device)); - auto cudaProfilerStartEvent = it->second; - double cuda_elapsed_us = - cudaProfilerStartEvent->cuda_elapsed_us(e); - int64_t cuda_us = - cuda_elapsed_us + cudaProfilerStartEvent->cpu_us(); - e.setCudaUs(cuda_us); - } - } - } - }); + profilingConfig, c10::nullopt, requestThreadOptions); TORCH_INTERNAL_ASSERT( torch::autograd::profiler::profilerEnabled(), "Expected profiler to be enabled!"); @@ -583,25 +523,48 @@ void RequestCallbackNoPython::processRpc( wrappedMsgType, messageId, wrappedRpcResponseFuture); - } - wrappedRpcResponseFuture->addCallback([wrappedRpcResponseFuture, + + wrappedRpcResponseFuture->addCallback( + at::wrapPropagateTLSState([wrappedRpcResponseFuture, responseFuture, - profiledEvents = - std::move(profiledEvents), - profilingKeyId] { - if (wrappedRpcResponseFuture->hasError()) { - // Propagate error - responseFuture->setError(wrappedRpcResponseFuture->error()->what()); - } else { - auto rpcWithProfilingResp = std::make_unique( - MessageType::RUN_WITH_PROFILING_RESP, - std::move(*wrappedRpcResponseFuture).moveValue(), - profiledEvents, - profilingKeyId); - responseFuture->markCompleted( - std::move(*rpcWithProfilingResp).toMessage()); - } - }); + profilingKeyId, + profilingConfig] { + std::vector profiledEvents; + // Defer consolidation of profiler events until async work has + // completed (such as async UDF) + + TORCH_INTERNAL_ASSERT( + torch::autograd::profiler::profilerEnabled(), + "Expected profiler to be enabled!"); + + // On continuation thread, don't clean up profiler states, since + // they will be cleaned up by main thread, and consolidate all + // events so we obtain asynchronously run events. + torch::autograd::profiler::ProfilerDisableOptions opts( + false, true); + auto event_lists = + torch::autograd::profiler::disableProfiler(opts); + if (wrappedRpcResponseFuture->hasError()) { + // Propagate error + // No need to propagate remote events in the case of an error. + responseFuture->setError( + wrappedRpcResponseFuture->error()->what()); + } else { + populateRemoteProfiledEvents( + profiledEvents, profilingConfig, event_lists); + auto rpcWithProfilingResp = + std::make_unique( + MessageType::RUN_WITH_PROFILING_RESP, + std::move(*wrappedRpcResponseFuture).moveValue(), + profiledEvents, + profilingKeyId); + responseFuture->markCompleted( + std::move(*rpcWithProfilingResp).toMessage()); + } + })); + // Exiting the scope will disable the profiler on this thread with the + // options specified above. 
+ } return; } default: { @@ -627,6 +590,14 @@ Message RequestCallbackNoPython::handleError( return createExceptionResponse(errorMsg, messageId); } +bool RequestCallbackNoPython::cudaAvailable() const { + #ifdef USE_CUDA + return true; + #else + return false; + #endif +} + } // namespace rpc } // namespace distributed } // namespace torch diff --git a/torch/csrc/distributed/rpc/request_callback_no_python.h b/torch/csrc/distributed/rpc/request_callback_no_python.h index dd54ea009417..b54fe172d7b6 100644 --- a/torch/csrc/distributed/rpc/request_callback_no_python.h +++ b/torch/csrc/distributed/rpc/request_callback_no_python.h @@ -84,6 +84,8 @@ class TORCH_API RequestCallbackNoPython : public RequestCallback { const std::exception& e, const MessageType messageType, int64_t messageId) const; + + virtual bool cudaAvailable() const; }; } // namespace rpc diff --git a/torch/csrc/distributed/rpc/rpc_agent.h b/torch/csrc/distributed/rpc/rpc_agent.h index 605744a1f227..34b77a085510 100644 --- a/torch/csrc/distributed/rpc/rpc_agent.h +++ b/torch/csrc/distributed/rpc/rpc_agent.h @@ -17,6 +17,9 @@ constexpr float kDefaultRpcTimeoutSeconds = 60; // timeout for RPCs. constexpr float kUnsetRpcTimeout = -1; constexpr auto kDefaultInitMethod = "env://"; +constexpr float kSecToMsConversion = 1000; +constexpr auto kRpcTimeoutErrorStr = + "RPC ran for more than set timeout ({} ms) and will now be marked with an error"; using steady_clock_time_point = std::chrono::time_point; diff --git a/torch/csrc/distributed/rpc/rref_impl.cpp b/torch/csrc/distributed/rpc/rref_impl.cpp index 34249172473c..6c6a377a4652 100644 --- a/torch/csrc/distributed/rpc/rref_impl.cpp +++ b/torch/csrc/distributed/rpc/rref_impl.cpp @@ -141,9 +141,6 @@ IValue UserRRef::toHere(const float timeoutSeconds) const { "to_here#({})->({})", RpcAgent::getCurrentRpcAgent()->getWorkerInfo().name_, RpcAgent::getCurrentRpcAgent()->getWorkerInfo(ownerId_).name_); - auto& remoteProfilerManager = - torch::distributed::rpc::RemoteProfilerManager::getInstance(); - remoteProfilerManager.setCurrentKey(toHereKey); } RECORD_USER_SCOPE(toHereKey); TORCH_CHECK( @@ -170,12 +167,16 @@ IValue UserRRef::toHere(const float timeoutSeconds) const { msgToSend = ScriptRRefFetchCall(ownerId_, rrefId()).toMessage(); } + // toHere is profiled as a blocking call, and does not execute operations on + // the remote node. Hence, don't wrap it with a profiling message since we + // don't need the profiler to be enabled remotely. 
auto futureResponse = autograd::sendMessageWithAutograd( *agent, agent->getWorkerInfo(ownerId_), std::move(msgToSend), true /* forceGradRecording */, - timeoutSeconds); + timeoutSeconds, + true /* forceDisableProfiling */); // TODO: we should ideally be able to interrupt this blocking wait if we check // getTimedOut() and it is true diff --git a/torch/csrc/distributed/rpc/tensorpipe_agent.cpp b/torch/csrc/distributed/rpc/tensorpipe_agent.cpp index d9ce2c3b27eb..11c5408c2c35 100644 --- a/torch/csrc/distributed/rpc/tensorpipe_agent.cpp +++ b/torch/csrc/distributed/rpc/tensorpipe_agent.cpp @@ -22,16 +22,12 @@ namespace { const std::string kSocketIfnameEnvVar = "TP_SOCKET_IFNAME"; const std::string kDefaultUvAddress = "127.0.0.1"; -constexpr long kToMilliseconds = 1000; - const std::string kGilAverageWaitTime = "agent.gil_average_wait_time_us"; const std::string kThreadPoolSize = "agent.thread_pool_size"; const std::string kNumIdleThreads = "agent.num_idle_threads"; const std::string kClientActiveCalls = "agent.client_active_calls"; const std::string kServerActiveCalls = "agent.server_active_calls"; const std::string kServerActiveAsyncCalls = "agent.server_active_async_calls"; -const std::string kRpcTimeoutErrorStr = - "RPC ran for more than set timeout ({} ms) and will now be marked with an error"; inline void checkCPUTensor(const torch::Tensor& tensor) { TORCH_CHECK( @@ -273,7 +269,7 @@ TensorPipeAgent::TensorPipeAgent( WorkerInfo(std::move(selfName), selfId), std::move(cb), std::chrono::milliseconds( - (long)(opts.rpcTimeoutSeconds * kToMilliseconds))), + (long)(opts.rpcTimeoutSeconds * kSecToMsConversion))), opts_(std::move(opts)), threadPool_(opts_.numWorkerThreads), context_(std::make_shared( @@ -685,7 +681,7 @@ std::shared_ptr TensorPipeAgent::send( auto timeout = rpcTimeoutSeconds == kUnsetRpcTimeout ? getRpcTimeout() : std::chrono::milliseconds( - static_cast(rpcTimeoutSeconds * kToMilliseconds)); + static_cast(rpcTimeoutSeconds * kSecToMsConversion)); // We only add to the timeoutMap_ if the timeout is not 0. 
Per our // documentation, a user-provided timeout of 0 indicates the RPC should never diff --git a/torch/csrc/distributed/rpc/testing/faulty_process_group_agent.cpp b/torch/csrc/distributed/rpc/testing/faulty_process_group_agent.cpp index a03ff5cafecd..a1be688a285e 100644 --- a/torch/csrc/distributed/rpc/testing/faulty_process_group_agent.cpp +++ b/torch/csrc/distributed/rpc/testing/faulty_process_group_agent.cpp @@ -6,10 +6,6 @@ namespace torch { namespace distributed { namespace rpc { -namespace { -constexpr auto kSecToMsConversion = 1000; -} - std::string fromVec(const std::vector& vec) { return std::string(vec.begin(), vec.end()); } diff --git a/torch/csrc/distributed/rpc/testing/init.cpp b/torch/csrc/distributed/rpc/testing/init.cpp index cdb67e2ea6b5..a662faed88ba 100644 --- a/torch/csrc/distributed/rpc/testing/init.cpp +++ b/torch/csrc/distributed/rpc/testing/init.cpp @@ -17,7 +17,7 @@ namespace { template using shared_ptr_class_ = py::class_>; -PyObject* faulty_agent_init(PyObject* /* unused */) { +PyObject* faulty_agent_init(PyObject* _unused, PyObject* noargs) { // Add the FaultyProcessGroupAgent and its backend options object to the // python module torch.distributed.rpc._testing auto faulty_agent_module = @@ -110,7 +110,7 @@ PyObject* faulty_agent_init(PyObject* /* unused */) { static PyMethodDef methods[] = { // NOLINT {"_faulty_agent_init", - (PyCFunction)faulty_agent_init, + faulty_agent_init, METH_NOARGS, nullptr}, {nullptr, nullptr, 0, nullptr}}; diff --git a/torch/csrc/distributed/rpc/utils.cpp b/torch/csrc/distributed/rpc/utils.cpp index 981cfd50f95e..fa97ea116a0c 100644 --- a/torch/csrc/distributed/rpc/utils.cpp +++ b/torch/csrc/distributed/rpc/utils.cpp @@ -501,6 +501,85 @@ std::vector readWrappedPayload( payload.resize(payload.size() - additionalPayloadSize); return tupleElements; } + +void populateRemoteProfiledEvents( + std::vector& profiledEvents, + const torch::autograd::profiler::ProfilerConfig& profilingConfig, + const std::vector>& + eventLists) { + // Gather all events into a vector + for (auto& l : eventLists) { + for (auto& e : l) { + profiledEvents.push_back(e); + } + } + // find __start_profile event and __cuda_start_event. + bool cudaProfilingEnabled = + profilingConfig.state == torch::autograd::profiler::ProfilerState::CUDA; + bool foundCpuStart = false; + const torch::autograd::profiler::Event* profilerStart = nullptr; + // Each device has its own cudaProfilerStart, so we must take + // care to use the correct one depending on the device the + // operation ran on. + std::unordered_map + cudaProfilerStarts; + for (auto& e : profiledEvents) { + if (!foundCpuStart && 0 == strcmp(e.name(), "__start_profile")) { + profilerStart = &e; + foundCpuStart = true; + } else if (cudaProfilingEnabled && 0 == strcmp(e.name(), "__cuda_start_event")) { + e.setCudaUs(e.cpuUs()); + auto device = e.device(); + TORCH_CHECK( + device != -1, + "CUDA profiling was enabled but could not find CUDA device."); + TORCH_CHECK( + cudaProfilerStarts.find(device) == cudaProfilerStarts.end(), + c10::str("Duplicate __cuda_start_event found for ", device)); + cudaProfilerStarts[device] = &e; + } + + // TODO: determine no. of CUDA devices and break here if we have + // a cudaProfilerStart for all of them, in the case of cuda + // profiling. + if (foundCpuStart && !cudaProfilingEnabled) { + break; + } + } + // We should always find __start_profile. 
+ TORCH_CHECK( + profilerStart != nullptr, "Expected to find __start_profile event."); + // Should have >= 1 CUDA start event if cudaProfilingEnabled. + // TODO: we can enhance this assert by ensuring we have found a + // start for every available CUDA device. + TORCH_CHECK( + !cudaProfilingEnabled || cudaProfilerStarts.size() > 0, + "Profiler was enabled with CUDA recording, but did not find __cuda_start_event."); + + if (cudaProfilingEnabled) { + // Compute and set global time for when this CUDA kernel was + // launched/ended, since deserialized event will not have a + // corresponding CUDA event. + for (auto& e : profiledEvents) { + if (e.hasCuda()) { + auto cudaDevice = e.device(); + TORCH_CHECK( + cudaDevice != -1, + "CUDA profiling was enabled but could not find CUDA device."); + auto it = cudaProfilerStarts.find(cudaDevice); + TORCH_CHECK( + it != cudaProfilerStarts.end(), + c10::str( + "Failed to find __cuda_start_event for device ", cudaDevice)); + auto cudaProfilerStartEvent = it->second; + double cudaElapsedUs = cudaProfilerStartEvent->cudaElapsedUs(e); + int64_t cudaUs = cudaElapsedUs + cudaProfilerStartEvent->cpuUs(); + e.setCudaUs(cudaUs); + } + } + } +} + } // namespace rpc } // namespace distributed } // namespace torch diff --git a/torch/csrc/distributed/rpc/utils.h b/torch/csrc/distributed/rpc/utils.h index 806b52208eb0..f91dfb4f4c7d 100644 --- a/torch/csrc/distributed/rpc/utils.h +++ b/torch/csrc/distributed/rpc/utils.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include @@ -78,6 +79,14 @@ TORCH_API std::vector readWrappedPayload( std::vector& payload, const rpc::Message& message); +// Takes a list of events from autograd profiler and populates them into +// profiledEvents to be carried over RPC. +TORCH_API void populateRemoteProfiledEvents( + std::vector& profiledEvents, + const torch::autograd::profiler::ProfilerConfig& profilerConfig, + const std::vector>& + eventLists); + } // namespace rpc } // namespace distributed } // namespace torch diff --git a/torch/csrc/jit/backends/backend_init.cpp b/torch/csrc/jit/backends/backend_init.cpp index b01cb62dc3a2..17c92cb14023 100644 --- a/torch/csrc/jit/backends/backend_init.cpp +++ b/torch/csrc/jit/backends/backend_init.cpp @@ -226,11 +226,13 @@ void initJitBackendBindings(PyObject* module) { m.def( "_jit_to_backend", [=](const std::string& backend_name, - const Module& orig_module, + py::handle orig_module, const py::dict& method_compile_spec) { return py::module::import("torch.jit._recursive") - .attr("wrap_cpp_module")( - codegen_lambda(backend_name, orig_module, method_compile_spec)); + .attr("wrap_cpp_module")(codegen_lambda( + backend_name, + py::cast(orig_module.attr("_c")), + method_compile_spec)); }); } } // namespace jit diff --git a/torch/csrc/jit/codegen/cuda/codegen.cpp b/torch/csrc/jit/codegen/cuda/codegen.cpp new file mode 100644 index 000000000000..f6e791f0edba --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/codegen.cpp @@ -0,0 +1,640 @@ + +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace codegen { + +namespace { + +class CudaKernelGenerator : private OptInConstDispatch { + static constexpr char* kTab = " "; + + public: + static std::string generateKernelDefinition( + const Kernel* kernel, + const std::string& kernel_name) { + CudaKernelGenerator codegen(kernel); + codegen.genDeclaration(kernel_name); + codegen.startBlock(); + codegen.genPrologue(); + codegen.genBody(); + 
codegen.endBlock(); + TORCH_CHECK(codegen.block_nest_level_ == 0); + return codegen.code_.str(); + } + + private: + explicit CudaKernelGenerator(const Kernel* kernel) : kernel_(kernel) {} + + // Generates the kernel function declaration + void genDeclaration(const std::string& kernel_name) { + const auto& kernel_summary = kernel_->summary(); + + code_ << "__global__ void " << kernel_name << "("; + + std::vector params; + + // Inputs + for (auto val : kernel_->inputs()) { + params.push_back(val); + } + + // Outputs + for (auto val : kernel_->outputs()) { + params.push_back(val); + } + + // Global buffers + for (auto allocate : kernel_summary.global_allocations) { + params.push_back(allocate->buffer()); + } + + // Generate parameter declarations + for (Val* val : params) { + switch (val->getValType().value()) { + case ValType::KirTensorView: { + // TODO(kir): review this + const auto tv = val->as(); + code_ << "Tensor<" << val->getDataType().value() << ", " + << TensorDomain::noReductions( + tv->fuserTv()->getMaybeRFactorDomain()) + .size() + << "> " << gen(tv); + break; + } + case ValType::KirScalar: + code_ << val->getDataType().value() << " " << gen(val); + break; + default: + TORCH_CHECK(!"Unexpected parameter type"); + } + + if (val != params.back()) { + code_ << ", "; + } + } + + // Kernels generating random numbers take extra (seed, offset) arguments + if (kernel_summary.is_stochastic) { + code_ << ", unsigned long long seed, unsigned long long offset"; + } + + code_ << ") "; + } + + // Generates setup code which is executed before the kernel body + void genPrologue() { + const auto& kernel_summary = kernel_->summary(); + + // Random number generator (optional) + if (kernel_summary.is_stochastic) { + indent() << "const int idx = blockIdx.x*blockDim.x + threadIdx.x;\n"; + indent() << "Philox rnd(seed, idx, offset);\n"; + } + + // Do we have any dynamic shared memory buffers? + const bool has_dynamic_smem = + !kernel_summary.dynamic_smem_allocations.empty(); + + // Do we have any reductions? 
+ const bool has_reductions = kernel_summary.has_block_reductions || + kernel_summary.has_grid_reductions; + + // Shared memory + if (has_dynamic_smem || has_reductions) { + indent() << "alignas(" + << dataTypeSize(kernel_summary.largest_smem_data_type) + << ") extern __shared__ char array[];\n"; + + if (has_dynamic_smem) { + indent() << "unsigned offset = 0;\n"; + } + + if (has_reductions) { + indent() << "void* shared_mem = array;\n"; + if (has_dynamic_smem) { + indent() << "offset += " + << "((blockDim.x * blockDim.y * blockDim.z) * sizeof(" + << kernel_summary.largest_smem_data_type << "));\n"; + } + } + } + } + + void genBody() { + for (auto expr : kernel_->topLevelExprs()) { + OptInConstDispatch::handle(expr); + } + } + + void startBlock(bool continuation = false) { + if (continuation) { + code_ << "{\n"; + } else { + indent() << "{\n"; + } + ++block_nest_level_; + } + + void endBlock(const char* sep = "\n") { + --block_nest_level_; + TORCH_CHECK(block_nest_level_ >= 0); + indent() << "}" << sep; + } + + std::ostream& indent() { + for (int i = 0; i < block_nest_level_; ++i) { + code_ << kTab; + } + return code_; + } + + std::string gen(const Statement* stmt) { + std::stringstream tmp_code; + std::swap(tmp_code, code_); + handle(stmt); + std::swap(tmp_code, code_); + return tmp_code.str(); + } + + std::string gen(const kir::TensorView* tv) { + std::stringstream tv_name; + tv_name << "T" << tv->name(); + return tv_name.str(); + } + + std::string genInline(const Statement* stmt) { + const bool saved_inline = print_inline_; + print_inline_ = true; + const auto result = gen(stmt); + print_inline_ = saved_inline; + return result; + } + + void handle(const Statement* node) final { + OptInConstDispatch::handle(node); + } + + void handle(const Expr* node) final { + OptInConstDispatch::handle(node); + } + + void handle(const Val* node) final { + OptInConstDispatch::handle(node); + } + + void handle(const kir::Bool* node) final { + const auto def = node->getOrigin(); + if (print_inline_ && def != nullptr) { + code_ << "(" << gen(def) << ")"; + } else if (node->isSymbolic()) { + code_ << "b" << node->name(); + } else { + code_ << *node->value(); + } + } + + void handle(const kir::Float* node) final { + const auto def = node->getOrigin(); + if (print_inline_ && def != nullptr) { + code_ << "(" << gen(def) << ")"; + } else if (node->isSymbolic()) { + code_ << "f" << node->name(); + } else { + const int digits = std::numeric_limits::max_digits10; + code_ << "float(" << std::setprecision(digits) << *node->value() << ")"; + } + } + + void handle(const kir::Half* node) final { + const auto def = node->getOrigin(); + if (print_inline_ && def != nullptr) { + code_ << "(" << gen(def) << ")"; + } else if (node->isSymbolic()) { + code_ << "h" << node->name(); + } else { + code_ << "__float2half(" << *node->value() << ")"; + } + } + + void handle(const kir::Int* node) final { + const auto def = node->getOrigin(); + if (print_inline_ && def != nullptr) { + code_ << "(" << gen(def) << ")"; + } else if (node->isSymbolic()) { + code_ << "i" << node->name(); + } else { + code_ << *node->value(); + } + } + + void handle(const kir::NamedScalar* node) final { + code_ << node->name(); + } + + void handle(const kir::TensorIndex* node) final { + code_ << gen(node->view()) << "["; + + bool first = true; + for (auto* ind : node->indices()) { + if (!ind->isZeroInt()) { + if (!first) { + code_ << " + "; + } + code_ << genInline(ind); + first = false; + } + } + + if (first) { + code_ << "0"; + } + + code_ << "]"; + } + + 
void handle(const kir::IterDomain* node) final { + TORCH_INTERNAL_ASSERT(!"Unreachable"); + } + + void handle(const kir::TensorDomain* node) final { + TORCH_INTERNAL_ASSERT(!"Unreachable"); + } + + void handle(const kir::TensorView* node) final { + TORCH_INTERNAL_ASSERT(!"Unreachable"); + } + + void handle(const kir::UnaryOp* node) final { + if (!print_inline_) { + indent() << gen(node->out()); + if (!node->out()->isScalar() && !node->in()->isScalar()) { + code_ << "\n"; + indent() << kTab; + } + code_ << " = "; + } + + if (auto op = inline_op_str(node->getUnaryOpType())) { + code_ << *op << gen(node->in()); + } else { + if (node->getUnaryOpType() == UnaryOpType::Cast) { + const auto cast_str = + cast_func_str({node->in()->getDataType().value(), + node->out()->getDataType().value()}); + code_ << cast_str.value(); + } else { + code_ << node->getUnaryOpType(); + } + + code_ << "("; + if (node->getUnaryOpType() == UnaryOpType::RandLike) { + code_ << "rnd"; + } else { + code_ << gen(node->in()); + } + code_ << ")"; + } + + if (!print_inline_) { + code_ << ";\n"; + } + } + + std::string genBinaryOp( + BinaryOpType op_type, + const std::string& lhs, + const std::string& rhs) { + std::stringstream expr; + if (auto op = inline_op_str(op_type)) { + expr << lhs << " " << *op << " " << rhs; + } else { + expr << op_type << "(" << lhs << ", " << rhs << ")"; + } + return expr.str(); + } + + void handle(const kir::BinaryOp* node) final { + const auto op_type = node->getBinaryOpType(); + if (print_inline_) { + // Inline expression: `lhs op rhs` + code_ << genBinaryOp(op_type, gen(node->lhs()), gen(node->rhs())); + } else { + indent() << gen(node->out()); + if (node->out()->isScalar()) { + // Single line: `out = lhs op rhs;` + code_ << " = " + << genBinaryOp(op_type, gen(node->lhs()), gen(node->rhs())); + } else { + // Split TensorView expressions across multiple lines: + // + // out + // = lhs + // op rhs; + // + if (auto op = inline_op_str(op_type)) { + code_ << "\n"; + indent() << kTab << "= " << gen(node->lhs()) << "\n"; + indent() << kTab << *op << " " << gen(node->rhs()); + } else { + code_ << " = " << op_type << "(\n"; + indent() << kTab << gen(node->lhs()) << ",\n"; + indent() << kTab << gen(node->rhs()) << ")"; + } + } + code_ << ";\n"; + } + } + + void handle(const kir::TernaryOp* node) final { + if (!print_inline_) { + indent() << gen(node->out()); + if (!node->out()->isScalar()) { + code_ << "\n"; + indent() << kTab; + } + code_ << " = "; + } + + code_ << node->getTernaryOpType() << "(" << gen(node->in1()) << ", " + << gen(node->in2()) << ", " << gen(node->in3()) << ")"; + + if (!print_inline_) { + code_ << ";\n"; + } + } + + std::string genReductionOp(BinaryOpType op_type, DataType data_type) { + std::stringstream lambda; + lambda << "[](" << data_type << " &a, " << data_type << " b) " + << "{ a = " << genBinaryOp(op_type, "a", "b") << "; }"; + return lambda.str(); + } + + void handle(const kir::BroadcastOp* node) final { + const ir_utils::ParallelTypeBitmap domains = + ir_utils::getParallelBroadcastDomains( + node->out(), kernel_->predicateMap()); + + const bool thread_x = domains.get(ParallelType::TIDx); + const bool thread_y = domains.get(ParallelType::TIDy); + const bool thread_z = domains.get(ParallelType::TIDz); + const bool block_x = domains.get(ParallelType::BIDx); + const bool block_y = domains.get(ParallelType::BIDy); + const bool block_z = domains.get(ParallelType::BIDz); + + const bool grid_broadcast_needed = block_x || block_y || block_z; + const bool block_broadcast_needed = 
thread_x || thread_y || thread_z; + + TORCH_INTERNAL_ASSERT( + !grid_broadcast_needed, + "Parallel broadcast across blocks not supported"); + + if (block_broadcast_needed) { + const auto data_type = node->out()->getDataType().value(); + indent() << "broadcast::blockBroadcast<" << (thread_x ? "true" : "false") + << ", " << (thread_y ? "true" : "false") << ", " + << (thread_z ? "true" : "false") << ">(\n"; + indent() << kTab << gen(node->out()) << ",\n"; + indent() << kTab << gen(node->in()) << ",\n"; + indent() << kTab << "static_cast<" << data_type << "*>(shared_mem));\n"; + } else { + indent() << gen(node->out()) << "\n"; + indent() << kTab << " = " << gen(node->in()) << ";\n"; + } + } + + void handle(const kir::ReductionOp* node) final { + TORCH_CHECK(node->out()->getValType() == ValType::TensorIndex); + + const auto out = node->out()->as(); + const auto domain = out->view()->domain(); + + const bool has_block_reduce = domain->hasBlockReduction(); + const bool has_grid_reduce = domain->hasGridReduction(); + + if (!has_block_reduce && !has_grid_reduce) { + const auto gen_out = gen(out); + const auto op_type = node->getReductionOpType(); + indent() << gen_out << " = " + << genBinaryOp(op_type, gen_out, gen(node->in())) << ";\n"; + return; + } + + const auto par_domains = node->getParallelReductionDomains(); + const bool tidx = par_domains.find(ParallelType::TIDx) != par_domains.end(); + const bool tidy = par_domains.find(ParallelType::TIDy) != par_domains.end(); + const bool tidz = par_domains.find(ParallelType::TIDz) != par_domains.end(); + + const auto data_type = node->out()->getDataType().value(); + const auto op_type = node->getReductionOpType(); + + if (has_block_reduce) { + if (has_grid_reduce) { + indent() << data_type << " " + << "block_result" + << ";\n"; + } + indent() << "blockReduce<" << (tidx ? "true" : "false") << ", " + << (tidy ? "true" : "false") << ", " << (tidz ? 
"true" : "false") + << ">(\n"; + if (has_grid_reduce) { + indent() << kTab << "block_result" + << ",\n"; + } else { + indent() << kTab << gen(node->out()) << ",\n"; + } + indent() << kTab << gen(node->in()) << ",\n"; + indent() << kTab << genReductionOp(op_type, data_type) << ",\n"; + indent() << kTab << "threadIdx,\n"; + indent() << kTab << "blockDim,\n"; + indent() << kTab << "static_cast<" << data_type << "*>(shared_mem),\n"; + if (node->pred() == nullptr) { + indent() << kTab << "true,\n"; + } else { + indent() << kTab << genInline(node->pred()) << ",\n"; + } + indent() << kTab << genInline(node->init()) << ");\n"; + } + } + + void handle(const kir::GridReduction* node) final { + const auto rop = node->reduction_op(); + TORCH_INTERNAL_ASSERT(rop->out()->getValType() == ValType::TensorIndex); + + const auto out = rop->out()->as(); + const auto domain = out->view()->domain(); + TORCH_INTERNAL_ASSERT(domain->hasGridReduction()); + + const auto par_domains = rop->getParallelReductionDomains(); + const bool tidx = par_domains.find(ParallelType::TIDx) != par_domains.end(); + const bool tidy = par_domains.find(ParallelType::TIDy) != par_domains.end(); + const bool tidz = par_domains.find(ParallelType::TIDz) != par_domains.end(); + const bool bidx = par_domains.find(ParallelType::BIDx) != par_domains.end(); + const bool bidy = par_domains.find(ParallelType::BIDy) != par_domains.end(); + const bool bidz = par_domains.find(ParallelType::BIDz) != par_domains.end(); + + const auto data_type = rop->out()->getDataType().value(); + const auto op_type = rop->getReductionOpType(); + + TORCH_INTERNAL_ASSERT( + node->reduction_buffer()->buffer()->getValType().value() == + ValType::KirTensorView); + TORCH_INTERNAL_ASSERT( + node->sync_buffer()->buffer()->getValType().value() == + ValType::KirTensorView); + const auto work_buffer = + node->reduction_buffer()->buffer()->as(); + const auto sync_buffer = + node->sync_buffer()->buffer()->as(); + + // Since block-level reduction is already done, those dimensions + // with tidx/y/z being true do not participate in the grid reduction. + indent() << kir::GridReduction::getPredicateFlagName(out->view()) << " = " + << "reduction::gridReduce<" << (bidx ? "true" : "false") << ", " + << (bidy ? "true" : "false") << ", " << (bidz ? "true" : "false") + << ", " << (!tidx ? "true" : "false") << ", " + << (!tidy ? "true" : "false") << ", " << (!tidz ? 
"true" : "false") + << ">(\n"; + indent() << kTab << gen(rop->out()) << ",\n"; + if (domain->hasBlockReduction()) { + indent() << kTab << "block_result" + << ",\n"; + } else { + indent() << kTab << gen(rop->in()) << ",\n"; + } + indent() << kTab << genReductionOp(op_type, data_type) << ",\n"; + indent() << kTab << "&" << gen(work_buffer) << "[0],\n"; + indent() << kTab << gen(sync_buffer) << ",\n"; + indent() << kTab << "static_cast<" << data_type << "*>(shared_mem),\n"; + if (node->pred() == nullptr) { + indent() << kTab << "true,\n"; + } else { + indent() << kTab << genInline(node->pred()) << ",\n"; + } + indent() << kTab << genInline(node->reduction_op()->init()) << ");\n"; + } + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Woverloaded-virtual" + // TODO(Kir): fix me + void handle(const kir::Scope& scope) { + for (auto expr : scope.exprs()) { + handle(expr); + } + } +#pragma clang diagnostic pop + + void handle(const kir::ForLoop* node) final { + // TODO(kir): handle this during lowering + if (node->iter_domain()->isThread() || node->iter_domain()->isBroadcast()) { + handle(node->body()); + return; + } + + const auto gen_index = gen(node->index()); + const auto gen_start = genInline(node->iter_domain()->start()); + const auto gen_extent = genInline(node->iter_domain()->extent()); + indent() << "for(size_t " << gen_index << " = " << gen_start << "; " + << gen_index << " < " << gen_extent << "; ++" << gen_index << ") "; + + startBlock(true); + handle(node->body()); + endBlock(); + } + + void handle(const kir::IfThenElse* node) final { + indent() << "if (" << genInline(node->cond()) << ") "; + + // "then" block + startBlock(true); + handle(node->thenBody()); + + // "else" block (optional) + if (node->hasElse()) { + endBlock(" else "); + startBlock(true); + handle(node->elseBody()); + } + + endBlock(); + } + + // TODO(kir): fold initialization into Allocate + void handle(const kir::Allocate* node) final { + if (node->buffer()->getValType().value() != ValType::KirTensorView) { + indent() << node->buffer_type() << " " << gen(node->buffer()) << ";\n"; + return; + } + + const auto tv = node->buffer()->as(); + TORCH_INTERNAL_ASSERT(tv->domain()->nDims() > 0); + TORCH_INTERNAL_ASSERT(node->size() != nullptr); + + switch (tv->memoryType()) { + case MemoryType::Global: + indent() << "// Allocate global tensor " << gen(tv) << "\n"; + break; + case MemoryType::Shared: + if (node->size()->isConstScalar()) { + // Static shared memory + indent() << "__shared__ " << node->buffer_type() << " " << gen(tv) + << "[" << genInline(node->size()) << "];\n"; + } else { + // Align Offset Position + indent() << "offset = alignBufferSize(offset," + << dataTypeSize(node->buffer_type()) << ");\n"; + // Shared Memory Pointer + indent() << node->buffer_type() << "* " << gen(tv) + << " = reinterpret_cast<" << node->buffer_type() << "*>" + << "(array + offset);\n"; + // Increment Offset Position + indent() << "offset += (" << genInline(node->size()) << " * sizeof(" + << node->buffer_type() << "));\n"; + } + break; + case MemoryType::Local: + indent() << node->buffer_type() << " " << gen(tv) << "[" + << genInline(node->size()) << "];\n"; + break; + default: + TORCH_INTERNAL_ASSERT(false, "Unexpected memory type"); + } + } + + void handle(const kir::Sync* node) final { + indent() << "__syncthreads();\n"; + } + + private: + std::stringstream code_; + const Kernel* kernel_; + int block_nest_level_ = 0; + + // TODO(kir): replace with explicit assignment statements + bool print_inline_ = false; +}; + +} // 
namespace + +std::string generateCudaKernel( + const Kernel* kernel, + const std::string& kernel_name) { + FUSER_PERF_SCOPE("generateCudaKernel"); + return CudaKernelGenerator::generateKernelDefinition(kernel, kernel_name); +} + +} // namespace codegen +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/codegen.h b/torch/csrc/jit/codegen/cuda/codegen.h new file mode 100644 index 000000000000..562aa1554eb2 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/codegen.h @@ -0,0 +1,22 @@ + +#pragma once + +#include +#include + +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace codegen { + +//! Generates a CUDA kernel definition for the given kernel +TORCH_CUDA_API std::string generateCudaKernel( + const Kernel* kernel, + const std::string& kernel_name = "CUDAGeneratedKernel"); + +} // namespace codegen +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/compute_at.cpp b/torch/csrc/jit/codegen/cuda/compute_at.cpp index 3e0f5303b966..9f8f7aba1cf4 100644 --- a/torch/csrc/jit/codegen/cuda/compute_at.cpp +++ b/torch/csrc/jit/codegen/cuda/compute_at.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -20,11 +21,10 @@ ComputeAtData::ComputeAtData(TensorView* tv) void ComputeAtData::clearPass() { // If the last pass set a position, update the new_compute_at_position if // latest position would be greater than previously set. - auto pass_pos = current_traversal_position_set ? current_traversal_position - : new_compute_at_position; - - new_compute_at_position = - pass_pos > new_compute_at_position ? pass_pos : new_compute_at_position; + if (current_traversal_position_set && + current_traversal_position > new_compute_at_position) { + new_compute_at_position = current_traversal_position; + } current_traversal_position_set = false; current_traversal_position = 0; @@ -52,16 +52,19 @@ void ComputeAtData::setPassPosition(unsigned int pos) { } unsigned int ComputeAtData::getNewPosition() const { - // If the last pass set a position, update the new_compute_at_position if - // latest position would be greater than previously set. - auto pass_pos = current_traversal_position_set ? current_traversal_position - : new_compute_at_position; - - return pass_pos > new_compute_at_position ? pass_pos - : new_compute_at_position; + // If the last pass set a position, return the latest position if + // it would be greater than previously set. + if (current_traversal_position_set && + current_traversal_position > new_compute_at_position) { + return current_traversal_position; + } else { + return new_compute_at_position; + } } void ComputeAtData::validateNewComputeAt() const { + FUSER_PERF_SCOPE("validateNewComputeAt"); + TORCH_INTERNAL_ASSERT( getNewPosition() >= original_compute_at_position, "Invalid computeAt detected. 
This computeAt would invalidate the set computeAt on ", @@ -82,7 +85,22 @@ void ComputeAtData::validateNewComputeAt() const { "."); } +void ComputeAtData::setComputeAtDomain(TensorDomain* td) { + if (new_compute_at_domain_ != original_domain_) { + TORCH_INTERNAL_ASSERT( + *new_compute_at_domain_ == *td, + "TensorDomain, ", + td, + ", does not match with the previously set domain of ", + tv_ref_, + ", which is ", + new_compute_at_domain_); + } + new_compute_at_domain_ = td; +} + namespace { + // Wrapper around set_intersection template std::set set_intersection(const std::set& set1, const std::set& set2) { @@ -121,12 +139,15 @@ std::deque> tvChains( } return tv_chains; } + } // namespace void ComputeAt::run( TensorView* producer, TensorView* consumer, unsigned int consumer_position) { + FUSER_PERF_SCOPE("ComputeAt::run"); + // Make sure the correct fusion is setup between this and consumer. TORCH_CHECK( producer->fusion() == consumer->fusion(), @@ -160,6 +181,9 @@ void ComputeAt::run( // Check all dependency chains, select the next TV after producer towards // consumer. These are the TVs we're going to actually call computeAt on. for (const auto& tv_chain : all_chains) { + // When a chain only has two tensors, they must be the producer, + // which is an input, and the consumer. There is nothing we need + // to do for such chains. if (tv_chain.size() > 2) { // Make sure we only add once, but we want to add in a determinsitic // order @@ -188,6 +212,8 @@ unsigned int ComputeAt::backwardComputeAt_impl( TensorView* producer, TensorView* consumer, unsigned int consumer_compute_at_axis) { + FUSER_PERF_SCOPE("backwardComputeAt_impl"); + auto& producer_entry = tv_data.at(producer); // Use TensorDomain interface so it doesn't set computeAt automatically @@ -209,6 +235,8 @@ unsigned int ComputeAt::forwardComputeAt_impl( TensorView* producer, TensorView* consumer, unsigned int producer_compute_at_axis) { + FUSER_PERF_SCOPE("forwardComputeAt_impl"); + auto& consumer_entry = tv_data.at(consumer); const auto& producer_entry = tv_data.at(producer); @@ -229,6 +257,8 @@ unsigned int ComputeAt::forwardComputeAt_impl( } void ComputeAt::setCommonConsumer() { + FUSER_PERF_SCOPE("ComputeAt::setCommonConsumer"); + // Convert the first chain to a set. std::set common_consumers( producer_use_chains_.front().begin(), producer_use_chains_.front().end()); @@ -281,6 +311,8 @@ void ComputeAt::setCommonConsumer() { // Similar to backward traversal in traverseAllKnown but we should only apply // computeAt if it will increase computeAt positions. void ComputeAt::traverseBackward() { + FUSER_PERF_SCOPE("ComputeAt::traverseBackward"); + // propagate *backward* through all *producer* use_chains or from *producer* // to common_consumer if common_consumer exists. Only apply transform if // increases computeAt position. @@ -307,6 +339,8 @@ void ComputeAt::traverseBackward() { } void ComputeAt::traverseForward() { + FUSER_PERF_SCOPE("ComputeAt::traverseForward"); + // propagate forward through all *producer* use_chains or from *producer* to // common_consumer if common_consumer exists. 
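The FUSER_PERF_SCOPE(...) markers added throughout compute_at.cpp follow an RAII scope-instrumentation pattern: a guard object is created at the top of a function and reports when the scope exits. The sketch below is an assumption about the general shape of such a guard, not the fuser's actual macro (which lives in its instrumentation header and may record into a profiler instead); the class name ScopeGuard and the stderr reporting are invented for illustration.

#include <chrono>
#include <iostream>
#include <string>

// Hypothetical stand-in for what a FUSER_PERF_SCOPE-style macro could expand to.
class ScopeGuard {
 public:
  explicit ScopeGuard(std::string name)
      : name_(std::move(name)), start_(std::chrono::steady_clock::now()) {}
  ~ScopeGuard() {
    const auto us = std::chrono::duration_cast<std::chrono::microseconds>(
                        std::chrono::steady_clock::now() - start_)
                        .count();
    std::cerr << name_ << " took " << us << " us\n";
  }

 private:
  std::string name_;
  std::chrono::steady_clock::time_point start_;
};

void traverseForwardExample() {
  ScopeGuard guard("ComputeAt::traverseForward"); // reported when the function returns
  // ... pass body ...
}

int main() {
  traverseForwardExample();
  return 0;
}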
auto chains = producer_use_chains_; @@ -338,6 +372,8 @@ void ComputeAt::traverseForward() { } void ComputeAt::runPass() { + FUSER_PERF_SCOPE("ComputeAt::runPass"); + // Initialize tv_data for all TensorViews we may modify auto chains = producer_use_chains_; if (common_consumer_ != nullptr) { @@ -382,6 +418,8 @@ void ComputeAt::runPass() { } void ComputeAt::setupOutputs() { + FUSER_PERF_SCOPE("ComputeAt::setupOutputs"); + if (common_consumer_ != nullptr) return; @@ -421,9 +459,6 @@ ComputeAt::ComputeAt( : producer_(_producer), consumer_(_consumer), consumer_position_(_consumer_position) { - if (consumer_position_ < 0) - consumer_position_ += consumer_->nDims(); - TORCH_INTERNAL_ASSERT( consumer_position_ >= 0 && consumer_position_ <= consumer_->nDims(), "Invalid computeAt axis, received ", diff --git a/torch/csrc/jit/codegen/cuda/compute_at.h b/torch/csrc/jit/codegen/cuda/compute_at.h index 84677ae99448..a9112a6225ca 100644 --- a/torch/csrc/jit/codegen/cuda/compute_at.h +++ b/torch/csrc/jit/codegen/cuda/compute_at.h @@ -56,9 +56,7 @@ class ComputeAtData { // If we set computeAt, save the domain so we can reset it after traversal. // Traversal state can deviate from the domain we will want to save after the // entire computeAt pass. - void setComputeAtDomain(TensorDomain* td) { - new_compute_at_domain_ = td; - } + void setComputeAtDomain(TensorDomain* td); // Return domain set in setComputeAtDomain TensorDomain* getComputeAtDomain() const { diff --git a/torch/csrc/jit/codegen/cuda/docs/.gitignore b/torch/csrc/jit/codegen/cuda/docs/.gitignore new file mode 100644 index 000000000000..1936cc1d441e --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/docs/.gitignore @@ -0,0 +1 @@ +html diff --git a/torch/csrc/jit/codegen/cuda/docs/documentation.h b/torch/csrc/jit/codegen/cuda/docs/documentation.h new file mode 100644 index 000000000000..cfd4435461b9 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/docs/documentation.h @@ -0,0 +1,23 @@ + +#error This is used exclusively for generating the documentation (not a real header) + +//! \namespace torch::jit::fuser +//! \brief Main PyTorch JIT Fuser namespace + +//! \namespace torch::jit::fuser::cuda +//! \brief CUDA specific components + +//! \namespace torch::jit::fuser::cuda::executor_utils +//! \brief Fuser executor related utilities + +//! \namespace torch::jit::fuser::kir +//! \brief Kernel IR + +//! \namespace torch::jit::fuser::ir_utils +//! \brief IR manipulation utilities + +//! \namespace torch::jit::fuser::loop_utils +//! \brief Loop utilities + +//! \namespace torch::jit::fuser::scope_utils +//! \brief Scope utilities diff --git a/torch/csrc/jit/codegen/cuda/docs/fuser.doxygen b/torch/csrc/jit/codegen/cuda/docs/fuser.doxygen new file mode 100644 index 000000000000..b9a51b187aa5 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/docs/fuser.doxygen @@ -0,0 +1,2515 @@ +# Doxyfile 1.8.14 + +# This file describes the settings to be used by the documentation system +# doxygen (www.doxygen.org) for a project. +# +# All text after a double hash (##) is considered a comment and is placed in +# front of the TAG it is preceding. +# +# All text after a single hash (#) is considered a comment and will be ignored. +# The format is: +# TAG = value [value, ...] +# For lists, items can also be appended using: +# TAG += value [value, ...] +# Values that contain spaces should be placed between quotes (\" \"). 
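documentation.h above documents the fuser namespaces with //! comments; with the autobrief options enabled later in this Doxyfile, the first sentence of such a comment becomes the brief description and the rest becomes the detailed description. A small illustrative snippet in that style follows; the function debugKernelName is an invented example, not part of the fuser API.

#include <string>

//! Generates a human-readable name for a kernel (illustrative API only).
//!
//! The first sentence above becomes the Doxygen brief description; this
//! paragraph becomes the detailed description.
//! \param kernel_id numeric id assigned by the caller
//! \return a name of the form "kernel_<id>"
std::string debugKernelName(int kernel_id) {
  return "kernel_" + std::to_string(kernel_id);
}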
+ +#--------------------------------------------------------------------------- +# Project related configuration options +#--------------------------------------------------------------------------- + +# This tag specifies the encoding used for all characters in the config file +# that follow. The default is UTF-8 which is also the encoding used for all text +# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv +# built into libc) for the transcoding. See +# https://www.gnu.org/software/libiconv/ for the list of possible encodings. +# The default value is: UTF-8. + +DOXYFILE_ENCODING = UTF-8 + +# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by +# double-quotes, unless you are using Doxywizard) that should identify the +# project for which the documentation is generated. This name is used in the +# title of most generated pages and in a few other places. + +PROJECT_NAME = "PyTorch JIT Fuser" + +# The PROJECT_NUMBER tag can be used to enter a project or revision number. This +# could be handy for archiving the generated documentation or if some version +# control system is used. + +PROJECT_NUMBER = + +# Using the PROJECT_BRIEF tag one can provide an optional one line description +# for a project that appears at the top of each page and should give viewer a +# quick idea about the purpose of the project. Keep the description short. + +PROJECT_BRIEF = + +# With the PROJECT_LOGO tag one can specify a logo or an icon that is included +# in the documentation. The maximum height of the logo should not exceed 55 +# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy +# the logo to the output directory. + +PROJECT_LOGO = + +# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path +# into which the generated documentation will be written. If a relative path is +# entered, it will be relative to the location where doxygen was started. If +# left blank the current directory will be used. + +OUTPUT_DIRECTORY = + +# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- +# directories (in 2 levels) under the output directory of each output format and +# will distribute the generated files over these directories. Enabling this +# option can be useful when feeding doxygen a huge amount of source files, where +# putting all generated files in the same directory would otherwise causes +# performance problems for the file system. +# The default value is: NO. + +CREATE_SUBDIRS = NO + +# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII +# characters to appear in the names of generated files. If set to NO, non-ASCII +# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode +# U+3044. +# The default value is: NO. + +ALLOW_UNICODE_NAMES = NO + +# The OUTPUT_LANGUAGE tag is used to specify the language in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all constant output in the proper language. 
+# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese, +# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States), +# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian, +# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages), +# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian, +# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian, +# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish, +# Ukrainian and Vietnamese. +# The default value is: English. + +OUTPUT_LANGUAGE = English + +# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member +# descriptions after the members that are listed in the file and class +# documentation (similar to Javadoc). Set to NO to disable this. +# The default value is: YES. + +BRIEF_MEMBER_DESC = YES + +# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief +# description of a member or function before the detailed description +# +# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the +# brief descriptions will be completely suppressed. +# The default value is: YES. + +REPEAT_BRIEF = YES + +# This tag implements a quasi-intelligent brief description abbreviator that is +# used to form the text in various listings. Each string in this list, if found +# as the leading text of the brief description, will be stripped from the text +# and the result, after processing the whole list, is used as the annotated +# text. Otherwise, the brief description is used as-is. If left blank, the +# following values are used ($name is automatically replaced with the name of +# the entity):The $name class, The $name widget, The $name file, is, provides, +# specifies, contains, represents, a, an and the. + +ABBREVIATE_BRIEF = "The $name class" \ + "The $name widget" \ + "The $name file" \ + is \ + provides \ + specifies \ + contains \ + represents \ + a \ + an \ + the + +# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then +# doxygen will generate a detailed section even if there is only a brief +# description. +# The default value is: NO. + +ALWAYS_DETAILED_SEC = NO + +# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all +# inherited members of a class in the documentation of that class as if those +# members were ordinary class members. Constructors, destructors and assignment +# operators of the base classes will not be shown. +# The default value is: NO. + +INLINE_INHERITED_MEMB = NO + +# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path +# before files name in the file list and in the header files. If set to NO the +# shortest path that makes the file name unique will be used +# The default value is: YES. + +FULL_PATH_NAMES = YES + +# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. +# Stripping is only done if one of the specified strings matches the left-hand +# part of the path. The tag can be used to show relative paths in the file list. +# If left blank the directory from which doxygen is run is used as the path to +# strip. +# +# Note that you can specify absolute paths here, but also relative paths, which +# will be relative from the directory where doxygen is started. +# This tag requires that the tag FULL_PATH_NAMES is set to YES. 
+ +STRIP_FROM_PATH = + +# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the +# path mentioned in the documentation of a class, which tells the reader which +# header file to include in order to use a class. If left blank only the name of +# the header file containing the class definition is used. Otherwise one should +# specify the list of include paths that are normally passed to the compiler +# using the -I flag. + +STRIP_FROM_INC_PATH = + +# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but +# less readable) file names. This can be useful is your file systems doesn't +# support long names like on DOS, Mac, or CD-ROM. +# The default value is: NO. + +SHORT_NAMES = NO + +# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the +# first line (until the first dot) of a Javadoc-style comment as the brief +# description. If set to NO, the Javadoc-style will behave just like regular Qt- +# style comments (thus requiring an explicit @brief command for a brief +# description.) +# The default value is: NO. + +JAVADOC_AUTOBRIEF = YES + +# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first +# line (until the first dot) of a Qt-style comment as the brief description. If +# set to NO, the Qt-style will behave just like regular Qt-style comments (thus +# requiring an explicit \brief command for a brief description.) +# The default value is: NO. + +QT_AUTOBRIEF = YES + +# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a +# multi-line C++ special comment block (i.e. a block of //! or /// comments) as +# a brief description. This used to be the default behavior. The new default is +# to treat a multi-line C++ comment block as a detailed description. Set this +# tag to YES if you prefer the old behavior instead. +# +# Note that setting this tag to YES also means that rational rose comments are +# not recognized any more. +# The default value is: NO. + +MULTILINE_CPP_IS_BRIEF = NO + +# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the +# documentation from any documented member that it re-implements. +# The default value is: YES. + +INHERIT_DOCS = YES + +# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new +# page for each member. If set to NO, the documentation of a member will be part +# of the file/class/namespace that contains it. +# The default value is: NO. + +SEPARATE_MEMBER_PAGES = NO + +# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen +# uses this value to replace tabs by spaces in code fragments. +# Minimum value: 1, maximum value: 16, default value: 4. + +TAB_SIZE = 4 + +# This tag can be used to specify a number of aliases that act as commands in +# the documentation. An alias has the form: +# name=value +# For example adding +# "sideeffect=@par Side Effects:\n" +# will allow you to put the command \sideeffect (or @sideeffect) in the +# documentation, which will result in a user-defined paragraph with heading +# "Side Effects:". You can put \n's in the value part of an alias to insert +# newlines (in the resulting output). You can put ^^ in the value part of an +# alias to insert a newline as if a physical newline was in the original file. + +ALIASES = + +# This tag can be used to specify a number of word-keyword mappings (TCL only). +# A mapping has the form "name=value". For example adding "class=itcl::class" +# will allow you to use the command class in the itcl::class meaning. 
+ +TCL_SUBST = + +# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources +# only. Doxygen will then generate output that is more tailored for C. For +# instance, some of the names that are used will be different. The list of all +# members will be omitted, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_FOR_C = NO + +# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or +# Python sources only. Doxygen will then generate output that is more tailored +# for that language. For instance, namespaces will be presented as packages, +# qualified scopes will look different, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_JAVA = NO + +# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran +# sources. Doxygen will then generate output that is tailored for Fortran. +# The default value is: NO. + +OPTIMIZE_FOR_FORTRAN = NO + +# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL +# sources. Doxygen will then generate output that is tailored for VHDL. +# The default value is: NO. + +OPTIMIZE_OUTPUT_VHDL = NO + +# Doxygen selects the parser to use depending on the extension of the files it +# parses. With this tag you can assign which parser to use for a given +# extension. Doxygen has a built-in mapping, but you can override or extend it +# using this tag. The format is ext=language, where ext is a file extension, and +# language is one of the parsers supported by doxygen: IDL, Java, Javascript, +# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran: +# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran: +# Fortran. In the later case the parser tries to guess whether the code is fixed +# or free formatted code, this is the default for Fortran type files), VHDL. For +# instance to make doxygen treat .inc files as Fortran files (default is PHP), +# and .f files as C (default is Fortran), use: inc=Fortran f=C. +# +# Note: For files without extension you can use no_extension as a placeholder. +# +# Note that for custom extensions you also need to set FILE_PATTERNS otherwise +# the files are not read by doxygen. + +EXTENSION_MAPPING = + +# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments +# according to the Markdown format, which allows for more readable +# documentation. See http://daringfireball.net/projects/markdown/ for details. +# The output of markdown processing is further processed by doxygen, so you can +# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in +# case of backward compatibilities issues. +# The default value is: YES. + +MARKDOWN_SUPPORT = YES + +# When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up +# to that level are automatically included in the table of contents, even if +# they do not have an id attribute. +# Note: This feature currently applies only to Markdown headings. +# Minimum value: 0, maximum value: 99, default value: 0. +# This tag requires that the tag MARKDOWN_SUPPORT is set to YES. + +TOC_INCLUDE_HEADINGS = 0 + +# When enabled doxygen tries to link words that correspond to documented +# classes, or namespaces to their corresponding documentation. Such a link can +# be prevented in individual cases by putting a % sign in front of the word or +# globally by setting AUTOLINK_SUPPORT to NO. +# The default value is: YES. + +AUTOLINK_SUPPORT = YES + +# If you use STL classes (i.e. std::string, std::vector, etc.) 
but do not want +# to include (a tag file for) the STL sources as input, then you should set this +# tag to YES in order to let doxygen match functions declarations and +# definitions whose arguments contain STL classes (e.g. func(std::string); +# versus func(std::string) {}). This also make the inheritance and collaboration +# diagrams that involve STL classes more complete and accurate. +# The default value is: NO. + +BUILTIN_STL_SUPPORT = YES + +# If you use Microsoft's C++/CLI language, you should set this option to YES to +# enable parsing support. +# The default value is: NO. + +CPP_CLI_SUPPORT = NO + +# Set the SIP_SUPPORT tag to YES if your project consists of sip (see: +# https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen +# will parse them like normal C++ but will assume all classes use public instead +# of private inheritance when no explicit protection keyword is present. +# The default value is: NO. + +SIP_SUPPORT = NO + +# For Microsoft's IDL there are propget and propput attributes to indicate +# getter and setter methods for a property. Setting this option to YES will make +# doxygen to replace the get and set methods by a property in the documentation. +# This will only work if the methods are indeed getting or setting a simple +# type. If this is not the case, or you want to show the methods anyway, you +# should set this option to NO. +# The default value is: YES. + +IDL_PROPERTY_SUPPORT = YES + +# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC +# tag is set to YES then doxygen will reuse the documentation of the first +# member in the group (if any) for the other members of the group. By default +# all members of a group must be documented explicitly. +# The default value is: NO. + +DISTRIBUTE_GROUP_DOC = NO + +# If one adds a struct or class to a group and this option is enabled, then also +# any nested class or struct is added to the same group. By default this option +# is disabled and one has to add nested compounds explicitly via \ingroup. +# The default value is: NO. + +GROUP_NESTED_COMPOUNDS = NO + +# Set the SUBGROUPING tag to YES to allow class member groups of the same type +# (for instance a group of public functions) to be put as a subgroup of that +# type (e.g. under the Public Functions section). Set it to NO to prevent +# subgrouping. Alternatively, this can be done per class using the +# \nosubgrouping command. +# The default value is: YES. + +SUBGROUPING = YES + +# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions +# are shown inside the group in which they are included (e.g. using \ingroup) +# instead of on a separate page (for HTML and Man pages) or section (for LaTeX +# and RTF). +# +# Note that this feature does not work in combination with +# SEPARATE_MEMBER_PAGES. +# The default value is: NO. + +INLINE_GROUPED_CLASSES = NO + +# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions +# with only public data fields or simple typedef fields will be shown inline in +# the documentation of the scope in which they are defined (i.e. file, +# namespace, or group documentation), provided this scope is documented. If set +# to NO, structs, classes, and unions are shown on a separate page (for HTML and +# Man pages) or section (for LaTeX and RTF). +# The default value is: NO. 
+ +INLINE_SIMPLE_STRUCTS = NO + +# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or +# enum is documented as struct, union, or enum with the name of the typedef. So +# typedef struct TypeS {} TypeT, will appear in the documentation as a struct +# with name TypeT. When disabled the typedef will appear as a member of a file, +# namespace, or class. And the struct will be named TypeS. This can typically be +# useful for C code in case the coding convention dictates that all compound +# types are typedef'ed and only the typedef is referenced, never the tag name. +# The default value is: NO. + +TYPEDEF_HIDES_STRUCT = NO + +# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This +# cache is used to resolve symbols given their name and scope. Since this can be +# an expensive process and often the same symbol appears multiple times in the +# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small +# doxygen will become slower. If the cache is too large, memory is wasted. The +# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range +# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536 +# symbols. At the end of a run doxygen will report the cache usage and suggest +# the optimal cache size from a speed point of view. +# Minimum value: 0, maximum value: 9, default value: 0. + +LOOKUP_CACHE_SIZE = 0 + +#--------------------------------------------------------------------------- +# Build related configuration options +#--------------------------------------------------------------------------- + +# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in +# documentation are documented, even if no documentation was available. Private +# class members and static file members will be hidden unless the +# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES. +# Note: This will also disable the warnings about undocumented members that are +# normally produced when WARNINGS is set to YES. + +# TODO: switch to NO once key concepts are documented +EXTRACT_ALL = YES + +# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will +# be included in the documentation. +# The default value is: NO. + +EXTRACT_PRIVATE = NO + +# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal +# scope will be included in the documentation. +# The default value is: NO. + +EXTRACT_PACKAGE = NO + +# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be +# included in the documentation. +# The default value is: NO. + +EXTRACT_STATIC = NO + +# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined +# locally in source files will be included in the documentation. If set to NO, +# only classes defined in header files are included. Does not have any effect +# for Java sources. +# The default value is: YES. + +EXTRACT_LOCAL_CLASSES = YES + +# This flag is only useful for Objective-C code. If set to YES, local methods, +# which are defined in the implementation section but not in the interface are +# included in the documentation. If set to NO, only methods in the interface are +# included. +# The default value is: NO. 
+ +EXTRACT_LOCAL_METHODS = NO + +# If this flag is set to YES, the members of anonymous namespaces will be +# extracted and appear in the documentation as a namespace called +# 'anonymous_namespace{file}', where file will be replaced with the base name of +# the file that contains the anonymous namespace. By default anonymous namespace +# are hidden. +# The default value is: NO. + +EXTRACT_ANON_NSPACES = NO + +# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all +# undocumented members inside documented classes or files. If set to NO these +# members will be included in the various overviews, but no documentation +# section is generated. This option has no effect if EXTRACT_ALL is enabled. +# The default value is: NO. + +HIDE_UNDOC_MEMBERS = YES + +# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all +# undocumented classes that are normally visible in the class hierarchy. If set +# to NO, these classes will be included in the various overviews. This option +# has no effect if EXTRACT_ALL is enabled. +# The default value is: NO. + +HIDE_UNDOC_CLASSES = YES + +# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend +# (class|struct|union) declarations. If set to NO, these declarations will be +# included in the documentation. +# The default value is: NO. + +HIDE_FRIEND_COMPOUNDS = NO + +# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any +# documentation blocks found inside the body of a function. If set to NO, these +# blocks will be appended to the function's detailed documentation block. +# The default value is: NO. + +HIDE_IN_BODY_DOCS = NO + +# The INTERNAL_DOCS tag determines if documentation that is typed after a +# \internal command is included. If the tag is set to NO then the documentation +# will be excluded. Set it to YES to include the internal documentation. +# The default value is: NO. + +INTERNAL_DOCS = NO + +# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file +# names in lower-case letters. If set to YES, upper-case letters are also +# allowed. This is useful if you have classes or files whose names only differ +# in case and if your file system supports case sensitive file names. Windows +# and Mac users are advised to set this option to NO. +# The default value is: system dependent. + +CASE_SENSE_NAMES = NO + +# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with +# their full class and namespace scopes in the documentation. If set to YES, the +# scope will be hidden. +# The default value is: NO. + +HIDE_SCOPE_NAMES = NO + +# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will +# append additional text to a page's title, such as Class Reference. If set to +# YES the compound reference will be hidden. +# The default value is: NO. + +HIDE_COMPOUND_REFERENCE= NO + +# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of +# the files that are included by a file in the documentation of that file. +# The default value is: YES. + +SHOW_INCLUDE_FILES = YES + +# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each +# grouped member an include statement to the documentation, telling the reader +# which file to include in order to use the member. +# The default value is: NO. + +SHOW_GROUPED_MEMB_INC = NO + +# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include +# files with double quotes in the documentation rather than with sharp brackets. 
+# The default value is: NO. + +FORCE_LOCAL_INCLUDES = NO + +# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the +# documentation for inline members. +# The default value is: YES. + +INLINE_INFO = YES + +# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the +# (detailed) documentation of file and class members alphabetically by member +# name. If set to NO, the members will appear in declaration order. +# The default value is: YES. + +SORT_MEMBER_DOCS = YES + +# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief +# descriptions of file, namespace and class members alphabetically by member +# name. If set to NO, the members will appear in declaration order. Note that +# this will also influence the order of the classes in the class list. +# The default value is: NO. + +SORT_BRIEF_DOCS = NO + +# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the +# (brief and detailed) documentation of class members so that constructors and +# destructors are listed first. If set to NO the constructors will appear in the +# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS. +# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief +# member documentation. +# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting +# detailed member documentation. +# The default value is: NO. + +SORT_MEMBERS_CTORS_1ST = NO + +# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy +# of group names into alphabetical order. If set to NO the group names will +# appear in their defined order. +# The default value is: NO. + +SORT_GROUP_NAMES = NO + +# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by +# fully-qualified names, including namespaces. If set to NO, the class list will +# be sorted only by class name, not including the namespace part. +# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. +# Note: This option applies only to the class list, not to the alphabetical +# list. +# The default value is: NO. + +SORT_BY_SCOPE_NAME = NO + +# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper +# type resolution of all parameters of a function it will reject a match between +# the prototype and the implementation of a member function even if there is +# only one candidate or it is obvious which candidate to choose by doing a +# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still +# accept a match between prototype and implementation in such cases. +# The default value is: NO. + +STRICT_PROTO_MATCHING = NO + +# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo +# list. This list is created by putting \todo commands in the documentation. +# The default value is: YES. + +GENERATE_TODOLIST = NO + +# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test +# list. This list is created by putting \test commands in the documentation. +# The default value is: YES. + +GENERATE_TESTLIST = YES + +# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug +# list. This list is created by putting \bug commands in the documentation. +# The default value is: YES. + +GENERATE_BUGLIST = YES + +# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO) +# the deprecated list. This list is created by putting \deprecated commands in +# the documentation. +# The default value is: YES. 
+ +GENERATE_DEPRECATEDLIST= YES + +# The ENABLED_SECTIONS tag can be used to enable conditional documentation +# sections, marked by \if ... \endif and \cond +# ... \endcond blocks. + +ENABLED_SECTIONS = + +# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the +# initial value of a variable or macro / define can have for it to appear in the +# documentation. If the initializer consists of more lines than specified here +# it will be hidden. Use a value of 0 to hide initializers completely. The +# appearance of the value of individual variables and macros / defines can be +# controlled using \showinitializer or \hideinitializer command in the +# documentation regardless of this setting. +# Minimum value: 0, maximum value: 10000, default value: 30. + +MAX_INITIALIZER_LINES = 30 + +# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at +# the bottom of the documentation of classes and structs. If set to YES, the +# list will mention the files that were used to generate the documentation. +# The default value is: YES. + +SHOW_USED_FILES = YES + +# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This +# will remove the Files entry from the Quick Index and from the Folder Tree View +# (if specified). +# The default value is: YES. + +SHOW_FILES = YES + +# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces +# page. This will remove the Namespaces entry from the Quick Index and from the +# Folder Tree View (if specified). +# The default value is: YES. + +SHOW_NAMESPACES = YES + +# The FILE_VERSION_FILTER tag can be used to specify a program or script that +# doxygen should invoke to get the current version for each file (typically from +# the version control system). Doxygen will invoke the program by executing (via +# popen()) the command command input-file, where command is the value of the +# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided +# by doxygen. Whatever the program writes to standard output is used as the file +# version. For an example see the documentation. + +FILE_VERSION_FILTER = + +# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed +# by doxygen. The layout file controls the global structure of the generated +# output files in an output format independent way. To create the layout file +# that represents doxygen's defaults, run doxygen with the -l option. You can +# optionally specify a file name after the option, if omitted DoxygenLayout.xml +# will be used as the name of the layout file. +# +# Note that if you run doxygen from a directory containing a file called +# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE +# tag is left empty. + +LAYOUT_FILE = + +# The CITE_BIB_FILES tag can be used to specify one or more bib files containing +# the reference definitions. This must be a list of .bib files. The .bib +# extension is automatically appended if omitted. This requires the bibtex tool +# to be installed. See also https://en.wikipedia.org/wiki/BibTeX for more info. +# For LaTeX the style of the bibliography can be controlled using +# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the +# search path. See also \cite for info how to create references. 
+ +CITE_BIB_FILES = + +#--------------------------------------------------------------------------- +# Configuration options related to warning and progress messages +#--------------------------------------------------------------------------- + +# The QUIET tag can be used to turn on/off the messages that are generated to +# standard output by doxygen. If QUIET is set to YES this implies that the +# messages are off. +# The default value is: NO. + +QUIET = NO + +# The WARNINGS tag can be used to turn on/off the warning messages that are +# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES +# this implies that the warnings are on. +# +# Tip: Turn warnings on while writing the documentation. +# The default value is: YES. + +WARNINGS = YES + +# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate +# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag +# will automatically be disabled. +# The default value is: YES. + +WARN_IF_UNDOCUMENTED = NO + +# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for +# potential errors in the documentation, such as not documenting some parameters +# in a documented function, or documenting parameters that don't exist or using +# markup commands wrongly. +# The default value is: YES. + +WARN_IF_DOC_ERROR = YES + +# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that +# are documented, but have no documentation for their parameters or return +# value. If set to NO, doxygen will only warn about wrong or incomplete +# parameter documentation, but not about the absence of documentation. +# The default value is: NO. + +WARN_NO_PARAMDOC = NO + +# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when +# a warning is encountered. +# The default value is: NO. + +WARN_AS_ERROR = NO + +# The WARN_FORMAT tag determines the format of the warning messages that doxygen +# can produce. The string should contain the $file, $line, and $text tags, which +# will be replaced by the file and line number from which the warning originated +# and the warning text. Optionally the format may contain $version, which will +# be replaced by the version of the file (if it could be obtained via +# FILE_VERSION_FILTER) +# The default value is: $file:$line: $text. + +WARN_FORMAT = "$file:$line: $text" + +# The WARN_LOGFILE tag can be used to specify a file to which warning and error +# messages should be written. If left blank the output is written to standard +# error (stderr). + +WARN_LOGFILE = + +#--------------------------------------------------------------------------- +# Configuration options related to the input files +#--------------------------------------------------------------------------- + +# The INPUT tag is used to specify the files and/or directories that contain +# documented source files. You may enter file names like myfile.cpp or +# directories like /usr/src/myproject. Separate the files or directories with +# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING +# Note: If this tag is empty the current directory is searched. + +INPUT += .. +INPUT += documentation.h +INPUT += main_page.md + +# This tag can be used to specify the character encoding of the source files +# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses +# libiconv (or the iconv built into libc) for the transcoding. 
See the libiconv +# documentation (see: https://www.gnu.org/software/libiconv/) for the list of +# possible encodings. +# The default value is: UTF-8. + +INPUT_ENCODING = UTF-8 + +# If the value of the INPUT tag contains directories, you can use the +# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and +# *.h) to filter out the source-files in the directories. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# read by doxygen. +# +# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, +# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, +# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, +# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, +# *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf and *.qsf. + +FILE_PATTERNS = *.c \ + *.cc \ + *.cxx \ + *.cpp \ + *.c++ \ + *.java \ + *.ii \ + *.ixx \ + *.ipp \ + *.i++ \ + *.inl \ + *.idl \ + *.ddl \ + *.odl \ + *.h \ + *.hh \ + *.hxx \ + *.hpp \ + *.h++ \ + *.cs \ + *.d \ + *.php \ + *.php4 \ + *.php5 \ + *.phtml \ + *.inc \ + *.m \ + *.markdown \ + *.md \ + *.mm \ + *.dox \ + *.py \ + *.pyw \ + *.f90 \ + *.f95 \ + *.f03 \ + *.f08 \ + *.f \ + *.for \ + *.tcl \ + *.vhd \ + *.vhdl \ + *.ucf \ + *.qsf + +# The RECURSIVE tag can be used to specify whether or not subdirectories should +# be searched for input files as well. +# The default value is: NO. + +RECURSIVE = YES + +# The EXCLUDE tag can be used to specify files and/or directories that should be +# excluded from the INPUT source files. This way you can easily exclude a +# subdirectory from a directory tree whose root is specified with the INPUT tag. +# +# Note that relative paths are relative to the directory from which doxygen is +# run. + +EXCLUDE += + +# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or +# directories that are symbolic links (a Unix file system feature) are excluded +# from the input. +# The default value is: NO. + +EXCLUDE_SYMLINKS = NO + +# If the value of the INPUT tag contains directories, you can use the +# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude +# certain files from those directories. +# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories for example use the pattern */test/* + +EXCLUDE_PATTERNS = + +# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names +# (namespaces, classes, functions, etc.) that should be excluded from the +# output. The symbol name can be a fully qualified name, a word, or if the +# wildcard * is used, a substring. Examples: ANamespace, AClass, +# AClass::ANamespace, ANamespace::*Test +# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories use the pattern */test/* + +EXCLUDE_SYMBOLS += Ui +EXCLUDE_SYMBOLS += internal +EXCLUDE_SYMBOLS += __* + +# The EXAMPLE_PATH tag can be used to specify one or more files or directories +# that contain example code fragments that are included (see the \include +# command). + +EXAMPLE_PATH = + +# If the value of the EXAMPLE_PATH tag contains directories, you can use the +# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and +# *.h) to filter out the source-files in the directories. If left blank all +# files are included. 
+ +EXAMPLE_PATTERNS = * + +# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be +# searched for input files to be used with the \include or \dontinclude commands +# irrespective of the value of the RECURSIVE tag. +# The default value is: NO. + +EXAMPLE_RECURSIVE = NO + +# The IMAGE_PATH tag can be used to specify one or more files or directories +# that contain images that are to be included in the documentation (see the +# \image command). + +IMAGE_PATH = images + +# The INPUT_FILTER tag can be used to specify a program that doxygen should +# invoke to filter for each input file. Doxygen will invoke the filter program +# by executing (via popen()) the command: +# +# +# +# where is the value of the INPUT_FILTER tag, and is the +# name of an input file. Doxygen will then use the output that the filter +# program writes to standard output. If FILTER_PATTERNS is specified, this tag +# will be ignored. +# +# Note that the filter must not add or remove lines; it is applied before the +# code is scanned, but not when the output code is generated. If lines are added +# or removed, the anchors will not be placed correctly. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# properly processed by doxygen. + +INPUT_FILTER = + +# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern +# basis. Doxygen will compare the file name with each pattern and apply the +# filter if there is a match. The filters are a list of the form: pattern=filter +# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how +# filters are used. If the FILTER_PATTERNS tag is empty or if none of the +# patterns match the file name, INPUT_FILTER is applied. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# properly processed by doxygen. + +FILTER_PATTERNS = + +# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using +# INPUT_FILTER) will also be used to filter the input files that are used for +# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES). +# The default value is: NO. + +FILTER_SOURCE_FILES = NO + +# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file +# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and +# it is also possible to disable source filtering for a specific pattern using +# *.ext= (so without naming a filter). +# This tag requires that the tag FILTER_SOURCE_FILES is set to YES. + +FILTER_SOURCE_PATTERNS = + +# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that +# is part of the input, its contents will be placed on the main page +# (index.html). This can be useful if you have a project on for instance GitHub +# and want to reuse the introduction page also for the doxygen output. + +USE_MDFILE_AS_MAINPAGE = main_page.md + +#--------------------------------------------------------------------------- +# Configuration options related to source browsing +#--------------------------------------------------------------------------- + +# If the SOURCE_BROWSER tag is set to YES then a list of source files will be +# generated. Documented entities will be cross-referenced with these sources. 
+# +# Note: To get rid of all source code in the generated output, make sure that +# also VERBATIM_HEADERS is set to NO. +# The default value is: NO. + +SOURCE_BROWSER = NO + +# Setting the INLINE_SOURCES tag to YES will include the body of functions, +# classes and enums directly into the documentation. +# The default value is: NO. + +INLINE_SOURCES = NO + +# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any +# special comment blocks from generated source code fragments. Normal C, C++ and +# Fortran comments will always remain visible. +# The default value is: YES. + +STRIP_CODE_COMMENTS = YES + +# If the REFERENCED_BY_RELATION tag is set to YES then for each documented +# function all documented functions referencing it will be listed. +# The default value is: NO. + +REFERENCED_BY_RELATION = NO + +# If the REFERENCES_RELATION tag is set to YES then for each documented function +# all documented entities called/used by that function will be listed. +# The default value is: NO. + +REFERENCES_RELATION = NO + +# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set +# to YES then the hyperlinks from functions in REFERENCES_RELATION and +# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will +# link to the documentation. +# The default value is: YES. + +REFERENCES_LINK_SOURCE = YES + +# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the +# source code will show a tooltip with additional information such as prototype, +# brief description and links to the definition and documentation. Since this +# will make the HTML file larger and loading of large files a bit slower, you +# can opt to disable this feature. +# The default value is: YES. +# This tag requires that the tag SOURCE_BROWSER is set to YES. + +SOURCE_TOOLTIPS = YES + +# If the USE_HTAGS tag is set to YES then the references to source code will +# point to the HTML generated by the htags(1) tool instead of doxygen built-in +# source browser. The htags tool is part of GNU's global source tagging system +# (see https://www.gnu.org/software/global/global.html). You will need version +# 4.8.6 or higher. +# +# To use it do the following: +# - Install the latest version of global +# - Enable SOURCE_BROWSER and USE_HTAGS in the config file +# - Make sure the INPUT points to the root of the source tree +# - Run doxygen as normal +# +# Doxygen will invoke htags (and that will in turn invoke gtags), so these +# tools must be available from the command line (i.e. in the search path). +# +# The result: instead of the source browser generated by doxygen, the links to +# source code will now point to the output of htags. +# The default value is: NO. +# This tag requires that the tag SOURCE_BROWSER is set to YES. + +USE_HTAGS = NO + +# If the VERBATIM_HEADERS tag is set the YES then doxygen will generate a +# verbatim copy of the header file for each class for which an include is +# specified. Set to NO to disable this. +# See also: Section \class. +# The default value is: YES. + +VERBATIM_HEADERS = YES + +# If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the +# clang parser (see: http://clang.llvm.org/) for more accurate parsing at the +# cost of reduced performance. This can be particularly helpful with template +# rich C++ code for which doxygen's built-in parser lacks the necessary type +# information. 
+# Note: The availability of this option depends on whether or not doxygen was +# generated with the -Duse-libclang=ON option for CMake. +# The default value is: NO. + +CLANG_ASSISTED_PARSING = NO + +# If clang assisted parsing is enabled you can provide the compiler with command +# line options that you would normally use when invoking the compiler. Note that +# the include paths will already be set by doxygen for the files and directories +# specified with INPUT and INCLUDE_PATH. +# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. + +CLANG_OPTIONS = --std=c++1z + +# If clang assisted parsing is enabled you can provide the clang parser with the +# path to the compilation database (see: +# http://clang.llvm.org/docs/HowToSetupToolingForLLVM.html) used when the files +# were built. This is equivalent to specifying the "-p" option to a clang tool, +# such as clang-check. These options will then be passed to the parser. +# Note: The availability of this option depends on whether or not doxygen was +# generated with the -Duse-libclang=ON option for CMake. +# The default value is: 0. + +CLANG_COMPILATION_DATABASE_PATH = 0 + +#--------------------------------------------------------------------------- +# Configuration options related to the alphabetical class index +#--------------------------------------------------------------------------- + +# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all +# compounds will be generated. Enable this if the project contains a lot of +# classes, structs, unions or interfaces. +# The default value is: YES. + +ALPHABETICAL_INDEX = YES + +# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in +# which the alphabetical index list will be split. +# Minimum value: 1, maximum value: 20, default value: 5. +# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. + +COLS_IN_ALPHA_INDEX = 5 + +# In case all classes in a project start with a common prefix, all classes will +# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag +# can be used to specify a prefix (or a list of prefixes) that should be ignored +# while generating the index headers. +# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. + +IGNORE_PREFIX = + +#--------------------------------------------------------------------------- +# Configuration options related to the HTML output +#--------------------------------------------------------------------------- + +# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output +# The default value is: YES. + +GENERATE_HTML = YES + +# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a +# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of +# it. +# The default directory is: html. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_OUTPUT = html + +# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each +# generated HTML page (for example: .htm, .php, .asp). +# The default value is: .html. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FILE_EXTENSION = .html + +# The HTML_HEADER tag can be used to specify a user-defined HTML header file for +# each generated HTML page. If the tag is left blank doxygen will generate a +# standard header. +# +# To get valid HTML the header file that includes any scripts and style sheets +# that doxygen needs, which is dependent on the configuration options used (e.g. 
+# the setting GENERATE_TREEVIEW). It is highly recommended to start with a +# default header using +# doxygen -w html new_header.html new_footer.html new_stylesheet.css +# YourConfigFile +# and then modify the file new_header.html. See also section "Doxygen usage" +# for information on how to generate the default header that doxygen normally +# uses. +# Note: The header is subject to change so you typically have to regenerate the +# default header when upgrading to a newer version of doxygen. For a description +# of the possible markers and block names see the documentation. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_HEADER = + +# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each +# generated HTML page. If the tag is left blank doxygen will generate a standard +# footer. See HTML_HEADER for more information on how to generate a default +# footer and what special commands can be used inside the footer. See also +# section "Doxygen usage" for information on how to generate the default footer +# that doxygen normally uses. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FOOTER = + +# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style +# sheet that is used by each HTML page. It can be used to fine-tune the look of +# the HTML output. If left blank doxygen will generate a default style sheet. +# See also section "Doxygen usage" for information on how to generate the style +# sheet that doxygen normally uses. +# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as +# it is more robust and this tag (HTML_STYLESHEET) will in the future become +# obsolete. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_STYLESHEET = + +# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined +# cascading style sheets that are included after the standard style sheets +# created by doxygen. Using this option one can overrule certain style aspects. +# This is preferred over using HTML_STYLESHEET since it does not replace the +# standard style sheet and is therefore more robust against future updates. +# Doxygen will copy the style sheet files to the output directory. +# Note: The order of the extra style sheet files is of importance (e.g. the last +# style sheet in the list overrules the setting of the previous ones in the +# list). For an example see the documentation. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_EXTRA_STYLESHEET = + +# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or +# other source files which should be copied to the HTML output directory. Note +# that these files will be copied to the base HTML output directory. Use the +# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these +# files. In the HTML_STYLESHEET file, use the file name only. Also note that the +# files will be copied as-is; there are no commands or markers available. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_EXTRA_FILES = + +# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen +# will adjust the colors in the style sheet and background images according to +# this color. Hue is specified as an angle on a colorwheel, see +# https://en.wikipedia.org/wiki/Hue for more information. For instance the value +# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 +# purple, and 360 is red again. 
+# Minimum value: 0, maximum value: 359, default value: 220. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_HUE = 220 + +# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors +# in the HTML output. For a value of 0 the output will use grayscales only. A +# value of 255 will produce the most vivid colors. +# Minimum value: 0, maximum value: 255, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_SAT = 100 + +# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the +# luminance component of the colors in the HTML output. Values below 100 +# gradually make the output lighter, whereas values above 100 make the output +# darker. The value divided by 100 is the actual gamma applied, so 80 represents +# a gamma of 0.8, The value 220 represents a gamma of 2.2, and 100 does not +# change the gamma. +# Minimum value: 40, maximum value: 240, default value: 80. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_GAMMA = 80 + +# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML +# page will contain the date and time when the page was generated. Setting this +# to YES can help to show when doxygen was last run and thus if the +# documentation is up to date. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_TIMESTAMP = NO + +# If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML +# documentation will contain a main index with vertical navigation menus that +# are dynamically created via Javascript. If disabled, the navigation index will +# consists of multiple levels of tabs that are statically embedded in every HTML +# page. Disable this option to support browsers that do not have Javascript, +# like the Qt help browser. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_DYNAMIC_MENUS = YES + +# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML +# documentation will contain sections that can be hidden and shown after the +# page has loaded. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_DYNAMIC_SECTIONS = NO + +# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries +# shown in the various tree structured indices initially; the user can expand +# and collapse entries dynamically later on. Doxygen will expand the tree to +# such a level that at most the specified number of entries are visible (unless +# a fully collapsed tree already exceeds this amount). So setting the number of +# entries 1 will produce a full collapsed tree by default. 0 is a special value +# representing an infinite number of entries and will result in a full expanded +# tree by default. +# Minimum value: 0, maximum value: 9999, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_INDEX_NUM_ENTRIES = 100 + +# If the GENERATE_DOCSET tag is set to YES, additional index files will be +# generated that can be used as input for Apple's Xcode 3 integrated development +# environment (see: https://developer.apple.com/tools/xcode/), introduced with +# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a +# Makefile in the HTML output directory. 
Running make will produce the docset in +# that directory and running make install will install the docset in +# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at +# startup. See https://developer.apple.com/tools/creatingdocsetswithdoxygen.html +# for more information. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_DOCSET = NO + +# This tag determines the name of the docset feed. A documentation feed provides +# an umbrella under which multiple documentation sets from a single provider +# (such as a company or product suite) can be grouped. +# The default value is: Doxygen generated docs. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_FEEDNAME = "Doxygen generated docs" + +# This tag specifies a string that should uniquely identify the documentation +# set bundle. This should be a reverse domain-name style string, e.g. +# com.mycompany.MyDocSet. Doxygen will append .docset to the name. +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_BUNDLE_ID = org.doxygen.Project + +# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify +# the documentation publisher. This should be a reverse domain-name style +# string, e.g. com.mycompany.MyDocSet.documentation. +# The default value is: org.doxygen.Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_PUBLISHER_ID = org.doxygen.Publisher + +# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher. +# The default value is: Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_PUBLISHER_NAME = Publisher + +# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three +# additional HTML index files: index.hhp, index.hhc, and index.hhk. The +# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop +# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on +# Windows. +# +# The HTML Help Workshop contains a compiler that can convert all HTML output +# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML +# files are now used as the Windows 98 help format, and will replace the old +# Windows help format (.hlp) on all Windows platforms in the future. Compressed +# HTML files also contain an index, a table of contents, and you can search for +# words in the documentation. The HTML workshop also contains a viewer for +# compressed HTML files. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_HTMLHELP = NO + +# The CHM_FILE tag can be used to specify the file name of the resulting .chm +# file. You can add a path in front of the file if the result should not be +# written to the html output directory. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +CHM_FILE = + +# The HHC_LOCATION tag can be used to specify the location (absolute path +# including file name) of the HTML help compiler (hhc.exe). If non-empty, +# doxygen will try to run the HTML help compiler on the generated index.hhp. +# The file has to be specified with full path. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +HHC_LOCATION = + +# The GENERATE_CHI flag controls if a separate .chi index file is generated +# (YES) or that it should be included in the master .chm file (NO). +# The default value is: NO. 
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +GENERATE_CHI = NO + +# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc) +# and project file content. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +CHM_INDEX_ENCODING = + +# The BINARY_TOC flag controls whether a binary table of contents is generated +# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it +# enables the Previous and Next buttons. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +BINARY_TOC = NO + +# The TOC_EXPAND flag can be set to YES to add extra items for group members to +# the table of contents of the HTML help documentation and to the tree view. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +TOC_EXPAND = NO + +# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and +# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that +# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help +# (.qch) of the generated HTML documentation. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_QHP = NO + +# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify +# the file name of the resulting .qch file. The path specified is relative to +# the HTML output folder. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QCH_FILE = + +# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help +# Project output. For more information please see Qt Help Project / Namespace +# (see: http://doc.qt.io/qt-4.8/qthelpproject.html#namespace). +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_NAMESPACE = org.doxygen.Project + +# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt +# Help Project output. For more information please see Qt Help Project / Virtual +# Folders (see: http://doc.qt.io/qt-4.8/qthelpproject.html#virtual-folders). +# The default value is: doc. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_VIRTUAL_FOLDER = doc + +# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom +# filter to add. For more information please see Qt Help Project / Custom +# Filters (see: http://doc.qt.io/qt-4.8/qthelpproject.html#custom-filters). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_NAME = + +# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the +# custom filter to add. For more information please see Qt Help Project / Custom +# Filters (see: http://doc.qt.io/qt-4.8/qthelpproject.html#custom-filters). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_ATTRS = + +# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this +# project's filter section matches. Qt Help Project / Filter Attributes (see: +# http://doc.qt.io/qt-4.8/qthelpproject.html#filter-attributes). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_SECT_FILTER_ATTRS = + +# The QHG_LOCATION tag can be used to specify the location of Qt's +# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the +# generated .qhp file. +# This tag requires that the tag GENERATE_QHP is set to YES. 
+ +QHG_LOCATION = + +# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be +# generated, together with the HTML files, they form an Eclipse help plugin. To +# install this plugin and make it available under the help contents menu in +# Eclipse, the contents of the directory containing the HTML and XML files needs +# to be copied into the plugins directory of eclipse. The name of the directory +# within the plugins directory should be the same as the ECLIPSE_DOC_ID value. +# After copying Eclipse needs to be restarted before the help appears. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_ECLIPSEHELP = NO + +# A unique identifier for the Eclipse help plugin. When installing the plugin +# the directory name containing the HTML and XML files should also have this +# name. Each documentation set should have its own identifier. +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES. + +ECLIPSE_DOC_ID = org.doxygen.Project + +# If you want full control over the layout of the generated HTML pages it might +# be necessary to disable the index and replace it with your own. The +# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top +# of each HTML page. A value of NO enables the index and the value YES disables +# it. Since the tabs in the index contain the same information as the navigation +# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +DISABLE_INDEX = NO + +# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index +# structure should be generated to display hierarchical information. If the tag +# value is set to YES, a side panel will be generated containing a tree-like +# index structure (just like the one that is generated for HTML Help). For this +# to work a browser that supports JavaScript, DHTML, CSS and frames is required +# (i.e. any modern browser). Windows users are probably better off using the +# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can +# further fine-tune the look of the index. As an example, the default style +# sheet generated by doxygen has an example that shows how to put an image at +# the root of the tree instead of the PROJECT_NAME. Since the tree basically has +# the same information as the tab index, you could consider setting +# DISABLE_INDEX to YES when enabling this option. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_TREEVIEW = NO + +# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that +# doxygen will group on one line in the generated HTML documentation. +# +# Note that a value of 0 will completely suppress the enum values from appearing +# in the overview section. +# Minimum value: 0, maximum value: 20, default value: 4. +# This tag requires that the tag GENERATE_HTML is set to YES. + +ENUM_VALUES_PER_LINE = 1 + +# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used +# to set the initial width (in pixels) of the frame in which the tree is shown. +# Minimum value: 0, maximum value: 1500, default value: 250. +# This tag requires that the tag GENERATE_HTML is set to YES. 
+ +TREEVIEW_WIDTH = 250 + +# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to +# external symbols imported via tag files in a separate window. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +EXT_LINKS_IN_WINDOW = NO + +# Use this tag to change the font size of LaTeX formulas included as images in +# the HTML documentation. When you change the font size after a successful +# doxygen run you need to manually remove any form_*.png images from the HTML +# output directory to force them to be regenerated. +# Minimum value: 8, maximum value: 50, default value: 10. +# This tag requires that the tag GENERATE_HTML is set to YES. + +FORMULA_FONTSIZE = 10 + +# Use the FORMULA_TRANSPARENT tag to determine whether or not the images +# generated for formulas are transparent PNGs. Transparent PNGs are not +# supported properly for IE 6.0, but are supported on all modern browsers. +# +# Note that when changing this option you need to delete any form_*.png files in +# the HTML output directory before the changes have effect. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. + +FORMULA_TRANSPARENT = YES + +# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see +# https://www.mathjax.org) which uses client side Javascript for the rendering +# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX +# installed or if you want to formulas look prettier in the HTML output. When +# enabled you may also need to install MathJax separately and configure the path +# to it using the MATHJAX_RELPATH option. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +USE_MATHJAX = NO + +# When MathJax is enabled you can set the default output format to be used for +# the MathJax output. See the MathJax site (see: +# http://docs.mathjax.org/en/latest/output.html) for more details. +# Possible values are: HTML-CSS (which is slower, but has the best +# compatibility), NativeMML (i.e. MathML) and SVG. +# The default value is: HTML-CSS. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_FORMAT = HTML-CSS + +# When MathJax is enabled you need to specify the location relative to the HTML +# output directory using the MATHJAX_RELPATH option. The destination directory +# should contain the MathJax.js script. For instance, if the mathjax directory +# is located at the same level as the HTML output directory, then +# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax +# Content Delivery Network so you can quickly see the result without installing +# MathJax. However, it is strongly recommended to install a local copy of +# MathJax from https://www.mathjax.org before deployment. +# The default value is: https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.2/. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_RELPATH = https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.2/ + +# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax +# extension names that should be enabled during MathJax rendering. For example +# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_EXTENSIONS = + +# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces +# of code that will be used on startup of the MathJax code. 
See the MathJax site
+# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
+# example see the documentation.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_CODEFILE =
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box for
+# the HTML output. The underlying search engine uses javascript and DHTML and
+# should work on any modern browser. Note that when using HTML help
+# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET)
+# there is already a search function so this one should typically be disabled.
+# For large projects the javascript based search engine can be slow, then
+# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to
+# search using the keyboard; to jump to the search box use <access key> + S
+# (what the <access key> is depends on the OS and browser, but it is typically
+# <CTRL>, <ALT>/