diff --git a/.circleci/cimodel/data/simple/ge_config_tests.py b/.circleci/cimodel/data/simple/ge_config_tests.py index 306f616d3ef7..235c08d62786 100644 --- a/.circleci/cimodel/data/simple/ge_config_tests.py +++ b/.circleci/cimodel/data/simple/ge_config_tests.py @@ -61,25 +61,25 @@ def gen_tree(self): MultiPartVersion([3, 6], "py"), MultiPartVersion([5, 4], "gcc"), None, - ["ge_config_legacy", "test"], + ["jit_legacy", "test"], ["pytorch_linux_xenial_py3_6_gcc5_4_build"]), GeConfigTestJob( MultiPartVersion([3, 6], "py"), MultiPartVersion([5, 4], "gcc"), None, - ["ge_config_simple", "test"], + ["jit_simple", "test"], ["pytorch_linux_xenial_py3_6_gcc5_4_build"], ), GeConfigTestJob( None, None, CudaVersion(10, 2), - ["cudnn7", "py3", "ge_config_legacy", "test"], + ["cudnn7", "py3", "jit_legacy", "test"], ["pytorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_build"], use_cuda_docker=True, # TODO Why does the build environment specify cuda10.1, while the # job name is cuda10_2? - build_env_override="pytorch-linux-xenial-cuda10.1-cudnn7-ge_config_legacy-test"), + build_env_override="pytorch-linux-xenial-cuda10.1-cudnn7-jit_legacy-test"), ] diff --git a/.circleci/config.yml b/.circleci/config.yml index 06a7188b7c13..208e0d09eed0 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -7023,23 +7023,23 @@ workflows: requires: - docker-pytorch-linux-xenial-py3-clang5-android-ndk-r19c - pytorch_linux_test: - build_environment: pytorch-linux-xenial-py3.6-gcc5.4-ge_config_legacy-test + build_environment: pytorch-linux-xenial-py3.6-gcc5.4-jit_legacy-test docker_image: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4 - name: pytorch_linux_xenial_py3_6_gcc5_4_ge_config_legacy_test + name: pytorch_linux_xenial_py3_6_gcc5_4_jit_legacy_test requires: - pytorch_linux_xenial_py3_6_gcc5_4_build resource_class: large - pytorch_linux_test: - build_environment: pytorch-linux-xenial-py3.6-gcc5.4-ge_config_simple-test + build_environment: pytorch-linux-xenial-py3.6-gcc5.4-jit_simple-test docker_image: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4 - name: pytorch_linux_xenial_py3_6_gcc5_4_ge_config_simple_test + name: pytorch_linux_xenial_py3_6_gcc5_4_jit_simple_test requires: - pytorch_linux_xenial_py3_6_gcc5_4_build resource_class: large - pytorch_linux_test: - build_environment: pytorch-linux-xenial-cuda10.1-cudnn7-ge_config_legacy-test + build_environment: pytorch-linux-xenial-cuda10.1-cudnn7-jit_legacy-test docker_image: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7 - name: pytorch_linux_xenial_cuda10_2_cudnn7_py3_ge_config_legacy_test + name: pytorch_linux_xenial_cuda10_2_cudnn7_py3_jit_legacy_test requires: - pytorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_build resource_class: gpu.medium diff --git a/.jenkins/pytorch/macos-test.sh b/.jenkins/pytorch/macos-test.sh index 213750ba7280..8e71738f414e 100755 --- a/.jenkins/pytorch/macos-test.sh +++ b/.jenkins/pytorch/macos-test.sh @@ -63,7 +63,7 @@ test_python_all() { # Increase default limit on open file handles from 256 to 1024 ulimit -n 1024 - python test/run_test.py --verbose --exclude test_jit_cuda_fuser_profiling test_jit_cuda_fuser_legacy test_jit_legacy test_jit_fuser_legacy --determine-from="$DETERMINE_FROM" + python test/run_test.py --verbose --exclude-jit-executor --determine-from="$DETERMINE_FROM" assert_git_not_dirty } diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh index 48a0063bbda6..0e35364a2f5d 100755 
--- a/.jenkins/pytorch/test.sh +++ b/.jenkins/pytorch/test.sh @@ -126,23 +126,18 @@ if ([ -n "$CIRCLE_PULL_REQUEST" ] && [[ "$BUILD_ENVIRONMENT" != *coverage* ]]); file_diff_from_base "$DETERMINE_FROM" fi -test_python_nn() { - time python test/run_test.py --include test_nn --verbose --determine-from="$DETERMINE_FROM" - assert_git_not_dirty -} - -test_python_ge_config_profiling() { - time python test/run_test.py --include test_jit_cuda_fuser_profiling test_jit_profiling test_jit_fuser_te test_tensorexpr --verbose --determine-from="$DETERMINE_FROM" +test_python_legacy_jit() { + time python test/run_test.py --include test_jit_cuda_fuser_legacy test_jit_legacy test_jit_fuser_legacy --verbose --determine-from="$DETERMINE_FROM" assert_git_not_dirty } -test_python_ge_config_legacy() { - time python test/run_test.py --include test_jit_cuda_fuser_legacy test_jit_legacy test_jit_fuser_legacy --verbose --determine-from="$DETERMINE_FROM" +test_python_shard1() { + time python test/run_test.py --exclude-jit-executor --shard 1 2 --verbose --determine-from="$DETERMINE_FROM" assert_git_not_dirty } -test_python_all_except_nn_and_cpp_extensions() { - time python test/run_test.py --exclude test_jit_cuda_fuser_profiling test_jit_cuda_fuser_legacy test_nn test_jit_profiling test_jit_legacy test_jit_fuser_legacy test_jit_fuser_te test_tensorexpr --verbose --determine-from="$DETERMINE_FROM" +test_python_shard2() { + time python test/run_test.py --exclude-jit-executor --shard 2 2 --verbose --determine-from="$DETERMINE_FROM" assert_git_not_dirty } @@ -304,7 +299,7 @@ test_xla() { assert_git_not_dirty } -# Do NOT run this test before any other tests, like test_python_nn, etc. +# Do NOT run this test before any other tests, like test_python_shard1, etc. # Because this function uninstalls the torch built from branch, and install # nightly version. 
test_backward_compatibility() { @@ -381,19 +376,17 @@ if [[ "${BUILD_ENVIRONMENT}" == *backward* ]]; then elif [[ "${BUILD_ENVIRONMENT}" == *xla* || "${JOB_BASE_NAME}" == *xla* ]]; then install_torchvision test_xla -elif [[ "${BUILD_ENVIRONMENT}" == *ge_config_legacy* || "${JOB_BASE_NAME}" == *ge_config_legacy* ]]; then - test_python_ge_config_legacy -elif [[ "${BUILD_ENVIRONMENT}" == *ge_config_profiling* || "${JOB_BASE_NAME}" == *ge_config_profiling* ]]; then - test_python_ge_config_profiling +elif [[ "${BUILD_ENVIRONMENT}" == *legacy_jit* || "${JOB_BASE_NAME}" == *legacy_jit* ]]; then + test_python_legacy_jit elif [[ "${BUILD_ENVIRONMENT}" == *libtorch* ]]; then # TODO: run some C++ tests echo "no-op at the moment" elif [[ "${BUILD_ENVIRONMENT}" == *-test1 || "${JOB_BASE_NAME}" == *-test1 ]]; then - test_python_nn - test_cpp_extensions + install_torchvision + test_python_shard1 elif [[ "${BUILD_ENVIRONMENT}" == *-test2 || "${JOB_BASE_NAME}" == *-test2 ]]; then install_torchvision - test_python_all_except_nn_and_cpp_extensions + test_python_shard2 test_aten test_libtorch test_custom_script_ops @@ -409,9 +402,8 @@ elif [[ "${BUILD_ENVIRONMENT}" == pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc5.4 test_cpp_extensions else install_torchvision - test_python_nn - test_python_all_except_nn_and_cpp_extensions - test_cpp_extensions + test_python_shard1 + test_python_shard2 test_aten test_vec256 test_libtorch diff --git a/.jenkins/pytorch/win-test-helpers/test_python_all_except_nn.bat b/.jenkins/pytorch/win-test-helpers/test_python_all_except_nn.bat index 4bfb5bc85e66..d76637dd0db7 100644 --- a/.jenkins/pytorch/win-test-helpers/test_python_all_except_nn.bat +++ b/.jenkins/pytorch/win-test-helpers/test_python_all_except_nn.bat @@ -1,3 +1,3 @@ call %SCRIPT_HELPERS_DIR%\setup_pytorch_env.bat -cd test && python run_test.py --exclude test_jit_cuda_fuser_profiling test_jit_cuda_fuser_legacy test_jit_profiling test_jit_legacy test_jit_fuser_legacy test_jit_fuser_te test_tensorexpr --verbose --determine-from="%1" && cd .. +cd test && python run_test.py --exclude-jit-executor --verbose --determine-from="%1" && cd .. if ERRORLEVEL 1 exit /b 1 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index b01184603918..a1b4096592a7 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -118,11 +118,37 @@ For example: - modify your Python file `torch/__init__.py` - test functionality -You do not need to repeatedly install after modifying Python files. +You do not need to repeatedly install after modifying Python files (`.py`). However, you would need to reinstall +if you modify Python interface (`.pyi`, `.pyi.in`) or non-Python files (`.cpp`, `.cc`, `.cu`, `.h`, ...). In case you want to reinstall, make sure that you uninstall PyTorch first by running `pip uninstall torch` and `python setup.py clean`. Then you can install in `develop` mode again. +### Tips and Debugging +* A prerequisite to installing PyTorch is CMake. We recommend installing it with [Homebrew](https://brew.sh/) +with `brew install cmake` if you are developing on MacOS or Linux system. +* Our `setup.py` requires Python >= 3.6 +* If you run into errors when running `python setup.py develop`, here are some debugging steps: + 1. Run `printf '#include \nint main() { printf("Hello World");}'|clang -x c -; ./a.out` to make sure + your CMake works and can compile this simple Hello World program without errors. + 2. Nuke your `build` directory. 
The `setup.py` script compiles binaries into the `build` folder and caches many + details along the way, which saves time the next time you build. If you're running into issues, you can always + `rm -rf build` from the toplevel `pytorch` directory and start over. + 3. If you have made edits to the PyTorch repo, commit any change you'd like to keep and clean the repo with the + following commands (note that clean _really_ removes all untracked files and changes.): + ```bash + git submodule deinit -f . + git clean -xdf + python setup.py clean + git submodule update --init --recursive # very important to sync the submodules + python setup.py develop # then try running the command again + ``` + 4. The main step within `python setup.py develop` is running `make` from the `build` directory. If you want to + experiment with some environment variables, you can pass them into the command: + ```bash + ENV_KEY1=ENV_VAL1[, ENV_KEY2=ENV_VAL2]* python setup.py develop + ``` + ## Nightly Checkout & Pull The `tools/nightly.py` script is provided to ease pure Python development of diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt index 5ec9d24eea39..839964e33c59 100644 --- a/aten/src/ATen/CMakeLists.txt +++ b/aten/src/ATen/CMakeLists.txt @@ -51,6 +51,7 @@ file(GLOB cudnn_cpp "cudnn/*.cpp") file(GLOB hip_h "hip/*.h" "hip/detail/*.h" "hip/*.cuh" "hip/detail/*.cuh" "hip/impl/*.h") file(GLOB hip_cpp "hip/*.cpp" "hip/detail/*.cpp" "hip/impl/*.cpp") +list(REMOVE_ITEM hip_cpp "${CMAKE_CURRENT_SOURCE_DIR}/hip/detail/LazyNVRTC.cpp") file(GLOB hip_hip "hip/*.hip" "hip/detail/*.hip" "hip/impl/*.hip") file(GLOB hip_nvrtc_stub_h "hip/nvrtc_stub/*.h") file(GLOB hip_nvrtc_stub_cpp "hip/nvrtc_stub/*.cpp") diff --git a/aten/src/ATen/core/function_schema.h b/aten/src/ATen/core/function_schema.h index a9182787d2e6..a7b4e694d52e 100644 --- a/aten/src/ATen/core/function_schema.h +++ b/aten/src/ATen/core/function_schema.h @@ -156,18 +156,29 @@ struct FunctionSchema { checkSchema(); } - // check whether this schema is backward compatible with the old one. - // the following conditions are considered as this schema is backward - // compatible with old: - // 1) two schemas are equal - // 2) this schema has the same or more positional args than old, - // and any positional arg in this schema is backward compatible - // with the corresponding one in old schema, which could be an arg - // or a kwarg, if it has, or it must provide a default value - // 3) this schema has the same or more kwargs than old, and all the kwargs - // in old schema can find the corresponding kwarg in this schema which - // is backward compatible with the old kwarg, and the extra kwargs in - // this schema must provide default values. + // Checks whether this schema is backward compatible with the old one. + // The following conditions must be true: + // [Function structure] The new schema's name, overload-name, varargs, and + // return arity are the same. + // [Output Narrowing] The new schema's output type must be the same class + // or inherit from the old schema's output type. + // [Argument count] The new schema must have at least as many arguments as + // the old schema (considering the list of positional and kwargs). + // [Arg Compatibility] Every argument in the old schema has a corresponding + // argument in the new schema that: + // * is at the same position. + // * has the same name. + // * is either positional, or kwarg and the old argument was kwarg. 
+ // * has the same type, or the old argument's type inherits from the + // new argument's type. + // [Default Values] Every new argument must have a default value. + // E.g. + // OK f_new(a, b, c=1) => f_old(a, b) + // NOK f_new(a, c=1, *, b) => f_old(a, *, b) + // OK f_new(a, b, *, c) => f_old(a, *, b, c) + // NOK f_new(a, *, b, c) -> f_old(a, b, *, c) + // NOK f_new(a, *, c, b) => f_old(a, *, b, c) + // OK f_new(a, *, b, c, d=1) => f_old(a, *, b, c) bool isBackwardCompatibleWith( const FunctionSchema& old, std::ostream* why_not = nullptr) const; diff --git a/aten/src/ATen/core/function_schema_inl.h b/aten/src/ATen/core/function_schema_inl.h index bc9a68fbad3f..2185b35bc593 100644 --- a/aten/src/ATen/core/function_schema_inl.h +++ b/aten/src/ATen/core/function_schema_inl.h @@ -111,69 +111,35 @@ inline bool FunctionSchema::isBackwardCompatibleWith( return false; } for (size_t i = 0; i < returns().size(); ++i) { - // functions are covariant in arguments but contravariant in returns + // Backwards compatibility requires covariance on argument types + // (i.e. more generic), and contravariance on return types (i.e. + // more specific). if (!old.returns().at(i).isBackwardCompatibleWith( returns().at(i), why_not)) { return false; } } - std::vector args, old_args; - std::map kwargs, old_kwargs; - auto split_func = [](const std::vector& arguments, - std::vector* positionals, - std::map* nameds) { - for (const Argument& arg : arguments) { - if (!arg.kwarg_only()) { - positionals->emplace_back(&arg); - } - nameds->emplace(arg.name(), &arg); - } - }; - // we split args into positional and keyward parts, - split_func(arguments(), &args, &kwargs); - split_func(old.arguments(), &old_args, &old_kwargs); - if (old_args.size() > args.size()) { - return false; - } - // make sure that all the old positional args have their corresponding - // backward compatible positional args in this schema - for (size_t i = 0; i < old_args.size(); ++i) { - if (!args.at(i)->isBackwardCompatibleWith( - *old_args.at(i), - why_not)) { + + // Make sure that all the old arguments have their corresponding backward + // compatible arguments in this schema. + for (size_t i = 0; i < old.arguments().size(); ++i) { + if (!arguments().at(i).isBackwardCompatibleWith( + old.arguments().at(i), why_not)) { return false; } } - // check the extra positional args in this schema either has corresponding - // backward compatible keyward args since positional args also can be used as - // a keyward arg, or provided default values - for (size_t i = old_args.size(); i < args.size(); ++i) { - if (!args.at(i)->default_value()) { - auto it = old_kwargs.find(args.at(i)->name()); - if (it == old_kwargs.end() || - !args.at(i)->isBackwardCompatibleWith( - *it->second, - why_not)) { - return false; + + // Validate that all new arguments provided a default value. 
+ for (size_t i = old.arguments().size(); i < arguments().size(); ++i) { + if (!arguments().at(i).default_value()) { + if (why_not) { + *why_not + << "Function schema not backward compatible since the new argument '" + << arguments().at(i).name() << "' of type " + << arguments().at(i).type()->str() + << " did not provide a default value."; } - } - } - // make sure that all the keyword args in the old schema have their - // corresponding backward compatible keyward args in this schema - for (auto& kv : old_kwargs) { - auto it = kwargs.find(kv.first); - if (it == kwargs.end() || - !it->second->isBackwardCompatibleWith( - *kv.second, - why_not)) { - return false; - } - kwargs.erase(it); - } - // check all the extra keyword args in this schema provide default values - for (auto& kv : kwargs) { - if (!kv.second->default_value()) { return false; } } @@ -186,7 +152,6 @@ inline void FunctionSchema::checkArg( const Argument& argument, optional pos) const { if (!value.type()->isSubtypeOf(argument.type())) { - std::string position = pos ? ::c10::str(" in position ", *pos) : ""; TORCH_CHECK( false, formatTypeMismatchMsg( diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.cpp b/aten/src/ATen/cuda/detail/CUDAHooks.cpp index 58f6a8d53e92..28b9738034e7 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.cpp +++ b/aten/src/ATen/cuda/detail/CUDAHooks.cpp @@ -28,6 +28,10 @@ #include #endif +#ifndef USE_ROCM +#include +#endif + #include #include @@ -116,10 +120,14 @@ bool CUDAHooks::hasCuDNN() const { return AT_CUDNN_ENABLED(); } -#ifdef USE_DIRECT_NVRTC +#if defined(USE_DIRECT_NVRTC) static std::pair, at::cuda::NVRTC*> load_nvrtc() { return std::make_pair(nullptr, at::cuda::load_nvrtc()); } +#elif !defined(USE_ROCM) +static std::pair, at::cuda::NVRTC*> load_nvrtc() { + return std::make_pair(nullptr, &at::cuda::detail::lazyNVRTC); +} #else static std::pair, at::cuda::NVRTC*> load_nvrtc() { #if defined(_WIN32) diff --git a/aten/src/ATen/cuda/detail/LazyNVRTC.cpp b/aten/src/ATen/cuda/detail/LazyNVRTC.cpp new file mode 100644 index 000000000000..fae48c08b61f --- /dev/null +++ b/aten/src/ATen/cuda/detail/LazyNVRTC.cpp @@ -0,0 +1,171 @@ +#include + +#include +#include +#include + +namespace at { +namespace cuda { +namespace detail { +namespace _stubs { + +at::DynamicLibrary& getCUDALibrary() { +#if defined(_WIN32) + static at::DynamicLibrary lib("nvcuda.dll"); +#else + static at::DynamicLibrary lib("libcuda.so.1"); +#endif + return lib; +} + +at::DynamicLibrary& getNVRTCLibrary() { + constexpr auto major = CUDA_VERSION / 1000; + constexpr auto minor = ( CUDA_VERSION / 10 ) % 10; +#if defined(_WIN32) + auto libname = std::string("nvrtc64_") + std::to_string(major) + std::to_string(minor) + "_0.dll"; +#else + static auto libname = std::string("libnvrtc.so.") + std::to_string(major) + "." 
+ std::to_string(minor); +#endif + static at::DynamicLibrary lib(libname.c_str()); + return lib; +} + +#define _STUB_1(LIB, NAME, RETTYPE, ARG1) \ +RETTYPE NAME(ARG1 a1) { \ + auto fn = reinterpret_cast(get## LIB ## Library().sym(__func__)); \ + if (!fn) \ + throw std::runtime_error("Can't get " C10_STRINGIZE(NAME) ); \ + lazyNVRTC.NAME = fn; \ + return fn(a1); \ +} + +#define _STUB_2(LIB, NAME, RETTYPE, ARG1, ARG2) \ +RETTYPE NAME(ARG1 a1, ARG2 a2) { \ + auto fn = reinterpret_cast(get## LIB ## Library().sym(__func__)); \ + if (!fn) \ + throw std::runtime_error("Can't get " C10_STRINGIZE(NAME) ); \ + lazyNVRTC.NAME = fn; \ + return fn(a1, a2); \ +} + +#define _STUB_3(LIB, NAME, RETTYPE, ARG1, ARG2, ARG3) \ +RETTYPE NAME(ARG1 a1, ARG2 a2, ARG3 a3) { \ + auto fn = reinterpret_cast(get## LIB ## Library().sym(__func__)); \ + if (!fn) \ + throw std::runtime_error("Can't get " C10_STRINGIZE(NAME) ); \ + lazyNVRTC.NAME = fn; \ + return fn(a1, a2, a3); \ +} + +#define _STUB_4(LIB, NAME, RETTYPE, ARG1, ARG2, ARG3, ARG4) \ +RETTYPE NAME(ARG1 a1, ARG2 a2, ARG3 a3, ARG4 a4) { \ + auto fn = reinterpret_cast(get## LIB ## Library().sym(__func__)); \ + if (!fn) \ + throw std::runtime_error("Can't get " C10_STRINGIZE(NAME) ); \ + lazyNVRTC.NAME = fn; \ + return fn(a1, a2, a3, a4); \ +} + +#define CUDA_STUB1(NAME, A1) _STUB_1(CUDA, NAME, CUresult CUDAAPI, A1) +#define CUDA_STUB2(NAME, A1, A2) _STUB_2(CUDA, NAME, CUresult CUDAAPI, A1, A2) +#define CUDA_STUB3(NAME, A1, A2, A3) _STUB_3(CUDA, NAME, CUresult CUDAAPI, A1, A2, A3) +#define CUDA_STUB4(NAME, A1, A2, A3, A4) _STUB_4(CUDA, NAME, CUresult CUDAAPI, A1, A2, A3, A4) + +#define NVRTC_STUB1(NAME, A1) _STUB_1(NVRTC, NAME, nvrtcResult, A1) +#define NVRTC_STUB2(NAME, A1, A2) _STUB_2(NVRTC, NAME, nvrtcResult, A1, A2) +#define NVRTC_STUB3(NAME, A1, A2, A3) _STUB_3(NVRTC, NAME, nvrtcResult, A1, A2, A3) + +NVRTC_STUB2(nvrtcVersion, int*, int*); +NVRTC_STUB2(nvrtcAddNameExpression, nvrtcProgram, const char * const); + +nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog, + const char *src, + const char *name, + int numHeaders, + const char * const *headers, + const char * const *includeNames) { + auto fn = reinterpret_cast(getNVRTCLibrary().sym(__func__)); + if (!fn) + throw std::runtime_error("Can't get nvrtcCreateProgram"); + lazyNVRTC.nvrtcCreateProgram = fn; + return fn(prog, src, name, numHeaders, headers, includeNames); +} + +NVRTC_STUB1(nvrtcDestroyProgram, nvrtcProgram *); +NVRTC_STUB2(nvrtcGetPTXSize, nvrtcProgram, size_t *); +NVRTC_STUB2(nvrtcGetPTX, nvrtcProgram, char *); +NVRTC_STUB3(nvrtcCompileProgram, nvrtcProgram, int, const char * const *); +_STUB_1(NVRTC, nvrtcGetErrorString, const char *, nvrtcResult); +NVRTC_STUB2(nvrtcGetProgramLogSize,nvrtcProgram, size_t*); +NVRTC_STUB2(nvrtcGetProgramLog, nvrtcProgram, char *); +NVRTC_STUB3(nvrtcGetLoweredName, nvrtcProgram, const char *, const char **); + +CUDA_STUB2(cuModuleLoadData, CUmodule *, const void *); +CUDA_STUB3(cuModuleGetFunction, CUfunction *, CUmodule, const char *); +CUDA_STUB4(cuOccupancyMaxActiveBlocksPerMultiprocessor, int *, CUfunction, int, size_t); +CUDA_STUB2(cuGetErrorString, CUresult, const char **); +CUDA_STUB1(cuCtxGetCurrent, CUcontext *); +CUDA_STUB1(cuModuleUnload, CUmodule); +CUDA_STUB3(cuDevicePrimaryCtxGetState, CUdevice, unsigned int *, int *); +CUDA_STUB4(cuLinkCreate, unsigned int, CUjit_option *, void **, CUlinkState *); +CUDA_STUB3(cuLinkComplete, CUlinkState, void **, size_t *); + +// Irregularly shaped functions +CUresult CUDAAPI cuLaunchKernel(CUfunction f, + unsigned 
int gridDimX, + unsigned int gridDimY, + unsigned int gridDimZ, + unsigned int blockDimX, + unsigned int blockDimY, + unsigned int blockDimZ, + unsigned int sharedMemBytes, + CUstream hStream, + void **kernelParams, + void **extra) { + auto fn = reinterpret_cast(getCUDALibrary().sym(__func__)); + if (!fn) + throw std::runtime_error("Can't get cuLaunchKernel"); + lazyNVRTC.cuLaunchKernel = fn; + return fn(f, + gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, + sharedMemBytes, hStream, kernelParams, extra); +} + +CUresult CUDAAPI cuModuleLoadDataEx(CUmodule *module, + const void *image, + unsigned int numOptions, + CUjit_option *options, + void **optionValues) { + auto fn = reinterpret_cast(getCUDALibrary().sym(__func__)); + if (!fn) + throw std::runtime_error("Can't get cuModuleLoadDataEx"); + lazyNVRTC.cuModuleLoadDataEx = fn; + return fn(module, image, numOptions, options, optionValues); +} + +CUresult CUDAAPI +cuLinkAddData(CUlinkState state, + CUjitInputType type, + void *data, + size_t size, + const char *name, + unsigned int numOptions, + CUjit_option *options, + void **optionValues) { + auto fn = reinterpret_cast(getCUDALibrary().sym(__func__)); + if (!fn) + throw std::runtime_error("Can't get cuLinkAddData"); + lazyNVRTC.cuLinkAddData = fn; + return fn(state, type, data, size, name, numOptions, options, optionValues); +} + +} // namespace _stubs + +NVRTC lazyNVRTC = { +#define _REFERENCE_MEMBER(name) _stubs::name, + AT_FORALL_NVRTC(_REFERENCE_MEMBER) +#undef _REFERENCE_MEMBER +}; +} // namespace detail +} // namespace cuda +} // namespace at diff --git a/aten/src/ATen/cuda/detail/LazyNVRTC.h b/aten/src/ATen/cuda/detail/LazyNVRTC.h new file mode 100644 index 000000000000..810e1c322dbd --- /dev/null +++ b/aten/src/ATen/cuda/detail/LazyNVRTC.h @@ -0,0 +1,11 @@ +#pragma once +#include +namespace at { namespace cuda { +// Forward-declares at::cuda::NVRTC +struct NVRTC; + +namespace detail { +extern NVRTC lazyNVRTC; +} + +}} // at::cuda::detail diff --git a/aten/src/ATen/native/BatchLinearAlgebra.cpp b/aten/src/ATen/native/BatchLinearAlgebra.cpp index 17e24a38fdc7..e7e5659babbb 100644 --- a/aten/src/ATen/native/BatchLinearAlgebra.cpp +++ b/aten/src/ATen/native/BatchLinearAlgebra.cpp @@ -981,20 +981,12 @@ static void apply_svd(Tensor& self, Tensor& U, Tensor& S, Tensor& VT, auto m = self.size(-2); auto n = self.size(-1); auto mn = std::min(m, n); - Tensor iwork = at::empty({8*mn}, at::kInt); + Tensor iwork = at::empty({8 * mn}, at::kInt); auto iwork_data = iwork.data_ptr(); Tensor rwork; value_t* rwork_data = nullptr; if (isComplexType(at::typeMetaToScalarType(self.dtype()))) { - auto mx = std::max(m, n); - int64_t lrwork; // These settings are valid for on LAPACK 3.6+ - if (jobz == 'N'){ - lrwork = 7 * mn; - }else if (mx > 10 * mn){ - lrwork = 7 * mn * mn + 7 * mn; - } else { - lrwork = std::max(7 * mn * mn + 7 * mn, 2 * mx * mn + 2 *mn * mn + mn); - } + auto lrwork = computeLRWorkDim(jobz, m, n); // rwork is an array of floats or doubles depending on the type rwork = at::empty({std::max(int64_t(1), lrwork)}, at::typeMetaToScalarType(S.dtype())); rwork_data = rwork.data_ptr(); diff --git a/aten/src/ATen/native/LinearAlgebraUtils.h b/aten/src/ATen/native/LinearAlgebraUtils.h index 5c07700f1e85..4a6af18a5a96 100644 --- a/aten/src/ATen/native/LinearAlgebraUtils.h +++ b/aten/src/ATen/native/LinearAlgebraUtils.h @@ -318,4 +318,19 @@ static inline std::vector create_reverse_permutation(std::vector 10 * mn) { + return 5 * mn * mn + 5 * mn; + } + return std::max(5 * mn * mn + 5 * 
mn, 2 * mx * mn + 2 * mn * mn + mn); +} + }} // namespace at::native diff --git a/aten/src/ATen/native/ReplicationPadding.cpp b/aten/src/ATen/native/ReplicationPadding.cpp index 4a9c8cd7ad1a..a4eb075a5c3c 100644 --- a/aten/src/ATen/native/ReplicationPadding.cpp +++ b/aten/src/ATen/native/ReplicationPadding.cpp @@ -71,9 +71,11 @@ void replication_pad1d_out_cpu_template( int pad_l = paddingSize[0]; int pad_r = paddingSize[1]; - TORCH_CHECK(input_.numel() > 0 - && (input_.ndimension() == 2 || input_.ndimension() == 3), - "non-empty 2D or 3D (batch mode) tensor expected for input"); + // allow empty batch size but not other dimensions. + TORCH_CHECK((input_.dim() == 2 && input_.size(0) != 0 && input_.size(1) != 0) || + (input_.dim() == 3 && input_.size(1) != 0 && input_.size(2) != 0), + "Expected 2D or 3D (batch mode) tensor with possibly 0 batch size and other non-zero dimensions for input, but got: ", + input_.sizes()); if (input_.ndimension() == 3) { @@ -91,7 +93,6 @@ void replication_pad1d_out_cpu_template( "input (W: ", iwidth, ") is too small." " Calculated output W: ", owidth); - /* get contiguous input */ auto input = input_.contiguous(); @@ -216,6 +217,9 @@ Tensor& replication_pad1d_backward_out_cpu_template( /* get contiguous gradOutput */ auto gradOutput = gradOutput_.contiguous(); gradInput.resize_as_(input); + if (gradInput.numel() == 0) { + return gradInput; + } gradInput.zero_(); /* backprop */ @@ -339,8 +343,13 @@ void replication_pad2d_out_cpu_template(Tensor& output, int dimslices = 0; int64_t nbatch = 1; - TORCH_CHECK(input_.numel() > 0 && (input_.dim() == 3 || input_.dim() == 4), - "3D or 4D (batch mode) tensor expected for input, but got: ", input_); + // allow 0 dim batch size and nothing else. + bool valid_dims = input_.size(1) != 0 && input_.size(2) != 0; + TORCH_CHECK( + (input_.dim() == 3 && input_.size(0) != 0 && valid_dims) || + (input_.dim() == 4 && valid_dims && input_.size(3) != 0), + "Expected 3D or 4D (batch mode) tensor with possibly 0 batch size and other non-zero dimensions for input, but got: ", + input_.sizes()); if (input_.dim() == 4) { @@ -510,6 +519,10 @@ Tensor& replication_pad2d_backward_out_cpu_template( /* resize */ gradInput.resize_as_(input); + if (gradInput.numel() == 0) { + return gradInput; + } + gradInput.zero_(); /* backprop */ @@ -557,8 +570,13 @@ static inline void shapeCheck3d( int dimd = 1; int dimslices = 0; - TORCH_CHECK(input.numel() > 0 && (input.dim() == 4 || input.dim() == 5), - "non-empty 4D or 5D (batch mode) tensor expected for input, but got: ", input); + // allow batch size of 0-dim. 
+ bool valid_dims = input.size(1) != 0 && input.size(2) != 0 && input.size(3) != 0; + TORCH_CHECK( + (input.dim() == 4 && input.size(0) != 0 && valid_dims) || + (input.dim() == 5 && valid_dims && input.size(4) != 0), + "Expected 4D or 5D (batch mode) tensor with possibly 0 batch size and other non-zero dimensions for input, but got: ", + input.sizes()); if (input.dim() == 5) { @@ -872,6 +890,9 @@ Tensor& replication_pad3d_backward_out_cpu_template( /* resize */ gradInput.resize_as_(input); + if (gradInput.numel() == 0) { + return gradInput; + } gradInput.zero_(); /* backprop */ diff --git a/aten/src/ATen/native/SpectralOps.cpp b/aten/src/ATen/native/SpectralOps.cpp index cf8d52a7a3c2..21e4d63b163b 100644 --- a/aten/src/ATen/native/SpectralOps.cpp +++ b/aten/src/ATen/native/SpectralOps.cpp @@ -409,6 +409,70 @@ Tensor fft_irfftn(const Tensor& self, c10::optional s, return native::fft_irfft(x, last_shape, last_dim, norm); } +Tensor fft_fftfreq(int64_t n, double d, const TensorOptions& options) { + ScalarType dtype = typeMetaToScalarType(options.dtype()); + TORCH_CHECK(at::isFloatingType(dtype) || at::isComplexType(dtype), + "fftfreq requires a floating point or complex dtype"); + // TODO: arange doesn't have complex support + Tensor result = native::arange(n, options); + auto right_slice = result.slice(0, (n + 1) / 2, 0); + at::arange_out(right_slice, -(n/2), 0, 1); + result.mul_(1.0 / (n * d)); // Slightly faster than div_(n*d) + return result; +} + +Tensor fft_rfftfreq(int64_t n, double d, const TensorOptions& options) { + ScalarType dtype = typeMetaToScalarType(options.dtype()); + TORCH_CHECK(at::isFloatingType(dtype) || at::isComplexType(dtype), + "rfftfreq requires a floating point or complex dtype"); + // TODO: arange doesn't have complex support + Tensor result = native::arange(n/2 + 1, options); + result.mul_(1.0 / (n * d)); // Slightly faster than div_(n*d) + return result; +} + +// If an array dim is specified, wraps them according to self.dim(). +// Otherwise returns a vector of all dims. +DimVector default_alldims(const Tensor& self, c10::optional dim_opt) { + DimVector dim; + if (dim_opt) { + IntArrayRef dim_unwrapped = *dim_opt; + dim.resize(dim_unwrapped.size()); + for (int64_t i = 0; i < dim.size(); ++i) { + dim[i] = maybe_wrap_dim(dim_unwrapped[i], self.dim()); + } + } else { + dim.resize(self.dim()); + std::iota(dim.begin(), dim.end(), 0); + } + return dim; +} + +Tensor fft_fftshift(const Tensor& x, c10::optional dim_opt) { + auto dim = default_alldims(x, dim_opt); + + IntArrayRef x_sizes = x.sizes(); + DimVector shift(dim.size()); + for (int64_t i = 0; i < dim.size(); ++i) { + shift[i] = x_sizes[dim[i]] / 2; + } + + return at::roll(x, shift, dim); +} + +Tensor fft_ifftshift(const Tensor& x, c10::optional dim_opt) { + auto dim = default_alldims(x, dim_opt); + + IntArrayRef x_sizes = x.sizes(); + DimVector shift(dim.size()); + for (int64_t i = 0; i < dim.size(); ++i) { + shift[i] = (x_sizes[dim[i]] + 1) / 2; + } + + return at::roll(x, shift, dim); +} + + // This is a pass-through wrapper function that does the size check and // inferences. The actual forward implementation function is called // at::_fft_with_size which dispatches to _fft_cufft (CUDA) or _fft_mkl (CPU). 
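
Side note on the `fft_fftshift` / `fft_ifftshift` additions above: they are implemented as `at::roll` by `n/2` (respectively `(n+1)/2`) along each selected dim, so composing the two is the identity since `n/2 + (n+1)/2 == n`. A minimal sketch of that round-trip, not part of the PR; `fftshift_roundtrip_demo` is a hypothetical helper name used only for illustration.

```cpp
#include <ATen/ATen.h>

// Demonstrates the roll-based shift semantics used by fft_fftshift/fft_ifftshift above.
void fftshift_roundtrip_demo() {
  at::Tensor x = at::arange(5);
  const int64_t n = x.size(0);
  // fft_fftshift rolls each selected dim by n/2 ...
  at::Tensor shifted = at::roll(x, /*shifts=*/{n / 2}, /*dims=*/{0});
  // ... and fft_ifftshift rolls by (n+1)/2, undoing the shift for both even and odd n.
  at::Tensor restored = at::roll(shifted, {(n + 1) / 2}, {0});
  TORCH_CHECK(at::equal(restored, x));
}
```
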
diff --git a/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp b/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp index 114ca93dae26..34911a2975e4 100644 --- a/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp @@ -104,7 +104,11 @@ struct Dist { // Special general pnorm derivative if p is less than two struct lttdist_calc { - static inline Vec backward(const Vec& diff, const scalar_t grad, const scalar_t dist, const Vec& p) { return dist == 0.0 ? Vec(0) : sign(diff) * diff.abs().pow(p - Vec(1)) * Vec(grad) / Vec(dist).pow(p - Vec(1)); } + static inline Vec backward(const Vec& diff, const scalar_t grad, const scalar_t dist, const Vec& p) { + Vec result = (dist == 0.0) ? Vec(0) : (sign(diff) * diff.abs().pow(p - Vec(1)) * Vec(grad) / Vec(dist).pow(p - Vec(1))); + result = Vec::blendv(result, Vec(0), (diff == Vec(0)) & (p < Vec(1))); + return result; + } }; // Two norm diff --git a/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu b/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu index 2a628bb54925..e9dfe2d9285d 100644 --- a/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu +++ b/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu @@ -1420,17 +1420,8 @@ AT_ERROR("svd: MAGMA library not found in " magma_int_t* iwork; ALLOCATE_ARRAY(iwork, magma_int_t, 8 * mn); - // Copy-n-paste rwork size computation from BatchLinearAlgebra.cpp if (isComplexType(at::typeMetaToScalarType(self.dtype()))) { - auto mx = std::max(m, n); - int64_t lrwork; // These settings are valid for on LAPACK 3.6+ - if (jobz == MagmaNoVec){ - lrwork = 7 * mn; - } else if (mx > 10 * mn){ - lrwork = 7 * mn * mn + 7 * mn; - } else { - lrwork = std::max(7 * mn * mn + 7 * mn, 2 * mx * mn + 2 *mn * mn + mn); - } + auto lrwork = computeLRWorkDim(jobchar, m, n); storage_rwork = pin_memory(lrwork); rwork = static_cast(storage_rwork.data()); } diff --git a/aten/src/ATen/native/cuda/DistanceKernel.cu b/aten/src/ATen/native/cuda/DistanceKernel.cu index 385cac5c79e8..c43a2ae9877e 100644 --- a/aten/src/ATen/native/cuda/DistanceKernel.cu +++ b/aten/src/ATen/native/cuda/DistanceKernel.cu @@ -50,7 +50,9 @@ struct dists { // Special case backward when p is less than two struct lt_two { - static __forceinline__ __device__ scalar_t backward(const scalar_t diff, const scalar_t grad, const scalar_t dist, const scalar_t p) { return dist == 0.0 ? 0 : sign(diff) * std::pow(std::abs(diff), p - 1) * grad / std::pow(dist, p - 1); } + static __forceinline__ __device__ scalar_t backward(const scalar_t diff, const scalar_t grad, const scalar_t dist, const scalar_t p) { + return (dist == 0.0 || (diff == 0.0 && p < 1)) ? 
0 : (sign(diff) * std::pow(std::abs(diff), p - 1) * grad / std::pow(dist, p - 1)); + } }; // Two norm diff --git a/aten/src/ATen/native/cuda/ReplicationPadding.cu b/aten/src/ATen/native/cuda/ReplicationPadding.cu index 515dc61eca64..b896a47afed9 100644 --- a/aten/src/ATen/native/cuda/ReplicationPadding.cu +++ b/aten/src/ATen/native/cuda/ReplicationPadding.cu @@ -217,14 +217,17 @@ void replication_pad1d_out_cuda_template( int numBatch = 1; int numInputDims = input.ndimension(); - TORCH_CHECK(input.numel() > 0 && (numInputDims == 2 || numInputDims == 3), - "2D or 3D (batch mode) tensor expected for input") - - if (numInputDims == 3) { - numBatch = input.size(0); - planeDim++; - dimw++; - } + TORCH_CHECK( + (numInputDims == 2 && input.size(0) != 0 && input.size(1) != 0) || + (numInputDims == 3 && input.size(1) != 0 && input.size(2) != 0), + "Expected 2D or 3D (batch mode) tensor with possibly 0 batch size and other non-zero dimensions for input, but got: ", + input.sizes()); + + if (numInputDims == 3) { + numBatch = input.size(0); + planeDim++; + dimw++; + } int numPlanes = input.size(planeDim); int inputW = input.size(dimw); @@ -234,13 +237,19 @@ void replication_pad1d_out_cuda_template( "input (W: ", inputW, ")is too small." " Calculated output W: ", outputW); + if (numInputDims == 2) { + output.resize_({numPlanes, outputW}); + } else { + output.resize_({numBatch, numPlanes, outputW}); + } + + if (input.numel() == 0) { + return; + } AT_DISPATCH_FLOATING_TYPES_AND_HALF( input.scalar_type(), "replication_pad1d_cuda", [&] { - - if (numInputDims == 2) { - output.resize_({numPlanes, outputW}); auto input_ = input.unsqueeze(0); auto output_ = output.unsqueeze(0); auto devInput = input_.packed_accessor64(); @@ -255,7 +264,6 @@ void replication_pad1d_out_cuda_template( replication_pad_forward_kernel1d <<>>(devInput, devOutput, padL, padR); } else { - output.resize_({numBatch, numPlanes, outputW}); auto devInput = input.packed_accessor64(); auto devOutput = output.packed_accessor64(); @@ -304,6 +312,9 @@ void replication_pad1d_backward_out_cuda_template( gradOutput.size(dimw)); gradInput.resize_as_(input); + if (gradInput.numel() == 0) { + return; + } gradInput.zero_(); AT_DISPATCH_FLOATING_TYPES_AND_HALF( @@ -351,9 +362,12 @@ void replication_pad2d_out_cuda_template( int numBatch = 1; int numInputDims = input.dim(); - TORCH_CHECK(input.numel() && (numInputDims == 3 || numInputDims == 4), - "non-empty 3D or 4D (batch mode) tensor expected for input, but got: ", - input) + bool valid_dims = input.size(1) != 0 && input.size(2) != 0; + TORCH_CHECK( + (numInputDims == 3 && input.size(0) != 0 && valid_dims) || + (numInputDims == 4 && valid_dims && input.size(3) != 0), + "Expected 3D or 4D (batch mode) tensor with possibly 0 batch size and other non-zero dimensions for input, but got: ", + input.sizes()); if (numInputDims == 4) { numBatch = input.size(0); @@ -372,12 +386,21 @@ void replication_pad2d_out_cuda_template( "input (H: ", inputH, ", W: ", inputW, ") is too small." 
" Calculated output H: ", outputH, " W: ", outputW); + if (numInputDims == 3) { + output.resize_({numPlanes, outputH, outputW}); + } else { + output.resize_({numBatch, numPlanes, outputH, outputW}); + } + + if (input.numel() == 0) { + return; + } + AT_DISPATCH_FLOATING_TYPES_AND_HALF( input.scalar_type(), "replication_pad2d_cuda", [&] { if (numInputDims == 3) { - output.resize_({numPlanes, outputH, outputW}); auto input_ = input.unsqueeze(0); auto output_ = output.unsqueeze(0); auto devInput = input_.packed_accessor64(); @@ -393,7 +416,6 @@ void replication_pad2d_out_cuda_template( at::cuda::getCurrentCUDAStream()>>>( devInput, devOutput, padT, padB, padL, padR); } else { - output.resize_({numBatch, numPlanes, outputH, outputW}); auto devInput = input.packed_accessor64(); auto devOutput = output.packed_accessor64(); @@ -452,6 +474,9 @@ void replication_pad2d_backward_out_cuda_template( gradOutput.size(dimh)); gradInput.resize_as_(input); + if (gradInput.numel() == 0) { + return; + } gradInput.zero_(); AT_DISPATCH_FLOATING_TYPES_AND_HALF( @@ -488,8 +513,12 @@ static inline void shapeCheck3d( "input tensor must fit into 32-bit index math"); int numInputDims = input.dim(); - TORCH_CHECK(input.numel() && (numInputDims == 4 || numInputDims == 5), - "non-empty 4D or 5D (batch mode) tensor expected for input, but got: ", input); + bool valid_dims = input.size(1) != 0 && input.size(2) != 0 && input.size(3) != 0; + TORCH_CHECK( + (numInputDims == 4 && input.size(0) != 0 && valid_dims) || + (numInputDims == 5 && valid_dims && input.size(4) != 0), + "Expected 4D or 5D (batch mode) tensor with possibly 0 batch size and other non-zero dimensions for input, but got: ", + input.sizes()); int planeDim = 0; int dimd = 1; @@ -526,8 +555,12 @@ static inline void shapeAndGradOutputCheck3d( "input tensor must fit into 32-bit index math"); int numInputDims = input.dim(); - TORCH_CHECK(input.numel() && (numInputDims == 4 || numInputDims == 5), - "non-empty 4D or 5D (batch mode) tensor expected for input, but got: ", input); + bool valid_dims = input.size(1) != 0 && input.size(2) != 0 && input.size(3) != 0; + TORCH_CHECK( + (numInputDims == 4 && valid_dims) || + (numInputDims == 5 && valid_dims && input.size(4) != 0), + "Expected 4D or 5D (batch mode) tensor with possibly 0 batch size and other non-zero dimensions for input, but got: ", + input.sizes()); int planeDim = 0; int dimd = 1; @@ -608,11 +641,20 @@ void replication_pad3d_out_cuda_template( int outputH = inputH + ptop + pbottom; int outputW = inputW + pleft + pright; + if (numInputDims == 4) { + output.resize_({numPlanes, outputD, outputH, outputW}); + } else { + output.resize_({numBatch, numPlanes, outputD, outputH, outputW}); + } + + if (input.numel() == 0) { + return; + } + AT_DISPATCH_FLOATING_TYPES_AND_HALF( input.scalar_type(), "replication_pad3d_cuda", [&] { if (numInputDims == 4) { - output.resize_({numPlanes, outputD, outputH, outputW}); auto input_ = input.unsqueeze(0); auto output_ = output.unsqueeze(0); auto devInput = input_.packed_accessor64(); @@ -629,7 +671,6 @@ void replication_pad3d_out_cuda_template( at::cuda::getCurrentCUDAStream()>>>( devInput, devOutput, pfront, pback, ptop, pbottom, pleft, pright); } else { - output.resize_({numBatch, numPlanes, outputD, outputH, outputW}); auto devInput = input.packed_accessor64(); auto devOutput = output.packed_accessor64(); @@ -679,6 +720,9 @@ void replication_pad3d_backward_out_cuda_template( } gradInput.resize_as_(input); + if (gradInput.numel() == 0) { + return; + } gradInput.zero_(); 
AT_DISPATCH_FLOATING_TYPES_AND_HALF( diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 64fbf7a147e3..c27cb4083ac2 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -8211,6 +8211,26 @@ use_c10_dispatcher: full variants: function +- func: fft_fftfreq(int n, float d=1.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + python_module: fft + use_c10_dispatcher: full + variants: function + +- func: fft_rfftfreq(int n, float d=1.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + python_module: fft + use_c10_dispatcher: full + variants: function + +- func: fft_fftshift(Tensor self, int[1]? dim=None) -> Tensor + python_module: fft + use_c10_dispatcher: full + variants: function + +- func: fft_ifftshift(Tensor self, int[1]? dim=None) -> Tensor + python_module: fft + use_c10_dispatcher: full + variants: function + - func: fft(Tensor self, int signal_ndim, bool normalized=False) -> Tensor use_c10_dispatcher: full variants: function, method diff --git a/aten/src/ATen/test/CMakeLists.txt b/aten/src/ATen/test/CMakeLists.txt index 43d0fc8ccd92..9f69c9d6ad6f 100644 --- a/aten/src/ATen/test/CMakeLists.txt +++ b/aten/src/ATen/test/CMakeLists.txt @@ -79,11 +79,13 @@ list(APPEND ATen_VULKAN_TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/vulkan_test.cpp) list(APPEND ATen_MOBILE_TEST_SRCS + ${CMAKE_CURRENT_SOURCE_DIR}/vec256_test.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/cpu_profiling_allocator_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpu_caching_allocator_test.cpp) list(APPEND ATen_VEC256_TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/vec256_test.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/vec256_test_all_types.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/vec256_test_all_types.cpp ) # Caffe2 specific tests diff --git a/aten/src/ATen/test/cpu_profiling_allocator_test.cpp b/aten/src/ATen/test/cpu_profiling_allocator_test.cpp new file mode 100644 index 000000000000..d3391425e14b --- /dev/null +++ b/aten/src/ATen/test/cpu_profiling_allocator_test.cpp @@ -0,0 +1,167 @@ +#include + +#include +#include + +at::Tensor run_with_control_flow( + at::Tensor input, + at::Tensor conv_weight, + at::Tensor linear_weight, + bool cond, + std::vector& pointers, + bool record = false, + bool validate = false) { + if (cond) { + input = input * 2; + } + void* input_ptr = input.data_ptr(); + auto conv_out = at::conv2d(input, conv_weight); + void* conv_out_ptr = input.data_ptr(); + auto conv_out_flat = conv_out.view({conv_out.size(0), -1}); + auto output = at::linear(conv_out_flat, linear_weight); + if (record) { + pointers.push_back(input_ptr); + pointers.push_back(conv_out_ptr); + } + if (validate) { + TORCH_CHECK(input_ptr == pointers[0]); + TORCH_CHECK(conv_out_ptr == pointers[1]); + } + return output; +} + +TEST(CPUAllocationPlanTest, with_control_flow) { + at::Tensor a = at::rand({23, 16, 16, 16}); + at::Tensor conv_weight = at::rand({16, 16, 3, 3}); + // output shape + // 23, 16, 14, 14 + // Flattened shape = 23, 3136 + at::Tensor linear_weight = at::rand({32, 3136}); + at::Tensor output; + std::vector pointers; + + auto valid_allocation_plan = [&]() { + c10::AllocationPlan plan; + { + c10::WithProfileAllocationsGuard profile_guard(&plan); + output = run_with_control_flow( + a, conv_weight, linear_weight, true, pointers); + } + }; + ASSERT_NO_THROW(valid_allocation_plan()); + + auto validate_allocation_plan = + [&](bool record_mode, bool validation_mode) -> 
bool { + c10::AllocationPlan plan; + { + c10::WithProfileAllocationsGuard profile_guard(&plan); + output = + run_with_control_flow(a, conv_weight, linear_weight, record_mode, pointers); + } + bool success{true}; + for (uint64_t i = 0; i < 10; ++i) { + bool validation_success; + { + c10::WithValidateAllocationPlanGuard + validation_guard(&plan, &validation_success); + output = run_with_control_flow( + a, conv_weight, linear_weight, validation_mode, pointers); + } + success = success && validation_success; + } + return success; + }; + ASSERT_FALSE(validate_allocation_plan(false, true)); + ASSERT_FALSE(validate_allocation_plan(true, false)); + ASSERT_TRUE(validate_allocation_plan(true, true)); + ASSERT_TRUE(validate_allocation_plan(false, false)); +} + +TEST(CPUAllocationPlanTest, with_profiling_alloc) { + at::Tensor a = at::rand({23, 16, 16, 16}); + at::Tensor conv_weight = at::rand({16, 16, 3, 3}); + // output shape + // 23, 16, 14, 14 + // Flattened shape = 23, 3136 + at::Tensor linear_weight = at::rand({32, 3136}); + at::Tensor output; + std::vector pointers; + + auto valid_allocation_plan = [&]() { + c10::AllocationPlan plan; + { + c10::WithProfileAllocationsGuard profile_guard(&plan); + output = run_with_control_flow( + a, conv_weight, linear_weight, false, pointers); + } + }; + ASSERT_NO_THROW(valid_allocation_plan()); + + auto validate_allocation_plan = + [&](bool record_mode, + bool validation_mode, + bool validate_pointers) { + pointers.clear(); + c10::AllocationPlan plan; + { + c10::WithProfileAllocationsGuard profile_guard(&plan); + output = run_with_control_flow( + a, + conv_weight, + linear_weight, + record_mode, + pointers, + false, + false); + } + c10::CPUProfilingAllocator profiling_allocator; + { + c10::WithProfilingAllocatorGuard + profiling_allocator_guard(&profiling_allocator, &plan); + output = run_with_control_flow( + a, + conv_weight, + linear_weight, + validation_mode, + pointers, + validate_pointers, + false); + } + for (uint64_t i = 0; i < 10; ++i) { + { + c10::WithProfilingAllocatorGuard + profiling_allocator_guard(&profiling_allocator, &plan); + output = run_with_control_flow( + a, + conv_weight, + linear_weight, + validation_mode, + pointers, + false, + validate_pointers); + } + } + }; + // When control flow conditions are same between profiling and evaluation + // profiling allocator should not throw. + ASSERT_NO_THROW(validate_allocation_plan(true, true, false)); + ASSERT_NO_THROW(validate_allocation_plan(false, false, false)); + // Furthermore profiling allocator should return the same pointers + // back for the intermediate tensors + ASSERT_NO_THROW(validate_allocation_plan(true, true, true)); + ASSERT_NO_THROW(validate_allocation_plan(false, false, true)); + + // When control flow conditions are different between profiling and evaluation + // profiling allocator should throw. + ASSERT_THROW(validate_allocation_plan(true, false, false), c10::Error); + ASSERT_THROW(validate_allocation_plan(false, true, false), c10::Error); +} + +int main(int argc, char* argv[]) { +// At the moment caching allocator is only exposed to mobile cpu allocator. 
+#ifdef C10_MOBILE + ::testing::InitGoogleTest(&argc, argv); + at::manual_seed(42); + return RUN_ALL_TESTS(); +#endif /* C10_Mobile */ +} diff --git a/c10/core/CPUAllocator.cpp b/c10/core/CPUAllocator.cpp index 5502aaf4b3d6..c76fefe21d27 100644 --- a/c10/core/CPUAllocator.cpp +++ b/c10/core/CPUAllocator.cpp @@ -1,6 +1,7 @@ #include #include #include +#include // TODO: rename flags to C10 C10_DEFINE_bool( @@ -156,13 +157,20 @@ class DefaultMobileCPUAllocator final : public at::Allocator { // TODO: enable with better TLS support on mobile // profiledCPUMemoryReporter().Delete(pointer); auto allocator_ptr = GetThreadLocalCachingAllocator(); + auto profiling_allocator_ptr = GetThreadLocalProfilingAllocator(); if (allocator_ptr != nullptr) { allocator_ptr->free(pointer); + } else if (profiling_allocator_ptr != nullptr) { + profiling_allocator_ptr->free(pointer); } else { c10::free_cpu(pointer); // This adds extra cost to freeing memory to the default case when // caching allocator is not enabled. CPUCachingAllocator::record_free(pointer); + auto allocation_planner = GetThreadLocalAllocationPlanner(); + if (allocation_planner != nullptr) { + allocation_planner->record_free(pointer); + } } } @@ -179,10 +187,17 @@ class DefaultMobileCPUAllocator final : public at::Allocator { auto alloc_size = PreGuardBytes + nbytes + PostGuardBytes; void* data; auto allocator_ptr = GetThreadLocalCachingAllocator(); + auto profiling_allocator_ptr = GetThreadLocalProfilingAllocator(); if (allocator_ptr != nullptr) { data = allocator_ptr->allocate(alloc_size); + } else if (profiling_allocator_ptr != nullptr) { + data = profiling_allocator_ptr->allocate(alloc_size); } else { data = c10::alloc_cpu(alloc_size); + auto allocation_planner = GetThreadLocalAllocationPlanner(); + if (allocation_planner != nullptr) { + allocation_planner->record_allocation(alloc_size, data); + } } // profiledCPUMemoryReporter().New(data, alloc_size); return { diff --git a/c10/core/impl/DeviceGuardImplInterface.h b/c10/core/impl/DeviceGuardImplInterface.h index 516aebba0747..f7f5b4f867a9 100644 --- a/c10/core/impl/DeviceGuardImplInterface.h +++ b/c10/core/impl/DeviceGuardImplInterface.h @@ -209,7 +209,15 @@ class C10_API DeviceGuardImplRegistrar { static ::c10::impl::DeviceGuardImplRegistrar C10_ANONYMOUS_VARIABLE(g_##DeviceType)(::c10::DeviceType::DevType, new DeviceGuardImpl()); inline const DeviceGuardImplInterface* getDeviceGuardImpl(DeviceType type) { - auto p = device_guard_impl_registry[static_cast(type)].load(); + // Two adjacent int16_t fields DeviceType and DeviceIndex has field access + // miscompiled on NVCC. To workaround this issue, we apply a mask to the + // DeviceType. First check if the DeviceType is 16-bit. + // FB employees can see + // https://fb.workplace.com/groups/llvm.gcc/permalink/4053565044692080/ + // for more details + static_assert(sizeof(DeviceType) == 2, "DeviceType is not 16-bit"); + auto p = device_guard_impl_registry[static_cast(type) & 0xFFFF].load(); + // This seems to be the first place where you make use of a device // when you pass devices to factory functions. Give a nicer error // message in this case. 
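
For context on the profiling-allocator plumbing above (the new hooks in `DefaultMobileCPUAllocator` and the guards exercised in `cpu_profiling_allocator_test.cpp`): usage is two-phase, record an `AllocationPlan` under `WithProfileAllocationsGuard`, then replay under `WithProfilingAllocatorGuard`. A minimal sketch under stated assumptions, this is the mobile CPU allocator path (`C10_MOBILE`), the header path is assumed from the new file's location, and `run_workload` is a hypothetical stand-in for the profiled model.

```cpp
#include <c10/mobile/CPUProfilingAllocator.h>  // assumed header path for the new allocator
#include <ATen/ATen.h>

// Hypothetical workload; on mobile this would be the model's forward pass.
at::Tensor run_workload(const at::Tensor& input);

void profile_then_replay(const at::Tensor& input) {
  c10::AllocationPlan plan;
  {
    // Phase 1: record every allocation/free made on this thread into the plan.
    c10::WithProfileAllocationsGuard profile_guard(&plan);
    run_workload(input);
  }

  c10::CPUProfilingAllocator profiling_allocator;
  for (int i = 0; i < 10; ++i) {
    // Phase 2: replay the recorded allocation sequence out of one pre-sized blob.
    // Diverging from the plan (e.g. different control flow) raises a c10::Error,
    // as the test above asserts.
    c10::WithProfilingAllocatorGuard guard(&profiling_allocator, &plan);
    run_workload(input);
  }
}
```
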
diff --git a/c10/macros/Export.h b/c10/macros/Export.h index 5888207c5f80..966dd22e08fa 100644 --- a/c10/macros/Export.h +++ b/c10/macros/Export.h @@ -113,8 +113,8 @@ #define TORCH_HIP_API C10_IMPORT #endif -// Enums only need to be exported on windows -#ifdef _WIN32 +// Enums only need to be exported on windows for non-CUDA files +#if defined(_WIN32) && defined(__CUDACC__) #define C10_API_ENUM C10_API #else #define C10_API_ENUM diff --git a/c10/macros/Macros.h b/c10/macros/Macros.h index f8e703e08746..4bd46fe97dfe 100644 --- a/c10/macros/Macros.h +++ b/c10/macros/Macros.h @@ -193,11 +193,14 @@ namespace at { namespace cuda { using namespace c10::hip; }} #define C10_DEVICE __device__ #define C10_HOST __host__ // constants from (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications) -// The maximum number of threads per multiprocessor is 1024 for Turing architecture (7.5) -// but 2048 for previous architectures. You'll get warnings if you exceed these constants. +// The maximum number of threads per multiprocessor is 1024 for Turing architecture (7.5), +// 1536 for Geforce Ampere (8.6), +// and 2048 for all other architectures. You'll get warnings if you exceed these constants. // Hence, the following macros adjust the input values from the user to resolve potential warnings. #if __CUDA_ARCH__ == 750 constexpr uint32_t CUDA_MAX_THREADS_PER_SM = 1024; +#elif __CUDA_ARCH__ == 860 +constexpr uint32_t CUDA_MAX_THREADS_PER_SM = 1536; #else constexpr uint32_t CUDA_MAX_THREADS_PER_SM = 2048; #endif diff --git a/c10/mobile/CPUProfilingAllocator.cpp b/c10/mobile/CPUProfilingAllocator.cpp new file mode 100644 index 000000000000..3559c8ce280f --- /dev/null +++ b/c10/mobile/CPUProfilingAllocator.cpp @@ -0,0 +1,410 @@ +#include + +#include + +namespace c10 { + +namespace { +thread_local AllocationPlanner* allocation_planner{nullptr}; +thread_local CPUProfilingAllocator* profiling_allocator{nullptr}; + +struct MemBlock { + uint64_t start_offset, end_offset; + MemBlock(uint64_t s, uint64_t e) : start_offset(s), end_offset(e) {} + bool operator<(const MemBlock& other) const { + return end_offset <= other.start_offset; + } +}; + +bool validate_allocation_plan( + const std::vector& allocation_sizes, + const std::vector& allocation_offsets) { + std::set allocations; + for (uint64_t i = 0; i < allocation_sizes.size(); ++i) { + // Skip allocations not managed by AllocationPlan + if (allocation_offsets[i] == std::numeric_limits::max()) { + continue; + } + auto start_offset = allocation_offsets[i]; + auto end_offset = allocation_offsets[i] + allocation_sizes[i]; + if (!allocations.emplace(start_offset, end_offset).second) { + return false; + } + } + return true; +} + +enum class EventType { + Allocate = 0, + Free, + Invalid +}; + +struct MemEvent { + uint64_t time; + uint64_t allocation_id; + uint64_t size; + EventType type{EventType::Invalid}; + MemEvent(uint64_t t, uint64_t id, uint64_t s, EventType e) : + time(t), allocation_id(id), size(s), type(e) {} +}; + +std::vector create_and_sort_mem_events( + const std::vector& allocation_sizes, + const std::vector& allocation_lifetimes) { + std::vector events; + for (uint64_t i = 0; i < allocation_sizes.size(); ++i) { + // If observed allocation are freed outside the scope of + // observation, then allocations are not managed by the + // AllocationPlan. 
+ if (allocation_lifetimes[i] == std::numeric_limits::max()) { + continue; + } + events.emplace_back(i, i, allocation_sizes[i], EventType::Allocate); + events.emplace_back(allocation_lifetimes[i], i, allocation_sizes[i], EventType::Free); + } + std::sort( + events.begin(), + events.end(), + [](const MemEvent& a, + const MemEvent& b) -> bool {return a.time < b.time;}); + return events; +} + +std::vector formulate_greedy_allocation_plan( + const std::vector& allocation_sizes, + const std::vector& allocation_lifetimes) { + // Step 1. Construct all allocation/free events. + // Sort these events by timestamp. + // Step 2. Iterate through all events. + // 2.1 If allocate event: + // Find all candidate in free_size_to_offset map + // Greedily pick the first one. + // Remove the entry from free_size_to_offset map. + // new_offset = offset + request_size + // new_size = size - request_size + // Add new entry to both maps + // 2.2 If free event. + // Check if the returned offset merges with another chunk. + // If so merge until no more merging is possible. + // If returned offset does not merge, then + // just return it as a chunk. + + // lower_bound on this map will get all candidates of + // the right size for allocation. + std::map free_size_to_offset; + // This provides fast lookup when we want to insert freed block + // back, especially when we want to merge blocks. + ska::flat_hash_map::iterator> free_start_offset_to_size_iter; + ska::flat_hash_map::iterator> free_end_offset_to_size_iter; + // Upon free end_ptr = offset + size + // If end_ptr exists merge freed allocation + // Also find coresponding offset in size_to_offet + // Remove that entry and update with new size and offset + // If end_ptr does not exist then just insert offset,size + // in map and correspondingly size, offset in the other map. + // Merging should always be done recursively until no more chunks + // that can be found. + // After last free we should have only one entry left in these maps. + ska::flat_hash_map allocated_offset_to_size; + + std::vector allocation_offsets( + allocation_sizes.size(), std::numeric_limits::max()); + auto mem_events = create_and_sort_mem_events(allocation_sizes, allocation_lifetimes); + uint64_t max_offset{0}; + for (const auto& mem_event : mem_events) { + uint64_t alloc_offset; + uint64_t new_offset, new_size; + if (mem_event.type == EventType::Allocate) { + auto it = free_size_to_offset.lower_bound(mem_event.size); + if (it == free_size_to_offset.end()) { + // If there is no contiguous block of the size requested + // allocate a new one. + alloc_offset = max_offset; + max_offset += mem_event.size; + allocated_offset_to_size.emplace(alloc_offset, mem_event.size); + } else { + // If we have found a block of the size we want + // 1. change the block by allocating out of it. + // 1.1 Erase the entire block + // 1.2 Erase the reverse map entries + // 2. If block still has space left insert the remainder back in map. + // Including reverse map entries. + // 3. Insert the allocated block in allocated_offset_to_size. 
+ alloc_offset = it->second; + new_offset = alloc_offset + mem_event.size; + new_size = it->first - mem_event.size; + free_size_to_offset.erase(it); + free_start_offset_to_size_iter.erase(alloc_offset); + free_end_offset_to_size_iter.erase(alloc_offset + it->first); + if (new_size > 0) { + auto ref_it = free_size_to_offset.emplace(new_offset, new_size).first; + free_start_offset_to_size_iter.emplace(new_offset, ref_it); + free_end_offset_to_size_iter.emplace(new_offset + new_size, ref_it); + } + allocated_offset_to_size.emplace(alloc_offset, mem_event.size); + } + allocation_offsets[mem_event.allocation_id] = alloc_offset; + } else { + // 1. Check if freed block is adjancent to an existing free block + // at its end boundary. This is done by checking + // free_end_offset_to_size_iter. + // If we find such a block, remove it and adjust size of + // the block being freed. + // 2. Similarly check if freed block is adjacent to an existing + // free block at start boundary. This is done by checking + // free_start_offset_to_size_iter. + // If we find such a block, remove it and adjust size of + // the block being freed. + // 3. Inser the freed block in map. + auto freed_offset = allocation_offsets[mem_event.allocation_id]; + auto freed_size = mem_event.size; + auto end_offset = freed_offset + freed_size; + // Merge when another free block exist at the end of this block + auto end_it = free_end_offset_to_size_iter.find(end_offset); + if (end_it != free_end_offset_to_size_iter.end()) { + auto size_to_end_offset_iter = end_it->second; + freed_size += size_to_end_offset_iter->first; + free_size_to_offset.erase(size_to_end_offset_iter); + free_end_offset_to_size_iter.erase(end_it); + } + // Merge when freed block exist at the end of another free block + auto start_it = free_start_offset_to_size_iter.find(freed_offset); + if (start_it != free_start_offset_to_size_iter.end()) { + auto size_to_start_offset_iter = start_it->second; + freed_size += size_to_start_offset_iter->first; + freed_offset -= size_to_start_offset_iter->first; + free_size_to_offset.erase(size_to_start_offset_iter); + free_start_offset_to_size_iter.erase(start_it); + } + allocated_offset_to_size.erase(freed_offset); + auto freed_block_it = + free_size_to_offset.emplace(freed_size, freed_offset).first; + free_start_offset_to_size_iter.emplace(freed_offset, freed_block_it); + free_end_offset_to_size_iter.emplace( + freed_offset + freed_size, freed_block_it); + } + } + TORCH_CHECK(validate_allocation_plan(allocation_sizes, allocation_offsets), + "Allocation plan invaild."); + return allocation_offsets; +} + +} // namespace + +void AllocationPlan::clear() { + allocation_sizes.clear(); + allocation_lifetimes.clear(); + allocation_offsets.clear(); +} + +void AllocationPlanner::record_allocation( + const uint64_t size, const void* ptr) { + if (validation_mode_) { + validation_success = validation_success && validate_allocation(size, ptr); + return; + } + allocation_plan_->allocation_sizes.push_back(size); + allocation_plan_->allocation_lifetimes.push_back( + std::numeric_limits::max()); + allocation_ptr_to_id_.emplace(ptr, allocation_id_); + allocation_id_++; +} + +void AllocationPlanner::record_free(const void* ptr) { + if (validation_mode_) { + validation_success = validation_success && validate_free(ptr); + return; + } + auto it = allocation_ptr_to_id_.find(ptr); + if (it == allocation_ptr_to_id_.end()) { + // Free being recorded was allocated outside of WithProfileAllocationGuard + return; + } + auto id = it->second; + TORCH_CHECK(id < 
allocation_plan_->allocation_lifetimes.size(), + "Allocation must have been recorded during record_allocation."); + allocation_plan_->allocation_lifetimes[id] = allocation_id_; +} + +bool AllocationPlanner::validate_allocation( + const uint64_t size, const void* ptr) { + if (allocation_id_ >= allocation_plan_->allocation_sizes.size() || + allocation_plan_->allocation_sizes[allocation_id_] != size) { + TORCH_WARN( + "Allocation request does not match plan:", + "Allocation id:", + allocation_id_, + ", Number of recorded allocations:", + allocation_plan_->allocation_sizes.size(), + ", Recorded size of the requested allocation:", + allocation_plan_->allocation_sizes[allocation_id_], + ", but got:", + size); + + return false; + } + allocation_ptr_to_id_.emplace(ptr, allocation_id_); + allocation_id_++; + return true; +} + +bool AllocationPlanner::validate_free(const void* ptr) { + auto it = allocation_ptr_to_id_.find(ptr); + if (it == allocation_ptr_to_id_.end()) { + // Allocation that was made outside the validation scope is being freed here + return true; + } + auto id = (*it).second; + TORCH_CHECK(id < allocation_plan_->allocation_lifetimes.size(), + "Allocation must have been recorded during validate_allocation."); + auto lifetime_id = allocation_plan_->allocation_lifetimes[id]; + return (lifetime_id == allocation_id_); +} + +void AllocationPlanner::formulate_plan() { + allocation_plan_->allocation_offsets = + formulate_greedy_allocation_plan( + allocation_plan_->allocation_sizes, allocation_plan_->allocation_lifetimes); + allocation_plan_->total_size = 0; + for (auto i = 0; i < allocation_plan_->allocation_sizes.size(); ++i) { + if (allocation_plan_->allocation_lifetimes[i] == + std::numeric_limits::max()) { + continue; + } + auto limit = allocation_plan_->allocation_offsets[i] + allocation_plan_->allocation_sizes[i]; + allocation_plan_->total_size = std::max(allocation_plan_->total_size, limit); + } +} + +void AllocationPlanner::clear() { + allocation_plan_->clear(); + allocation_ptr_to_id_.clear(); +} + +void CPUProfilingAllocator::set_plan(const AllocationPlan* plan) { + TORCH_CHECK(plan != nullptr, "Allocation plan is nullptr."); + plan_ = plan; + allocation_id_ = 0; + allocation_ptr_to_id_.clear(); + if (current_size_ < plan->total_size) { + // Free existing memory and reallocate for larger size. + c10::free_cpu(blob_); + blob_ = c10::alloc_cpu(plan->total_size); + current_size_ = plan->total_size; + } +} + +void CPUProfilingAllocator::unset_plan() { + allocation_id_ = 0; + allocation_ptr_to_id_.clear(); + plan_ = nullptr; +} + +void* CPUProfilingAllocator::allocate(const size_t bytes) { + TORCH_CHECK(bytes == plan_->allocation_sizes[allocation_id_], + "Got allocation request that does not match with the plan."); + if (plan_->allocation_lifetimes[allocation_id_] == + std::numeric_limits::max()) { + // This allocation is not managed by ProfilingAllocator. + allocation_id_++; + return c10::alloc_cpu(bytes); + } + void* ptr = + reinterpret_cast(blob_) + + plan_->allocation_offsets[allocation_id_]; + TORCH_CHECK(allocation_ptr_to_id_.emplace(ptr, allocation_id_).second); + allocation_id_++; + return ptr; +} + +void CPUProfilingAllocator::free(void* const ptr) { + auto it = allocation_ptr_to_id_.find(ptr); + if (it == allocation_ptr_to_id_.end()) { + // Either + // 1. Allocation that was made outside the validation scope is being freed here + // or + // 2. Allocation that is not managed by profiling allocator is being freed. + // Example of the second type + // Tensor out; + // for (....) 
{ + // { + // CPUProfilingAllocator + // out = ...some op (This also frees previous memory held by out) + // } + // out is used.. + // } + c10::free_cpu(ptr); + return; + } + auto id = it->second; + TORCH_CHECK(id < plan_->allocation_lifetimes.size(), + "Freeing allocation that is not accordingly to the plan."); + auto lifetime_id = plan_->allocation_lifetimes[id]; + TORCH_CHECK( + lifetime_id == allocation_id_, + "Lifetime of allocations do not match: allocation_id ", + id, + ", expected:", + lifetime_id, + ", got:", + allocation_id_); +} + +CPUProfilingAllocator::~CPUProfilingAllocator() { + c10::free_cpu(blob_); +} + +WithProfileAllocationsGuard::WithProfileAllocationsGuard( + AllocationPlan* plan) { + // Nesting of allocation profiling does not seem meanigful. + TORCH_CHECK(allocation_planner == nullptr, + "Nesting profiling allocations is not supported."); + planner_ = std::make_unique(plan); + planner_->clear(); + allocation_planner = planner_.get(); +} + +WithProfileAllocationsGuard::~WithProfileAllocationsGuard() { + planner_->formulate_plan(); + allocation_planner = nullptr; +} + +WithValidateAllocationPlanGuard::WithValidateAllocationPlanGuard( + AllocationPlan* plan, bool* success) { + // Nesting of allocation profiling does not seem meanigful. + TORCH_CHECK(allocation_planner == nullptr, + "Nesting profiling allocations is not supported."); + planner_ = std::make_unique(plan, true); + success_ = success; + allocation_planner = planner_.get(); +} + +WithValidateAllocationPlanGuard::~WithValidateAllocationPlanGuard() { + *success_ = planner_->validation_success; + allocation_planner = nullptr; +} + +AllocationPlanner* GetThreadLocalAllocationPlanner() { + return allocation_planner; +} + +WithProfilingAllocatorGuard::WithProfilingAllocatorGuard( + CPUProfilingAllocator* allocator, const AllocationPlan* plan) { + // Nesting of profiling allocator is not supported. + TORCH_CHECK(profiling_allocator == nullptr, + "Nesting profiling allocators is not supported."); + profiling_allocator = allocator; + profiling_allocator->set_plan(plan); +} + +WithProfilingAllocatorGuard::~WithProfilingAllocatorGuard() { + profiling_allocator->unset_plan(); + profiling_allocator = nullptr; +} + +CPUProfilingAllocator* GetThreadLocalProfilingAllocator() { + return profiling_allocator; +} + +} // namespace c10 diff --git a/c10/mobile/CPUProfilingAllocator.h b/c10/mobile/CPUProfilingAllocator.h new file mode 100644 index 000000000000..4a7e79fe2857 --- /dev/null +++ b/c10/mobile/CPUProfilingAllocator.h @@ -0,0 +1,149 @@ +#pragma once + +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace c10 { + +/* + * Given a sequence of allocations in a thread, AllocationPlan records + * 1. size of each allocation + * 2. Lifetime of each allocation. + * 3. allocation offsets: Memory offset for each allocation in a single blob of memory + * 4. Total size of a blob of memory required to satisfy all the allocations. + */ +class C10_API AllocationPlan { + private: + // Records size of each allocation by their sequential allocation ids. + std::vector allocation_sizes; + // This maps one allocation id (X) to another allocation id (Y). + // Allocation X is alive until allocation Y. From allocation Y onwards + // allocation X is not referenced. + // Thus Y is the id of the first allocation after X is freed. 
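To make the lifetime encoding concrete, here is a minimal sketch that drives the planner directly through the guard API declared in this header. The stack buffers and the explicit record_allocation/record_free calls are only stand-ins; in the intended flow the underlying CPU allocator is assumed to report its allocations and frees to the thread-local planner:

    #include <c10/mobile/CPUProfilingAllocator.h>

    void profile_lifetimes() {
      c10::AllocationPlan plan;
      {
        c10::WithProfileAllocationsGuard guard(&plan);
        auto* planner = c10::GetThreadLocalAllocationPlanner();
        char a[128], b[256], c[128];           // stand-ins for real buffers
        planner->record_allocation(128, a);    // allocation id 0
        planner->record_allocation(256, b);    // allocation id 1
        planner->record_free(a);               // lifetime of id 0 becomes 2
        planner->record_allocation(128, c);    // allocation id 2; may reuse id 0's offset
        planner->record_free(b);               // lifetime of id 1 becomes 3
        planner->record_free(c);               // lifetime of id 2 becomes 3
      }  // ~WithProfileAllocationsGuard() calls formulate_plan()
      // plan now holds sizes {128, 256, 128}, lifetimes {2, 3, 3}, and the
      // computed offsets/total_size for a single backing blob.
    }

Once recorded, the plan is meant to be validated over a few more representative runs with WithValidateAllocationPlanGuard and, if validation succeeds, handed to a CPUProfilingAllocator through WithProfilingAllocatorGuard, as the usage comments later in this header describe.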
+  // NB: When an allocation is recorded, along with recording its size,
+  // we also set the lifetime to be numeric_limits<uint64_t>::max()
+  // This is to track allocations that are made during the scope of
+  // profiling but were not freed until after the scope ended.
+  // Such allocations are not managed by profiling allocator.
+  std::vector<uint64_t> allocation_lifetimes;
+  // Maps an allocation to some offset in a blob of memory.
+  std::vector<uint64_t> allocation_offsets;
+  uint64_t total_size{0};
+  void clear();
+  friend class AllocationPlanner;
+  friend class CPUProfilingAllocator;
+};
+
+/*
+ * Map of memory ptr to allocation id. This is auxiliary information only
+ * used to establish lifetime of allocations.
+ */
+class C10_API AllocationPlanner {
+ private:
+  AllocationPlan* allocation_plan_{nullptr};
+  // Maps allocated ptr to its allocation id.
+  // This is used when freeing the memory to lookup the allocation id
+  // in order to establish the lifetime of a particular allocation.
+  ska::flat_hash_map<const void*, uint64_t> allocation_ptr_to_id_;
+  uint64_t allocation_id_{0};
+  bool validation_mode_{false};
+
+  bool validate_allocation(const uint64_t size, const void* ptr);
+  bool validate_free(const void* ptr);
+ public:
+  bool validation_success{true};
+
+  AllocationPlanner() = delete;
+  AllocationPlanner(AllocationPlan* plan, bool validate = false) :
+    allocation_plan_(plan), validation_mode_(validate) {}
+  void record_allocation(const uint64_t size, const void* ptr);
+  void record_free(const void* ptr);
+  void formulate_plan();
+  void clear();
+};
+
+// NOT THREAD SAFE profiling allocator.
+class C10_API CPUProfilingAllocator {
+ private:
+  const AllocationPlan* plan_{nullptr};
+  uint64_t allocation_id_{0};
+  uint64_t current_size_{0};
+  void* blob_{nullptr};
+  ska::flat_hash_map<const void*, uint64_t> allocation_ptr_to_id_;
+ public:
+  ~CPUProfilingAllocator();
+  void set_plan(const AllocationPlan* plan);
+  void unset_plan();
+  void* allocate(const size_t bytes);
+  void free(void* const ptr);
+};
+
+/*
+ * Usage: Profile allocations made by one run of the model.
+ * AllocationPlan plan;
+ * {
+ *   WithProfileAllocationsGuard profile_guard(&plan);
+ *   module.forward(...);
+ * }
+ * plan now contains allocation plan.
+ */
+class C10_API WithProfileAllocationsGuard {
+ public:
+  WithProfileAllocationsGuard(AllocationPlan* plan);
+  ~WithProfileAllocationsGuard();
+ private:
+  std::unique_ptr<AllocationPlanner> planner_;
+};
+
+/*
+ * Usage: Validate allocation plan made with WithProfileAllocationsGuard
+ * bool plan_validation_success, success = true;
+ * for (some number of representative inputs)
+ * {
+ *   WithValidateAllocationPlanGuard validation_guard(&plan, &plan_validation_success);
+ *   module.forward(...);
+ *   success = success && plan_validation_success;
+ * }
+ * success == true means allocations are according to plan,
+ * else the allocation pattern changed for some inputs.
+ */
+class C10_API WithValidateAllocationPlanGuard {
+ public:
+  WithValidateAllocationPlanGuard(AllocationPlan* plan, bool* success);
+  ~WithValidateAllocationPlanGuard();
+ private:
+  std::unique_ptr<AllocationPlanner> planner_;
+  bool* success_;
+};
+
+AllocationPlanner* GetThreadLocalAllocationPlanner();
+
+/*
+ * Usage: Allocate tensors according to the allocation plan.
+ * First make allocation plan.
+ * See WithProfileAllocationsGuard usage.
+ * Second validate allocation plan.
+ * See WithValidateAllocationPlanGuard usage.
+ * CPUProfilingAllocator profiling_allocator; + * { + * WithProfilingAllocatorGuard allocator_guard(&profiling_allocator, &plan); + * module.forward(...); + * } + */ +class C10_API WithProfilingAllocatorGuard { + public: + WithProfilingAllocatorGuard( + CPUProfilingAllocator* allocator, const AllocationPlan* plan); + ~WithProfilingAllocatorGuard(); +}; + +CPUProfilingAllocator* GetThreadLocalProfilingAllocator(); + +} // namespace c10 diff --git a/caffe2/core/plan_executor.cc b/caffe2/core/plan_executor.cc index 3f70e96fffc8..c7c0200e5880 100644 --- a/caffe2/core/plan_executor.cc +++ b/caffe2/core/plan_executor.cc @@ -17,10 +17,18 @@ C10_DEFINE_bool( "If used we will handle exceptions in executor threads. " "This avoids SIGABRT but may cause process to deadlock"); +C10_DEFINE_int( + caffe2_plan_executor_exception_timeout, + 60, + "Number of seconds to wait for concurrent threads to stop on exception" + "before terminating."); + namespace caffe2 { namespace { +// ExceptionWrapper holds an exception. If exception pointers are being used, +// it'll hold the original exception pointer otherwise just the message. class ExceptionWrapper { public: ExceptionWrapper() : hasException_(false) {} @@ -39,6 +47,10 @@ class ExceptionWrapper { #endif } + const std::string& what() const { + return exceptionMsg_; + } + operator bool() { return hasException_; } @@ -51,6 +63,33 @@ class ExceptionWrapper { std::string exceptionMsg_; }; +// ExceptionWrapperTerminate terminates the program with the specified +// exception. This preserves the exception ptr and ExceptionTracer will +// correctly grab it on exit. +class ExceptionWrapperTerminate { + public: + explicit ExceptionWrapperTerminate(ExceptionWrapper&& ew) : ew_(std::move(ew)) {} + + ~ExceptionWrapperTerminate() { + ew_.rethrowException(); + } + + private: + ExceptionWrapper ew_; +}; + +// ScopeExitGuard runs the provided function when it's destructed. +class ScopeExitGuard { + public: + explicit ScopeExitGuard(std::function&& f) : f_(std::move(f)) {} + ~ScopeExitGuard() { + f_(); + } + + private: + std::function f_; +}; + struct NetDefInfo { const NetDef* netDef; // in order to keep the "override existing nets" on the top-level workflow, @@ -460,9 +499,16 @@ bool ExecuteStepRecursive(ExecutionStepWrapper& stepWrapper) { << " with " << step.substep().size() << " concurrent substeps"; std::atomic next_substep{0}; + std::condition_variable cv; + std::atomic done{0}; std::mutex exception_mutex; ExceptionWrapper first_exception; auto worker = [&]() { + ScopeExitGuard on_exit([&] { + done += 1; + cv.notify_all(); + }); + auto num_substeps = compiledStep->recurringSubsteps.size(); int substep_id = next_substep++ % num_substeps; if (compiledStep->gotFailure) { @@ -500,6 +546,23 @@ bool ExecuteStepRecursive(ExecutionStepWrapper& stepWrapper) { for (size_t i = 0; i < numThreads; ++i) { threads.emplace_back(worker); } + + auto workersDone = [&] { return done == numThreads; }; + + // If we get an exception, try to wait for all threads to stop + // gracefully. 
+ std::unique_lock guard(exception_mutex); + cv.wait(guard, [&] { return workersDone() || first_exception; }); + cv.wait_for( + guard, + std::chrono::seconds(FLAGS_caffe2_plan_executor_exception_timeout), + [&] { return workersDone(); }); + if (!workersDone() && first_exception) { + LOG(ERROR) << "failed to stop concurrent workers after exception: " + << first_exception.what(); + ExceptionWrapperTerminate(std::move(first_exception)); + } + for (auto& thread : threads) { thread.join(); } diff --git a/caffe2/core/plan_executor_test.cc b/caffe2/core/plan_executor_test.cc index 86f145d72a09..1b0eb0e718a2 100644 --- a/caffe2/core/plan_executor_test.cc +++ b/caffe2/core/plan_executor_test.cc @@ -67,6 +67,29 @@ class ErrorOp final : public Operator { REGISTER_CPU_OPERATOR(Error, ErrorOp); OPERATOR_SCHEMA(Error).NumInputs(0).NumOutputs(0); +static std::atomic blockingErrorRuns{0}; +class BlockingErrorOp final : public Operator { + public: + BlockingErrorOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws) {} + + bool RunOnDevice() override { + // First n op executions should block and then start throwing errors. + if (blockingErrorRuns.fetch_sub(1) >= 1) { + LOG(INFO) << "blocking"; + while (true) { + std::this_thread::sleep_for(std::chrono::hours(10)); + } + } else { + LOG(INFO) << "throwing"; + throw TestError(); + } + } +}; + +REGISTER_CPU_OPERATOR(BlockingError, BlockingErrorOp); +OPERATOR_SCHEMA(BlockingError).NumInputs(0).NumOutputs(0); + PlanDef parallelErrorPlan() { PlanDef plan_def; @@ -101,10 +124,12 @@ PlanDef parallelErrorPlan() { } struct HandleExecutorThreadExceptionsGuard { - HandleExecutorThreadExceptionsGuard() { + HandleExecutorThreadExceptionsGuard(int timeout = 60) { globalInit({ "caffe2", "--caffe2_handle_executor_threads_exceptions=1", + "--caffe2_plan_executor_exception_timeout=" + + caffe2::to_string(timeout), }); } @@ -139,6 +164,38 @@ TEST(PlanExecutorTest, ErrorAsyncPlan) { ASSERT_EQ(cancelCount, 1); } +TEST(PlanExecutorTest, BlockingErrorPlan) { + ASSERT_DEATH( + [] { + HandleExecutorThreadExceptionsGuard guard(/*timeout=*/1); + + PlanDef plan_def; + + std::string plan_def_template = R"DOC( + network { + name: "net" + op { + type: "BlockingError" + } + } + execution_step { + num_concurrent_instances: 2 + substep { + network: "net" + } + } + )DOC"; + + CAFFE_ENFORCE( + TextFormat::ParseFromString(plan_def_template, &plan_def)); + Workspace ws; + blockingErrorRuns = 1; + ws.RunPlan(plan_def); + FAIL() << "shouldn't have reached this point"; + }(), + "failed to stop concurrent workers after exception: test error"); +} + } // namespace caffe2 #endif diff --git a/caffe2/operators/slice_op.cc b/caffe2/operators/slice_op.cc index 7acf854ba9da..f9fd39303261 100644 --- a/caffe2/operators/slice_op.cc +++ b/caffe2/operators/slice_op.cc @@ -17,7 +17,7 @@ Produces a slice of the input tensor. - Start and end indices are either passed as two 1D input tensors or using the `starts` and `ends` arguments. -- If a negative value is passed for any of the start or end indices, it represents the number of elements before the end of that dimension. End indices are non-inclusive unless negative (end index -1 means up to and including the last element). +- If a negative value is passed for any of the start or end indices, it represents |value| - 1 elements before the end of that dimension. End indices are non-inclusive unless negative (end index -1 means up to and including the last element). 
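The reworded semantics match the normalization the operator applies internally (see the slice_op.h and slice_op.cu hunks below): a negative index v on a dimension of size n is mapped to n + 1 + v and then clamped to n. A small standalone sketch of that arithmetic, with normalize_index being a hypothetical helper rather than anything in the operator:

    #include <algorithm>
    #include <cstdint>
    #include <iostream>

    // Simplified mirror of the index normalization in SliceImpl: negative
    // indices count from the end of the dimension, and the result is clamped
    // to the dimension size.
    int64_t normalize_index(int64_t idx, int64_t dim_size) {
      if (idx < 0) {
        idx = dim_size + 1 + idx;
      }
      return std::min(idx, dim_size);
    }

    int main() {
      const int64_t n = 10;
      // starts = -3, ends = -1 on a dimension of size 10 selects [8, 10),
      // i.e. the last two elements.
      std::cout << normalize_index(-3, n) << " "    // 8
                << normalize_index(-1, n) << "\n";  // 10
    }

With this convention an end index of -1 keeps the last element, unlike Python slicing, where -1 as a stop excludes it.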
Github Links: - https://github.com/pytorch/pytorch/blob/master/caffe2/operators/slice_op.cc @@ -67,11 +67,11 @@ print("Y:", workspace.FetchBlob("Y")) .Input( 1, "starts", - "(*Tensor``*): 1D tensor of start-indices for each dimension of data") + "(*Tensor``*): 1D tensor of start-indices for each dimension of data (dimensions following the sliced one might be omitted)") .Input( 2, "ends", - "(*Tensor``*): 1D tensor of end-indices for each dimension of data") + "(*Tensor``*): 1D tensor of end-indices for each dimension of data (dimensions following the sliced one might be omitted)") .Arg("starts", "(*Tuple(int)*): list of starting indices") .Arg("ends", "(*Tuple(int)*): list of ending indices") .TensorInferenceFunction([](const OperatorDef& def, @@ -90,9 +90,10 @@ print("Y:", workspace.FetchBlob("Y")) for (int i = 0; i < data.dims_size(); ++i) { if (i >= starts.size()) { + dst_sizes[i] = data.dims(i); continue; } - if (data.dims_size() > 0) { + if (data.dims(i) > 0) { auto start = starts[i]; auto end = ends[i]; if (start < 0) { diff --git a/caffe2/operators/slice_op.cu b/caffe2/operators/slice_op.cu index 7a843fee3a52..184385310c9c 100644 --- a/caffe2/operators/slice_op.cu +++ b/caffe2/operators/slice_op.cu @@ -74,22 +74,23 @@ bool SliceImplGpu( if (i >= starts.numel()) { starts_idx[i] = 0; ends_idx[i] = data.size(i); + dst_sizes[i] = data.size(i); continue; } if (data.size(i) > 0) { auto start = starts_data[i]; auto end = ends_data[i]; if (start < 0) { - start = data.sizes()[i] + 1 + start; + start = data.size(i) + 1 + start; } if (end < 0) { - end = data.sizes()[i] + 1 + end; + end = data.size(i) + 1 + end; } - if (start > data.sizes()[i]) { - start = data.sizes()[i]; + if (start > data.size(i)) { + start = data.size(i); } - if (end > data.sizes()[i]) { - end = data.sizes()[i]; + if (end > data.size(i)) { + end = data.size(i); } CAFFE_ENFORCE_GE(start, 0); CAFFE_ENFORCE_GE(end, 0); @@ -115,7 +116,7 @@ bool SliceImplGpu( // for now only supports slicing in 1 dimension int dim = -1; for (int i = 0; i < data.dim(); ++i) { - if (starts_idx[i] > 0 || ends_idx[i] < data.sizes()[i]) { + if (starts_idx[i] > 0 || ends_idx[i] < data.size(i)) { CAFFE_ENFORCE_EQ( dim, -1, "Currently only possible to slice in 1 dimension."); dim = i; @@ -154,7 +155,7 @@ bool SliceImplGpu( size_t src_nbytes = data.nbytes(); size_t dst_nbytes = output->nbytes(); - size_t src_block_size = unit * data.sizes()[dim]; + size_t src_block_size = unit * data.size(dim); size_t dst_block_size = unit * (ends_idx[dim] - starts_idx[dim]); size_t src_offset = unit * starts_idx[dim]; @@ -187,7 +188,7 @@ bool SliceImplGpu( size_t dst_nbytes = gdata->nbytes(); size_t src_block_size = unit * (ends_idx[dim] - starts_idx[dim]); - size_t dst_block_size = unit * data.sizes()[dim]; + size_t dst_block_size = unit * data.size(dim); size_t dst_offset = unit * starts_idx[dim]; if (num_blocks == 0 || dst_block_size == 0) { diff --git a/caffe2/operators/slice_op.h b/caffe2/operators/slice_op.h index 8d1990e54c38..9706472315b6 100644 --- a/caffe2/operators/slice_op.h +++ b/caffe2/operators/slice_op.h @@ -33,23 +33,24 @@ bool SliceImpl( for (int i = 0; i < data.dim(); ++i) { if (i >= starts.numel()) { starts_idx[i] = 0; - ends_idx[i] = data.sizes()[i]; + ends_idx[i] = data.size(i); + dst_sizes[i] = data.size(i); continue; } - if (data.sizes()[i] > 0) { + if (data.size(i) > 0) { auto start = starts_data[i]; auto end = ends_data[i]; if (start < 0) { - start = data.sizes()[i] + 1 + start; + start = data.size(i) + 1 + start; } if (end < 0) { - end = 
data.sizes()[i] + 1 + end; + end = data.size(i) + 1 + end; } - if (start > data.sizes()[i]) { - start = data.sizes()[i]; + if (start > data.size(i)) { + start = data.size(i); } - if (end > data.sizes()[i]) { - end = data.sizes()[i]; + if (end > data.size(i)) { + end = data.size(i); } CAFFE_ENFORCE_GE(start, 0); CAFFE_ENFORCE_GE(end, 0); @@ -78,7 +79,7 @@ bool SliceImpl( // for now only supports slicing in 1 dimension int dim = -1; for (int i = 0; i < data.dim(); ++i) { - if (starts_idx[i] > 0 || ends_idx[i] < data.sizes()[i]) { + if (starts_idx[i] > 0 || ends_idx[i] < data.size(i)) { CAFFE_ENFORCE_EQ( dim, -1, "Currently only possible to slice in 1 dimension."); dim = i; @@ -117,7 +118,7 @@ bool SliceImpl( size_t src_nbytes = data.nbytes(); size_t dst_nbytes = output->nbytes(); - size_t src_block_size = unit * data.sizes()[dim]; + size_t src_block_size = unit * data.size(dim); size_t dst_block_size = unit * (ends_idx[dim] - starts_idx[dim]); size_t src_offset = unit * starts_idx[dim]; @@ -155,7 +156,7 @@ bool SliceImpl( size_t dst_nbytes = gdata->nbytes(); size_t src_block_size = unit * (ends_idx[dim] - starts_idx[dim]); - size_t dst_block_size = unit * data.sizes()[dim]; + size_t dst_block_size = unit * data.size(dim); size_t dst_offset = unit * starts_idx[dim]; if (num_blocks == 0 || dst_block_size == 0) { diff --git a/caffe2/operators/string_ops.cc b/caffe2/operators/string_ops.cc index 76fedeb488f8..7339d772f473 100644 --- a/caffe2/operators/string_ops.cc +++ b/caffe2/operators/string_ops.cc @@ -71,6 +71,17 @@ struct EndsWith { std::string suffix_; }; +struct StrEquals { + explicit StrEquals(OperatorBase& op) + : text_(op.GetSingleArgument("text", "")) {} + bool operator()(const std::string& str) { + return str == text_; + } + + private: + std::string text_; +}; + struct Prefix { explicit Prefix(OperatorBase& op) : length_(op.GetSingleArgument("length", 3)) {} @@ -108,6 +119,9 @@ REGISTER_CPU_OPERATOR( REGISTER_CPU_OPERATOR( StringEndsWith, StringElementwiseOp>); +REGISTER_CPU_OPERATOR( + StringEquals, + StringElementwiseOp>); REGISTER_CPU_OPERATOR(StringJoin, StringJoinOp); OPERATOR_SCHEMA(StringPrefix) @@ -164,6 +178,17 @@ Returns tensor of boolean of the same dimension of input. .Input(0, "strings", "Tensor of std::string.") .Output(0, "bools", "Tensor of bools of same shape as input."); +OPERATOR_SCHEMA(StringEquals) + .NumInputs(1) + .NumOutputs(1) + .SetDoc(R"DOC( +Performs equality check on each string in the input tensor. +Returns tensor of booleans of the same dimension as input. 
+)DOC") + .Arg("text", "The text to check input strings equality against.") + .Input(0, "strings", "Tensor of std::string.") + .Output(0, "bools", "Tensor of bools of same shape as input."); + OPERATOR_SCHEMA(StringJoin) .NumInputs(1) .NumOutputs(1) @@ -187,6 +212,7 @@ SHOULD_NOT_DO_GRADIENT(StringPrefix); SHOULD_NOT_DO_GRADIENT(StringSuffix); SHOULD_NOT_DO_GRADIENT(StringStartsWith); SHOULD_NOT_DO_GRADIENT(StringEndsWith); +SHOULD_NOT_DO_GRADIENT(StringEquals); SHOULD_NOT_DO_GRADIENT(StringJoin); } } // namespace caffe2 diff --git a/caffe2/python/operator_test/string_ops_test.py b/caffe2/python/operator_test/string_ops_test.py index eedb57be1d6c..a0c56a686666 100644 --- a/caffe2/python/operator_test/string_ops_test.py +++ b/caffe2/python/operator_test/string_ops_test.py @@ -119,6 +119,33 @@ def string_ends_with_ref(strings): [strings], string_ends_with_ref) + @given(strings=st.text(alphabet=['a', 'b'])) + @settings(deadline=1000) + def test_string_equals(self, strings): + text = "" + if strings: + text = strings[0] + + strings = np.array( + [str(a) for a in strings], dtype=np.object + ) + + def string_equals_ref(strings): + return ( + np.array([a == text for a in strings], dtype=bool), + ) + + op = core.CreateOperator( + 'StringEquals', + ['strings'], + ['bools'], + text=text) + self.assertReferenceChecks( + hu.cpu_do, + op, + [strings], + string_equals_ref) + if __name__ == "__main__": import unittest unittest.main() diff --git a/cmake/public/cuda.cmake b/cmake/public/cuda.cmake index 8b60915f7e00..c9ac37783d1c 100644 --- a/cmake/public/cuda.cmake +++ b/cmake/public/cuda.cmake @@ -478,7 +478,7 @@ foreach(diag cc_clobber_ignored integer_sign_change useless_using_declaration endforeach() # Set C++14 support -set(CUDA_PROPAGATE_HOST_FLAGS_BLACKLIST "-Werror") +set(CUDA_PROPAGATE_HOST_FLAGS_BLOCKLIST "-Werror") if(MSVC) list(APPEND CUDA_NVCC_FLAGS "--Werror" "cross-execution-space-call") list(APPEND CUDA_NVCC_FLAGS "--no-host-device-move-forward") @@ -490,7 +490,7 @@ endif() # OpenMP flags for NVCC with Clang-cl if("${CMAKE_CXX_SIMULATE_ID}" STREQUAL "MSVC" AND "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") - list(APPEND CUDA_PROPAGATE_HOST_FLAGS_BLACKLIST "-Xclang" "-fopenmp") + list(APPEND CUDA_PROPAGATE_HOST_FLAGS_BLOCKLIST "-Xclang" "-fopenmp") if(MSVC_TOOLSET_VERSION LESS 142) list(APPEND CUDA_NVCC_FLAGS "-Xcompiler" "-openmp") else() diff --git a/docs/source/community/persons_of_interest.rst b/docs/source/community/persons_of_interest.rst index c152ca616571..f346fbe994e6 100644 --- a/docs/source/community/persons_of_interest.rst +++ b/docs/source/community/persons_of_interest.rst @@ -25,7 +25,6 @@ torch.* torch.nn ~~~~~~~~ -- Thomas Viehmann (`t-vi `__) - Adam Paszke (`apaszke `__) - Greg Chanan (`gchanan `__) - Soumith Chintala (`soumith `__) diff --git a/docs/source/quantization.rst b/docs/source/quantization.rst index b597fa9f51f3..b78ed2c08586 100644 --- a/docs/source/quantization.rst +++ b/docs/source/quantization.rst @@ -77,6 +77,261 @@ The corresponding implementation is chosen automatically based on the PyTorch bu ``torch.backends.quantized.engine = 'qnnpack'`` +Quantization API Summary +--------------------------------------- + +There are three types of quantization supported in PyTorch: + +1. dynamic quantization (weights quantized with activations read/stored in + floating point and quantized for compute.) +2. static quantization (weights quantized, activations quantized, calibration + required post training) +3. 
quantization aware training (weights quantized, activations quantized, + quantization numerics modeled during training) + +Please see our `Introduction to Quantization on Pytorch +`_ blog post +for a more comprehensive overview of the tradeoffs between these quantization +types. + +Dynamic Quantization +^^^^^^^^^^^^^^^^^^^^ + +This is the simplest to apply form of quantization where the weights are +quantized ahead of time but the activations are dynamically quantized +during inference. This is used for situations where the model execution time +is dominated by loading weights from memory rather than computing the matrix +multiplications. This is true for for LSTM and Transformer type models with +small batch size. + +Diagram:: + + # original model + # all tensors and computations are in floating point + previous_layer_fp32 -- linear_fp32 -- activation_fp32 -- next_layer_fp32 + / + linear_weight_fp32 + + # dynamically quantized model + # linear and conv weights are in int8 + previous_layer_fp32 -- linear_int8_w_fp32_inp -- activation_fp32 -- next_layer_fp32 + / + linear_weight_int8 + +API example:: + + import torch + + # define a floating point model + class M(torch.nn.Module): + def __init__(self): + super(M, self).__init__() + self.fc = torch.nn.Linear(4, 4) + + def forward(self, x): + x = self.fc(x) + return x + + # create a model instance + model_fp32 = M() + # create a quantized model instance + model_int8 = torch.quantization.quantize_dynamic( + model_fp32, # the original model + {torch.nn.Linear}, # a set of layers to dynamically quantize + dtype=torch.qint8) # the target dtype for quantized weights + + # run the model + input_fp32 = torch.randn(4, 4, 4, 4) + res = model_int8(input_fp32) + +To learn more about dynamic quantization please see our `dynamic quantization tutorial +`_. + +Static Quantization +^^^^^^^^^^^^^^^^^^^^ + +Static quantization quantizes the weights and activations of the model. It +fuses activations into preceding layers where possible. It requires +calibration with a representative dataset to determine optimal quantization +parameters for activations. Post Training Quantization is typically used when +both memory bandwidth and compute savings are important with CNNs being a +typical use case. Static quantization is also known as Post Training +Quantization or PTQ. 
+ +Diagram:: + + # original model + # all tensors and computations are in floating point + previous_layer_fp32 -- linear_fp32 -- activation_fp32 -- next_layer_fp32 + / + linear_weight_fp32 + + # statically quantized model + # weights and activations are in int8 + previous_layer_int8 -- linear_with_activation_int8 -- next_layer_int8 + / + linear_weight_int8 + +API Example:: + + import torch + + # define a floating point model where some layers could be statically quantized + class M(torch.nn.Module): + def __init__(self): + super(M, self).__init__() + # QuantStub converts tensors from floating point to quantized + self.quant = torch.quantization.QuantStub() + self.conv = torch.nn.Conv2d(1, 1, 1) + self.relu = torch.nn.ReLU() + # DeQuantStub converts tensors from quantized to floating point + self.dequant = torch.quantization.DeQuantStub() + + def forward(self, x): + # manually specify where tensors will be converted from floating + # point to quantized in the quantized model + x = self.quant(x) + x = self.conv(x) + x = self.relu(x) + # manually specify where tensors will be converted from quantized + # to floating point in the quantized model + x = self.dequant(x) + return x + + # create a model instance + model_fp32 = M() + + # model must be set to eval mode for static quantization logic to work + model_fp32.eval() + + # attach a global qconfig, which contains information about what kind + # of observers to attach. Use 'fbgemm' for server inference and + # 'qnnpack' for mobile inference. Other quantization configurations such + # as selecting symmetric or assymetric quantization and MinMax or L2Norm + # calibration techniques can be specified here. + model_fp32.qconfig = torch.quantization.get_default_qconfig('fbgemm') + + # Fuse the activations to preceding layers, where applicable. + # This needs to be done manually depending on the model architecture. + # Common fusions include `conv + relu` and `conv + batchnorm + relu` + model_fp32_fused = torch.quantization.fuse_modules(model_fp32, [['conv', 'relu']]) + + # Prepare the model for static quantization. This inserts observers in + # the model that will observe activation tensors during calibration. + model_fp32_prepared = torch.quantization.prepare(model_fp32_fused) + + # calibrate the prepared model to determine quantization parameters for activations + # in a real world setting, the calibration would be done with a representative dataset + input_fp32 = torch.randn(4, 1, 4, 4) + model_fp32_prepared(input_fp32) + + # Convert the observed model to a quantized model. This does several things: + # quantizes the weights, computes and stores the scale and bias value to be + # used with each activation tensor, and replaces key operators with quantized + # implementations. + model_int8 = torch.quantization.convert(model_fp32_prepared) + + # run the model, relevant calculations will happen in int8 + res = model_int8(input_fp32) + +To learn more about static quantization, please see the `static quantization tutorial +`_. + +Quantization Aware Training +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Quantization Aware Training models the effects of quantization during training +allowing for higher accuracy compared to other quantization methods. During +training, all calculations are done in floating point, with fake_quant modules +modeling the effects of quantization by clamping and rounding to simulate the +effects of INT8. After model conversion, weights and +activations are quantized, and activations are fused into the preceding layer +where possible. 
It is commonly used with CNNs and yields a higher accuracy +compared to static quantization. Quantization Aware Training is also known as +QAT. + +Diagram:: + + # original model + # all tensors and computations are in floating point + previous_layer_fp32 -- linear_fp32 -- activation_fp32 -- next_layer_fp32 + / + linear_weight_fp32 + + # model with fake_quants for modeling quantization numerics during training + previous_layer_fp32 -- fq -- linear_fp32 -- activation_fp32 -- fq -- next_layer_fp32 + / + linear_weight_fp32 -- fq + + # quantized model + # weights and activations are in int8 + previous_layer_int8 -- linear_with_activation_int8 -- next_layer_int8 + / + linear_weight_int8 + +API Example:: + + import torch + + # define a floating point model where some layers could benefit from QAT + class M(torch.nn.Module): + def __init__(self): + super(M, self).__init__() + # QuantStub converts tensors from floating point to quantized + self.quant = torch.quantization.QuantStub() + self.conv = torch.nn.Conv2d(1, 1, 1) + self.bn = torch.nn.BatchNorm2d(1) + self.relu = torch.nn.ReLU() + # DeQuantStub converts tensors from quantized to floating point + self.dequant = torch.quantization.DeQuantStub() + + def forward(self, x): + x = self.quant(x) + x = self.conv(x) + x = self.bn(x) + x = self.relu(x) + x = self.dequant(x) + return x + + # create a model instance + model_fp32 = M() + + # model must be set to train mode for QAT logic to work + model_fp32.train() + + # attach a global qconfig, which contains information about what kind + # of observers to attach. Use 'fbgemm' for server inference and + # 'qnnpack' for mobile inference. Other quantization configurations such + # as selecting symmetric or assymetric quantization and MinMax or L2Norm + # calibration techniques can be specified here. + model_fp32.qconfig = torch.quantization.get_default_qat_qconfig('fbgemm') + + # fuse the activations to preceding layers, where applicable + # this needs to be done manually depending on the model architecture + model_fp32_fused = torch.quantization.fuse_modules(model_fp32, + [['conv', 'bn', 'relu']]) + + # Prepare the model for QAT. This inserts observers and fake_quants in + # the model that will observe weight and activation tensors during calibration. + model_fp32_prepared = torch.quantization.prepare_qat(model_fp32_fused) + + # run the training loop (not shown) + training_loop(model_fp32_prepared) + + # Convert the observed model to a quantized model. This does several things: + # quantizes the weights, computes and stores the scale and bias value to be + # used with each activation tensor, fuses modules where appropriate, + # and replaces key operators with quantized implementations. + model_fp32_prepared.eval() + model_int8 = torch.quantization.convert(model_fp32_prepared) + + # run the model, relevant calculations will happen in int8 + res = model_int8(input_fp32) + +To learn more about quantization aware training, please see the `QAT +tutorial +`_. + Quantized Tensors --------------------------------------- @@ -121,79 +376,8 @@ cover typical CNN and RNN models torch.nn.quantized torch.nn.quantized.dynamic -Quantization Workflows ----------------------- - -PyTorch provides three approaches to quantize models. - -.. _quantization tutorials: - https://pytorch.org/tutorials/#quantization-experimental - -1. Post Training Dynamic Quantization: This is the simplest to apply form of - quantization where the weights are quantized ahead of time but the - activations are dynamically quantized during inference. 
This is used - for situations where the model execution time is dominated by loading - weights from memory rather than computing the matrix multiplications. - This is true for for LSTM and Transformer type models with small - batch size. Applying dynamic quantization to a whole model can be - done with a single call to :func:`torch.quantization.quantize_dynamic()`. - See the `quantization tutorials`_ -2. Post Training Static Quantization: This is the most commonly used form of - quantization where the weights are quantized ahead of time and the - scale factor and bias for the activation tensors is pre-computed - based on observing the behavior of the model during a calibration - process. Post Training Quantization is typically when both memory bandwidth - and compute savings are important with CNNs being a typical use case. - The general process for doing post training quantization is: - - - - 1. Prepare the model: - - a. Specify where the activations are quantized and dequantized explicitly - by adding QuantStub and DeQuantStub modules. - b. Ensure that modules are not reused. - c. Convert any operations that require requantization into modules - - 2. Fuse operations like conv + relu or conv+batchnorm + relu together to - improve both model accuracy and performance. - - 3. Specify the configuration of the quantization methods \'97 such as - selecting symmetric or asymmetric quantization and MinMax or - L2Norm calibration techniques. - 4. Use the :func:`torch.quantization.prepare` to insert modules - that will observe activation tensors during calibration - 5. Calibrate the model by running inference against a calibration - dataset - 6. Finally, convert the model itself with the - torch.quantization.convert() method. This does several things: it - quantizes the weights, computes and stores the scale and bias - value to be used each activation tensor, and replaces key - operators quantized implementations. - - See the `quantization tutorials`_ - - -3. Quantization Aware Training: In the rare cases where post training - quantization does not provide adequate accuracy training can be done - with simulated quantization using the - :class:`torch.quantization.FakeQuantize`. Computations will take place in - FP32 but with values clamped and rounded to simulate the effects of INT8 - quantization. The sequence of steps is very similar. - - - 1. Steps (1) and (2) are identical. - - 3. Specify the configuration of the fake quantization methods \'97 such as - selecting symmetric or asymmetric quantization and MinMax or Moving Average - or L2Norm calibration techniques. - 4. Use the :func:`torch.quantization.prepare_qat` to insert modules - that will simulate quantization during training. - 5. Train or fine tune the model. - 6. Identical to step (6) for post training quantization - - See the `quantization tutorials`_ - +Quantization Customizations +--------------------------- While default implementations of observers to select the scale factor and bias based on observed tensor data are provided, developers can provide their own @@ -218,9 +402,15 @@ prior to quantization. This is because currently quantization works on a module by module basis. Specifically, for all quantization techniques, the user needs to: 1. Convert any operations that require output requantization (and thus have - additional parameters) from functionals to module form. + additional parameters) from functionals to module form (for example, + using ``torch.nn.ReLU`` instead of ``torch.nn.functional.relu``). 2. 
Specify which parts of the model need to be quantized either by assigning - ```.qconfig`` attributes on submodules or by specifying ``qconfig_dict`` + ``.qconfig`` attributes on submodules or by specifying ``qconfig_dict``. + For example, setting ``model.conv1.qconfig = None`` means that the + ``model.conv`` layer will not be quantized, and setting + ``model.linear1.qconfig = custom_qconfig`` means that the quantization + settings for ``model.linear1`` will be using ``custom_qconfig`` instead + of the global qconfig. For static quantization techniques which quantize activations, the user needs to do the following in addition: @@ -238,6 +428,13 @@ to do the following in addition: to be fused. We currently support the following fusions: [Conv, Relu], [Conv, BatchNorm], [Conv, BatchNorm, Relu], [Linear, Relu] +Best Practices +-------------- + +1. Set the ``reduce_range`` argument on observers to `True` if you are using the + ``fbgemm`` backend. This argument prevents overflow on some int8 instructions + by reducing the range of quantized data type by 1 bit. + Modules that provide quantization functions and classes ------------------------------------------------------- diff --git a/ios/LibTorch.podspec b/ios/LibTorch.podspec index 17e9fb26afa1..f74e2dc9f37e 100644 --- a/ios/LibTorch.podspec +++ b/ios/LibTorch.podspec @@ -1,6 +1,6 @@ Pod::Spec.new do |s| s.name = 'LibTorch' - s.version = '1.6.0' + s.version = '1.6.1' s.authors = 'PyTorch Team' s.license = { :type => 'BSD' } s.homepage = 'https://github.com/pytorch/pytorch' diff --git a/setup.py b/setup.py index b2270db497cf..c29ee929b8ca 100644 --- a/setup.py +++ b/setup.py @@ -346,7 +346,7 @@ def check_file(f): install_requires = [ 'future', 'typing_extensions', - 'dataclasses; python_version < "3.8"' + 'dataclasses; python_version < "3.7"' ] missing_pydep = ''' diff --git a/test/cpp/rpc/test_e2e_process_group.cpp b/test/cpp/rpc/test_e2e_process_group.cpp index d509a4606fa1..7c5af57d6a09 100644 --- a/test/cpp/rpc/test_e2e_process_group.cpp +++ b/test/cpp/rpc/test_e2e_process_group.cpp @@ -19,6 +19,7 @@ class TestE2EProcessGroup : public TestE2EBase { options.devices.push_back( ::c10d::ProcessGroupGloo::createDeviceForHostname(serverAddress)); std::chrono::milliseconds rpcTimeout(30000); + options.timeout = rpcTimeout; // Initialize server rpc agent. 
auto pg = diff --git a/test/distributed/test_c10d.py b/test/distributed/test_c10d.py index 53a118e8f15b..9d0c19bef7b3 100644 --- a/test/distributed/test_c10d.py +++ b/test/distributed/test_c10d.py @@ -29,7 +29,8 @@ from torch.testing._internal.common_distributed import MultiProcessTestCase, \ requires_gloo, requires_nccl, requires_nccl_version, \ skip_if_not_multigpu, skip_if_lt_x_gpu, get_timeout, skip_if_rocm, \ - simple_sparse_reduce_tests, skip_if_win32, create_device + skip_if_rocm_single_process, simple_sparse_reduce_tests, skip_if_win32, \ + create_device from torch.testing._internal.common_utils import TestCase, load_tests, run_tests, \ retry_on_connect_failures, ADDRESS_IN_USE, CONNECT_TIMEOUT, TEST_WITH_TSAN @@ -1594,13 +1595,30 @@ def create(num, prefix): self.assertEqual(torch.full([10, 10], float(self.world_size)), tensor) del pg +class ProcessGroupNCCLNoGPUTest(TestCase): + MAIN_PROCESS_RANK = 0 + + def setUp(self): + self.rank = self.MAIN_PROCESS_RANK + self.world_size = 1 + self.file = tempfile.NamedTemporaryFile(delete=False) + self.num_gpus = torch.cuda.device_count() + if self.num_gpus > 0: + raise unittest.SkipTest("GPUs are available, skipping test") + + def tearDown(self): + pass + + @requires_nccl() + @skip_if_rocm_single_process + def test_init_no_gpus(self): + store = c10d.FileStore(self.file.name, self.world_size) + with self.assertRaisesRegex( + RuntimeError, + "ProcessGroupNCCL is only supported with GPUs, no GPUs found!"): + c10d.ProcessGroupNCCL(store, self.rank, self.world_size) + -@requires_nccl() -@unittest.skipIf( - TEST_WITH_TSAN, - "TSAN is not fork-safe since we're forking in a multi-threaded environment", -) -@skip_if_rocm class ProcessGroupNCCLTest(TestCase): MAIN_PROCESS_RANK = 0 @@ -1615,6 +1633,8 @@ def setUp(self): def tearDown(self): pass + @requires_nccl() + @skip_if_rocm_single_process def test_empty_tensors(self): store = c10d.FileStore(self.file.name, self.world_size) pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) @@ -1639,6 +1659,8 @@ def test_empty_tensors(self): pg.reduce_scatter(ys, xs).wait() self.assertEqual(0, ys[0].numel()) + @requires_nccl() + @skip_if_rocm_single_process def test_broadcast_ops(self): store = c10d.FileStore(self.file.name, self.world_size) pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) @@ -1661,6 +1683,8 @@ def broadcast(xs, rootRank, rootTensor): for i in range(self.num_gpus): self.assertEqual(tensors[i], tensors[rt]) + @requires_nccl() + @skip_if_rocm_single_process def test_allreduce_ops(self): store = c10d.FileStore(self.file.name, self.world_size) pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) @@ -1722,6 +1746,8 @@ def allreduce(tensors, op): with self.assertRaisesRegex(RuntimeError, "Cannot use " + str(op) + " with NCCL"): allreduce(tensors, op) + @requires_nccl() + @skip_if_rocm_single_process def test_reduce_ops(self): store = c10d.FileStore(self.file.name, self.world_size) pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) @@ -1752,6 +1778,8 @@ def reduce(xs, rootRank, rootTensor, op=None): with self.assertRaisesRegex(RuntimeError, "Cannot use " + str(op) + " with NCCL"): reduce(tensors, self.rank, rt, op) + @requires_nccl() + @skip_if_rocm_single_process def test_allgather_ops(self): store = c10d.FileStore(self.file.name, self.world_size) pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) @@ -1777,6 +1805,8 @@ def allgather(output_ts, input_ts): for s_idx, t in enumerate(device_ts): self.assertEqual(torch.tensor([s_idx]), t) + 
@requires_nccl() + @skip_if_rocm_single_process def test_reduce_scatter_ops(self): store = c10d.FileStore(self.file.name, self.world_size) pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) @@ -1854,6 +1884,8 @@ def reduce_scatter(outputs, input_lists, op): # TODO(#38095): Replace assertEqualIgnoreType. See issue #38095 self.assertEqualIgnoreType(expected, output[i]) + @requires_nccl() + @skip_if_rocm_single_process def test_barrier(self): store = c10d.FileStore(self.file.name, self.world_size) pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size) @@ -2140,6 +2172,7 @@ def test_gloo_backend_1gpu_module_device_ids_torch_device_list(self): @requires_gloo() @skip_if_lt_x_gpu(4) + @skip_if_rocm def test_gloo_backend_2gpu_module(self): int_devices = gpus_for_rank(self.world_size)[self.rank][:2] devices = [torch.device("cuda:" + str(i)) for i in int_devices] diff --git a/test/jit/test_list_dict.py b/test/jit/test_list_dict.py index 8d0f74349b3b..19e4952cad57 100644 --- a/test/jit/test_list_dict.py +++ b/test/jit/test_list_dict.py @@ -408,6 +408,43 @@ def test_over_slice(): return a[3:10] == [3, 4] self.checkScript(test_backward_slice, ()) + def test_slice_index(self): + a = torch.tensor( + [ + [[1, 11], [2, 22]], + [[3, 33], [4, 44]], + [[5, 55], [6, 66]], + ] + ) + + def test_index_slice1(x): + x = x[:, :, [0, 1]] + return x + self.checkScript(test_index_slice1, (a,)) + + def test_index_slice2(x): + x = x[[2, 1, 0], :, :] + return x + self.checkScript(test_index_slice2, (a,)) + + def test_index_slice3(x): + x = x[[0, 1], :, [1]] + return x + self.checkScript(test_index_slice3, (a,)) + + def test_index_slice_empty_list(x): + empty_list: List[int] = [] + x = x[empty_list, :, :] + return x + self.checkScript(test_index_slice_empty_list, (a,)) + + def test_index_slice_out_of_bounds_index(x): + x = x[[4], :, :] + return x + with self.assertRaisesRegex(RuntimeError, "index 4 is out of bounds for dimension 0 with size 3"): + self.checkScript(test_index_slice_out_of_bounds_index, (a,)) + + def test_mutable_list_append(self): def test_append(): a = [0, 1] diff --git a/test/run_test.py b/test/run_test.py index 9ce938f6cd7b..2af7405e300b 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -200,6 +200,15 @@ PYTORCH_COLLECT_COVERAGE = bool(os.environ.get("PYTORCH_COLLECT_COVERAGE")) +JIT_EXECUTOR_TESTS = [ + 'test_jit_cuda_fuser_profiling', + 'test_jit_cuda_fuser_legacy', + 'test_jit_profiling', + 'test_jit_legacy', + 'test_jit_fuser_legacy', + 'test_jit_fuser_te', + 'test_tensorexpr'] + def print_to_stderr(message): print(message, file=sys.stderr) @@ -456,7 +465,12 @@ def parse_args(): type=int, help='runs a shard of the tests (taking into account other selections), e.g., ' '--shard 2 3 will break up the selected tests into 3 shards and run the tests ' - 'in the 2nd shard (the number of shards will be whichever argument is greater)', + 'in the 2nd shard (the first number should not exceed the second)', + ) + parser.add_argument( + '--exclude-jit-executor', + action='store_true', + help='exclude tests that are run for a specific jit config' ) return parser.parse_args() @@ -533,6 +547,9 @@ def get_selected_tests(options): assert num_shards <= len(selected_tests), f"Number of shards must be less than {len(selected_tests)}" selected_tests = selected_tests[which_shard - 1 :: num_shards] + if options.exclude_jit_executor: + options.exclude.extend(JIT_EXECUTOR_TESTS) + selected_tests = exclude_tests(options.exclude, selected_tests) if sys.platform == 'win32' and not 
options.ignore_win_blocklist: diff --git a/test/test_autograd.py b/test/test_autograd.py index e92fbcbf21bb..6bd6925e015f 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -6139,6 +6139,18 @@ def _test_euclidean_large_cdist(sizex, sizey=None): _test_cdist_for_size((1, 1), (S, 1)) _test_euclidean_large_cdist((2000, 5)) + # Ensure that cdist backward with p<1 does not produce NaNs + def test_cdist_grad_p_lt_1_no_nan(self, device): + for p in [0.99, 0.7, 0.5, 0.1, 0.01]: + x = torch.randn(1, 2, device=device) + y = x.clone().detach() + torch.tensor([[1., 0.]], device=device) + x.requires_grad = True + y.requires_grad = True + result = torch.cdist(x, y, p=p) + result.backward(torch.ones_like(result)) + self.assertFalse(torch.isnan(x.grad).any()) + self.assertFalse(torch.isnan(y.grad).any()) + def test_cdist_same_inputs(self, device): # Test to detect issues in cdist gradient calculation # When the distances are 0 diff --git a/test/test_function_schema.py b/test/test_function_schema.py index f2ad2290d326..5a1527373478 100644 --- a/test/test_function_schema.py +++ b/test/test_function_schema.py @@ -14,90 +14,77 @@ def test_serialize_and_deserialize(self): self.assertEqual(parsed_schema, schema) self.assertTrue(parsed_schema.is_backward_compatible_with(schema)) - def test_backward_compatible_args(self): - old_schema = parse_schema('any(Tensor self, int dim) -> Tensor') - new_schema = parse_schema('any(Tensor self, int? dim) -> Tensor') + def test_backward_compatible_structure(self): + old_schema = parse_schema('any.over(Tensor self, *, Tensor b) -> Tensor') + # BC: A new schema without changes. + new_schema = parse_schema('any.over(Tensor self, *, Tensor b) -> Tensor') self.assertTrue(new_schema.is_backward_compatible_with(old_schema)) - self.assertFalse(old_schema.is_backward_compatible_with(new_schema)) - new_schema = parse_schema('any(Tensor self, int dim=5) -> Tensor') - self.assertTrue(new_schema.is_backward_compatible_with(old_schema)) - self.assertFalse(old_schema.is_backward_compatible_with(new_schema)) - new_schema = parse_schema('any(Tensor self, int dim, bool keepdim=False) -> Tensor') - self.assertTrue(new_schema.is_backward_compatible_with(old_schema)) - self.assertFalse(old_schema.is_backward_compatible_with(new_schema)) - - def test_backward_compatible_kwargs(self): - old_schema = parse_schema('any(Tensor self, *, Tensor out) -> Tensor') - new_schema = parse_schema('any(Tensor self, *, bool extra1=True, Tensor out, bool extra2=False) -> Tensor') - self.assertTrue(new_schema.is_backward_compatible_with(old_schema)) - self.assertFalse(old_schema.is_backward_compatible_with(new_schema)) - new_schema = parse_schema('any(Tensor self, Tensor out) -> Tensor') - self.assertTrue(new_schema.is_backward_compatible_with(old_schema)) - self.assertFalse(old_schema.is_backward_compatible_with(new_schema)) - - def test_backward_compatible_ret(self): - old_schema = parse_schema('any(Tensor self) -> Tensor?') - new_schema = parse_schema('any(Tensor self) -> Tensor') - self.assertTrue(new_schema.is_backward_compatible_with(old_schema)) - self.assertFalse(old_schema.is_backward_compatible_with(new_schema)) - - def test_backward_incompatible_name(self): - old_schema = parse_schema('any(Tensor self, int dim, bool keepdim=False) -> Tensor') - new_schema = parse_schema('any_(Tensor self, int dim, bool keepdim=False) -> Tensor') + # No-BC: A new schema with different name. 
+ new_schema = parse_schema('any_.over(Tensor self, *, Tensor b) -> Tensor') self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) - self.assertFalse(old_schema.is_backward_compatible_with(new_schema)) - - def test_backward_incompatible_vararg(self): - old_schema = parse_schema('any(Tensor self, int dim, bool keepdim=False) -> Tensor') - new_schema = parse_schema('any(Tensor self, int dim, bool keepdim=False, ...) -> Tensor') + # No-BC: A new schema with different overload name. + new_schema = parse_schema('any.other(Tensor self, *, Tensor b) -> Tensor') self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) - self.assertFalse(old_schema.is_backward_compatible_with(new_schema)) - - def test_backward_incompatible_returns(self): - old_schema = parse_schema('any(Tensor self, int dim, bool keepdim=False) -> Tensor') - new_schema = parse_schema('any(Tensor self, int dim, bool keepdim=False) -> (Tensor, ...)') + # No-BC: A new schema that adds vararg. + new_schema = parse_schema('any.over(Tensor self, *, Tensor b, ...) -> Tensor') self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) - self.assertFalse(old_schema.is_backward_compatible_with(new_schema)) - new_schema = parse_schema('any(Tensor self, int dim, bool keepdim=False) -> int') + # No-BC: A new schema with different number of outputs. + new_schema = parse_schema('any.over(Tensor self, *, Tensor b) -> (Tensor, Tensor)') self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) - self.assertFalse(old_schema.is_backward_compatible_with(new_schema)) - new_schema = parse_schema('any(Tensor self, int dim, bool keepdim=False) -> Tensor?') + + def test_backward_compatible_outputs(self): + old_schema = parse_schema('any.over(Tensor self, *, Tensor b) -> Tensor') + # No-BC: A new schema with output becoming of optional type. + new_schema = parse_schema('any.over(Tensor self, *, Tensor b) -> Tensor?') self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) + # BC: (the opposite case) An schema where the output is not of optional type anymore. self.assertTrue(old_schema.is_backward_compatible_with(new_schema)) - new_schema = parse_schema('any(Tensor self, int dim, bool keepdim=False) -> (Tensor, Tensor)') + # No-BC: A new schema with a different output type. + new_schema = parse_schema('any.over(Tensor self, *, Tensor b) -> int') self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) - self.assertFalse(old_schema.is_backward_compatible_with(new_schema)) - new_schema = parse_schema('any(Tensor self, int dim, bool keepdim=False) -> Tensor out') + # No-BC: A new schema with a different output type. + new_schema = parse_schema('any.over(Tensor self, *, Tensor b) -> Tensor out') self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) - self.assertFalse(old_schema.is_backward_compatible_with(new_schema)) - def test_backward_incompatible_args(self): - old_schema = parse_schema('any(Tensor self, int[] dims, bool keepdim=False) -> Tensor') - new_schema = parse_schema('any(Tensor s, int[] dims, bool keepdim=False) -> Tensor') + def test_backward_compatible_arguments(self): + old_schema = parse_schema('any(Tensor self, *, Tensor b, int c) -> Tensor') + # No-BC: A new schema with less arguments. 
+ new_schema = parse_schema('any(Tensor self, *, Tensor b) -> Tensor') self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) - self.assertFalse(old_schema.is_backward_compatible_with(new_schema)) - new_schema = parse_schema('any(Tensor self, int[3] dims, bool keepdim=False) -> Tensor') + # No-BC: A new schema with more arguments, appended, but no default value. + new_schema = parse_schema('any(Tensor self, *, Tensor b, int c, int d) -> Tensor') self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) - self.assertFalse(old_schema.is_backward_compatible_with(new_schema)) - new_schema = parse_schema('any(Tensor self, int[](a) dims, bool keepdim=False) -> Tensor') + # BC: A new schema with more arguments, appended, that have a default value. + new_schema = parse_schema('any(Tensor self, *, Tensor b, int c, int d=1) -> Tensor') + self.assertTrue(new_schema.is_backward_compatible_with(old_schema)) + # No-BC: A new schema with more arguments, not-appended, that have a default value. + new_schema = parse_schema('any(Tensor self, int d=1, *, Tensor b, int c) -> Tensor') self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) + # BC: A new schema where old kwargs becomes positional. + new_schema = parse_schema('any(Tensor self, Tensor b, *, int c) -> Tensor') + self.assertTrue(new_schema.is_backward_compatible_with(old_schema)) + # BC: (the opposite case) A new schema where an old positional argument becomes kwarg. self.assertFalse(old_schema.is_backward_compatible_with(new_schema)) - new_schema = parse_schema('any(Tensor self, int dims, bool keepdim=False) -> Tensor') - self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) + # BC: A new schema where all old kwargs become positional. + new_schema = parse_schema('any(Tensor self, Tensor b, int c) -> Tensor') + self.assertTrue(new_schema.is_backward_compatible_with(old_schema)) + # BC: (the opposite case) A new schema where all old positional arguments become kwarg. self.assertFalse(old_schema.is_backward_compatible_with(new_schema)) - new_schema = parse_schema('any(Tensor self, int[] dim, bool keepdim=False, bool? extra=None) -> Tensor') + # No-BC: A new schema where old kwargs appear in different order. + new_schema = parse_schema('any(Tensor self, *, int c, Tensor b) -> Tensor') self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) - self.assertFalse(old_schema.is_backward_compatible_with(new_schema)) - - def test_backward_incompatible_kwargs(self): - old_schema = parse_schema('any(Tensor self, int[] dims, *, bool keepdim=False) -> Tensor') - new_schema = parse_schema('any(Tensor self, int[] dims, *, bool keepdim) -> Tensor') + # BC: A new schema where argument becomes of type optional. + new_schema = parse_schema('any(Tensor self, *, Tensor b, int? c) -> Tensor') + self.assertTrue(new_schema.is_backward_compatible_with(old_schema)) + # BC: A new schema where argument gains a default value. + new_schema = parse_schema('any(Tensor self, *, Tensor b, int c=1) -> Tensor') + self.assertTrue(new_schema.is_backward_compatible_with(old_schema)) + # No-BC: A new schema where argument is "renamed". 
+ new_schema = parse_schema('any(Tensor self, *, Tensor b, int renamed) -> Tensor') self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) - self.assertTrue(old_schema.is_backward_compatible_with(new_schema)) - new_schema = parse_schema('any(Tensor self, int[] dims, *, bool keepdim=False, bool extra) -> Tensor') + # No-BC: A new schema where argument type changes to an incompatible type. + new_schema = parse_schema('any(Tensor self, *, Tensor b, int[] c) -> Tensor') self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) - self.assertFalse(old_schema.is_backward_compatible_with(new_schema)) - if __name__ == '__main__': run_tests() diff --git a/test/test_fx.py b/test/test_fx.py index 9666ccd395bf..1451c5efe5cb 100644 --- a/test/test_fx.py +++ b/test/test_fx.py @@ -9,7 +9,7 @@ from torch.fx.experimental import GraphManipulation from torch.fx.experimental import shape_prop from torch.fx.experimental.Partitioner import DAG, Partitioner -from torch.fx.experimental.subgraph_creation_example import split_module +from torch.fx.experimental.subgraph_creation_example import split_module from torch.fx.proxy import TraceError @@ -30,6 +30,9 @@ class SimpleTest(torch.nn.Module): def forward(self, x): return torch.relu(x + 3.0) +def a_non_torch_leaf(a, b): + return a + b + class TestFX(JitTestCase): def checkGraphModule(self, m: torch.nn.Module, args, kwargs=None): """Check that an nn.Module's results match the GraphModule version @@ -84,6 +87,17 @@ def forward(self, A, b=4, *args, c=5, **kwargs): t = T() symbolic_trace(t) + def test_custom_import(self): + graph = torch.fx.Graph() + a = graph.placeholder('x') + b = graph.placeholder('y') + c = graph.call_function(a_non_torch_leaf, (a, b)) + d = graph.call_function(torch.sin, (c,)) + graph.output(d) + gm = GraphModule(torch.nn.Module(), graph) + x, y = torch.rand(1), torch.rand(1) + self.assertEqual(torch.sin(x + y), gm(x, y)) + def test_args_kwargs(self): class T(torch.nn.Module): def forward(self, *args, **kwargs): @@ -803,8 +817,8 @@ def __init__(self): self.linear = torch.nn.Linear(4, 5) def forward(self, x, y): - z = self.linear(x + self.param).clamp(min=0.0, max=1.0) - w = self.linear(y).clamp(min=0.0, max=1.0) + z = self.linear(x + self.param).clamp(min=0.0, max=1.0) + w = self.linear(y).clamp(min=0.0, max=1.0) return z + w # symbolically trace model @@ -821,7 +835,7 @@ def mod_partition(node: Node): partition_counter = (partition_counter + 1) % NPARTITIONS return partition - # split module in module with submodules + # split module in module with submodules module_with_submodules = split_module(my_module_traced, my_module, mod_partition) x = torch.rand(3, 4) @@ -832,5 +846,107 @@ def mod_partition(node: Node): self.assertEqual(orig_out, submodules_out) + @skipIfNoTorchVision + def test_replace_uses(self): + rn18 = resnet18() + + class LowerReluTracer(torch.fx.Tracer): + def is_leaf_module(self, m : torch.nn.Module, qualname : str): + if isinstance(m, torch.nn.ReLU): + return False + return super().is_leaf_module(m, qualname) + + rn18_traced = GraphModule(rn18, LowerReluTracer().trace(rn18)) + + to_erase = [] + for node in rn18_traced.graph.nodes: + if node.op == 'call_function' and node.target in [torch.relu, torch.nn.functional.relu]: + kwargs = node.kwargs + # Neg doesn't have in-place + kwargs.pop('inplace') + with torch.fx.graph.insert_before(node): + new_node = rn18_traced.graph.call_function( + the_function=torch.neg, args=node.args, kwargs=node.kwargs) + node.replace_all_uses_with(replace_with=new_node) + 
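Taken together, the schema-compatibility cases above reduce to one pattern: parse the old and new schema strings and ask whether the new one is backward compatible with the old. A minimal sketch using the same `parse_schema` helper the tests exercise; the `any(...)` signatures are the tests' illustrative schemas, not real operators:

    from torch._C import parse_schema

    old = parse_schema('any(Tensor self, *, Tensor b, int c) -> Tensor')

    # BC: appending an argument that has a default value.
    new_ok = parse_schema('any(Tensor self, *, Tensor b, int c, int d=1) -> Tensor')
    assert new_ok.is_backward_compatible_with(old)

    # No-BC: dropping an argument.
    new_bad = parse_schema('any(Tensor self, *, Tensor b) -> Tensor')
    assert not new_bad.is_backward_compatible_with(old)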
to_erase.append(node) + + for node in to_erase: + rn18_traced.graph.erase_node(node) + + def test_insertion_point(self): + graph : torch.fx.Graph = torch.fx.Graph() + x : torch.fx.Node = graph.create_node('placeholder', 'x') + b : torch.fx.Node = graph.create_node('call_function', target=torch.relu, args=(x,)) + output : torch.fx.Node = graph.output(b) + + with torch.fx.graph.insert_before(b): + neg : torch.fx.Node = graph.call_function(the_function=torch.neg, args=(x,)) + _, *relu_args = b.args + b.args = (neg, *relu_args) + + gm = torch.fx.GraphModule(torch.nn.Module(), graph) + + input = torch.randn(33, 44) + self.assertEqual(gm(input), torch.relu(torch.neg(input))) + + + def test_move_before(self): + graph : torch.fx.Graph = torch.fx.Graph() + x : torch.fx.Node = graph.create_node('placeholder', 'x') + b : torch.fx.Node = graph.create_node('call_function', target=torch.relu, args=(x,)) + output : torch.fx.Node = graph.output(b) + + neg : torch.fx.Node = graph.call_function(the_function=torch.neg, args=(x,)) + _, *relu_args = b.args + b.args = (neg, *relu_args) + graph.move_node_before(to_move=neg, before=b) + + gm = torch.fx.GraphModule(torch.nn.Module(), graph) + + input = torch.randn(33, 44) + self.assertEqual(gm(input), torch.relu(torch.neg(input))) + + def test_erase_node_error(self): + st = SimpleTest() + traced = symbolic_trace(st) + + for node in traced.graph.nodes: + # Test deleting with uses both in another Node and at the output + if node.target in [operator.add, torch.relu]: + with self.assertRaisesRegex(RuntimeError, 'but it still had .* uses in the graph!'): + traced.graph.erase_node(node) + + def test_find_uses(self): + graph = torch.fx.Graph() + x = torch.fx.Proxy(graph.placeholder('x')) + + y = torch.relu(x) + z = x + x + u = torch.neg(x) + graph.output((y + z + u).node) + graph.lint() + + uses_of_x = x.node.find_uses() + self.assertEqual(len(uses_of_x), 3) + expected_ops = ['relu', 'add', 'neg'] + for node, expected in zip(uses_of_x, expected_ops): + assert expected in node.name + + def test_multi_insert_point(self): + graph = torch.fx.Graph() + x = torch.fx.Proxy(graph.placeholder('x')) + relu = torch.relu(x) + + with torch.fx.graph.insert_before(relu.node): + y = torch.neg(x) + z = torch.tanh(y) + + graph.output((relu.node, z.node)) + graph.lint() + + expected_ops = ['x', 'neg', 'tanh', 'relu'] + for node, expected in zip(graph.nodes, expected_ops): + assert expected in node.name + if __name__ == '__main__': run_tests() diff --git a/test/test_jit.py b/test/test_jit.py index 494d70ecfcfb..d093a4b8826e 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -10010,6 +10010,21 @@ def method(self, x): with self.assertRaisesRegex(RuntimeError, "Argument y not provided."): ModuleDefault() + def test_type_inferred_from_empty_annotation(self): + """ + Test that the type inferred from an empty or missing annotation is Torch.Tensor wtih `inferred=true` + """ + @torch.jit.script + def fn(x): + return x + + graph = fn.graph + n = next(graph.inputs()) + self.assertTrue(n.type() == torch._C.TensorType.getInferred()) + + with self.assertRaisesRegex(RuntimeError, "Inferred \'x\' to be of type \'Tensor"): + fn(1) + def test_script_define_order(self): class M(torch.jit.ScriptModule): diff --git a/test/test_jit_cuda_fuser_legacy.py b/test/test_jit_cuda_fuser_legacy.py index 41e16df7d686..28ab78370637 100644 --- a/test/test_jit_cuda_fuser_legacy.py +++ b/test/test_jit_cuda_fuser_legacy.py @@ -1,5 +1,5 @@ import sys -sys.argv.append("--ge_config=legacy") 
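The FX tests above rely on the experimental graph-editing helpers at this commit (`torch.fx.graph.insert_before` as a context manager, `call_function(the_function=...)`); those names follow the test code, not a stable public API, and may differ in later releases. A condensed sketch of the insertion-point pattern:

    import torch
    import torch.fx

    graph = torch.fx.Graph()
    x = graph.placeholder('x')
    relu = graph.call_function(the_function=torch.relu, args=(x,))
    graph.output(relu)

    # Splice a torch.neg in front of the existing relu call.
    with torch.fx.graph.insert_before(relu):
        neg = graph.call_function(the_function=torch.neg, args=(x,))
    relu.args = (neg,)

    gm = torch.fx.GraphModule(torch.nn.Module(), graph)
    inp = torch.randn(3, 4)
    assert torch.equal(gm(inp), torch.relu(torch.neg(inp)))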
+sys.argv.append("--jit_executor=legacy") import os os.environ['PYTORCH_CUDA_FUSER_DISABLE_FALLBACK'] = '1' diff --git a/test/test_jit_cuda_fuser_profiling.py b/test/test_jit_cuda_fuser_profiling.py index 7559b85519c4..5114ab190457 100644 --- a/test/test_jit_cuda_fuser_profiling.py +++ b/test/test_jit_cuda_fuser_profiling.py @@ -1,5 +1,5 @@ import sys -sys.argv.append("--ge_config=profiling") +sys.argv.append("--jit_executor=profiling") import os os.environ['PYTORCH_CUDA_FUSER_DISABLE_FALLBACK'] = '1' diff --git a/test/test_jit_fuser_legacy.py b/test/test_jit_fuser_legacy.py index c33983e45e79..420075f6e611 100644 --- a/test/test_jit_fuser_legacy.py +++ b/test/test_jit_fuser_legacy.py @@ -1,5 +1,5 @@ import sys -sys.argv.append("--ge_config=legacy") +sys.argv.append("--jit_executor=legacy") from test_jit_fuser import * if __name__ == '__main__': diff --git a/test/test_jit_legacy.py b/test/test_jit_legacy.py index 2422e518a7f9..b17908e910bb 100644 --- a/test/test_jit_legacy.py +++ b/test/test_jit_legacy.py @@ -1,5 +1,5 @@ import sys -sys.argv.append("--ge_config=legacy") +sys.argv.append("--jit_executor=legacy") from test_jit import * if __name__ == '__main__': diff --git a/test/test_jit_profiling.py b/test/test_jit_profiling.py index be02985e69a8..dc6bb2fbf878 100644 --- a/test/test_jit_profiling.py +++ b/test/test_jit_profiling.py @@ -1,5 +1,5 @@ import sys -sys.argv.append("--ge_config=profiling") +sys.argv.append("--jit_executor=profiling") from test_jit import * if __name__ == '__main__': diff --git a/test/test_jit_simple.py b/test/test_jit_simple.py index 910e4a17713d..23da6602c572 100644 --- a/test/test_jit_simple.py +++ b/test/test_jit_simple.py @@ -1,5 +1,5 @@ import sys -sys.argv.append("--ge_config=simple") +sys.argv.append("--jit_executor=simple") from test_jit import * if __name__ == '__main__': diff --git a/test/test_nn.py b/test/test_nn.py index e7a185c08951..ccf4ea7aa8d1 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -10130,6 +10130,34 @@ def test_GroupNorm_empty(self, device): with torch.backends.cudnn.flags(enabled=False): self._test_module_empty_input(mod, inp) + @onlyOnCPUAndCUDA + def test_ReplicationPad_empty(self, device): + for mod, inp in [ + (torch.nn.ReplicationPad1d(3), torch.randn(0, 3, 10, device=device)), + (torch.nn.ReplicationPad2d(3), torch.randn(0, 3, 10, 10, device=device)), + (torch.nn.ReplicationPad3d(3), torch.randn(0, 3, 10, 10, 10, device=device))]: + self._test_module_empty_input(mod, inp, check_size=False) + + with self.assertRaisesRegex(NotImplementedError, 'Only 3D'): + mod = torch.nn.ReplicationPad1d(2) + inp = torch.randn(3, 10, device=device) + mod(inp) + + with self.assertRaisesRegex(RuntimeError, 'Expected 2D or 3D'): + mod = torch.nn.ReplicationPad1d(2) + inp = torch.randn(3, 0, 10, device=device) + mod(inp) + + with self.assertRaisesRegex(RuntimeError, 'Expected 3D or 4D'): + mod = torch.nn.ReplicationPad2d((2, 2, 2, 2)) + inp = torch.randn(43, 0, 10, 10, device=device) + mod(inp) + + with self.assertRaisesRegex(RuntimeError, 'Expected 4D or 5D'): + mod = torch.nn.ReplicationPad3d((2, 2, 2, 2, 2, 2)) + inp = torch.randn(3, 0, 10, 10, 10, device=device) + mod(inp) + @onlyOnCPUAndCUDA def test_ReflectionPad_empty(self, device): for mod, inp in [ diff --git a/test/test_spectral_ops.py b/test/test_spectral_ops.py index d7ef731699b3..82ed2225bda8 100644 --- a/test/test_spectral_ops.py +++ b/test/test_spectral_ops.py @@ -510,6 +510,81 @@ def test_fftn_invalid(self, device): with self.assertRaisesRegex(RuntimeError, "Expected a real 
input"): torch.fft.rfftn(c) + # Helper functions + + @skipCPUIfNoMkl + @skipCUDAIfRocm + @onlyOnCPUAndCUDA + @unittest.skipIf(not TEST_NUMPY, 'NumPy not found') + @dtypes(torch.float, torch.double) + def test_fftfreq_numpy(self, device, dtype): + test_args = [ + *product( + # n + range(1, 20), + # d + (None, 10.0), + ) + ] + + functions = ['fftfreq', 'rfftfreq'] + + for fname in functions: + torch_fn = getattr(torch.fft, fname) + numpy_fn = getattr(np.fft, fname) + + for n, d in test_args: + args = (n,) if d is None else (n, d) + expected = numpy_fn(*args) + actual = torch_fn(*args, device=device, dtype=dtype) + self.assertEqual(actual, expected, exact_dtype=False) + + @skipCPUIfNoMkl + @skipCUDAIfRocm + @onlyOnCPUAndCUDA + @unittest.skipIf(not TEST_NUMPY, 'NumPy not found') + @dtypes(torch.float, torch.double, torch.complex64, torch.complex128) + def test_fftshift_numpy(self, device, dtype): + test_args = [ + # shape, dim + *product(((11,), (12,)), (None, 0, -1)), + *product(((4, 5), (6, 6)), (None, 0, (-1,))), + *product(((1, 1, 4, 6, 7, 2),), (None, (3, 4))), + ] + + functions = ['fftshift', 'ifftshift'] + + for shape, dim in test_args: + input = torch.rand(*shape, device=device, dtype=dtype) + input_np = input.cpu().numpy() + + for fname in functions: + torch_fn = getattr(torch.fft, fname) + numpy_fn = getattr(np.fft, fname) + + expected = numpy_fn(input_np, axes=dim) + actual = torch_fn(input, dim=dim) + self.assertEqual(actual, expected) + + @skipCPUIfNoMkl + @skipCUDAIfRocm + @onlyOnCPUAndCUDA + @unittest.skipIf(not TEST_NUMPY, 'NumPy not found') + @dtypes(torch.float, torch.double) + def test_fftshift_frequencies(self, device, dtype): + for n in range(10, 15): + sorted_fft_freqs = torch.arange(-(n // 2), n - (n // 2), + device=device, dtype=dtype) + x = torch.fft.fftfreq(n, d=1 / n, device=device, dtype=dtype) + + # Test fftshift sorts the fftfreq output + shifted = torch.fft.fftshift(x) + self.assertTrue(torch.allclose(shifted, shifted.sort().values)) + self.assertEqual(sorted_fft_freqs, shifted) + + # And ifftshift is the inverse + self.assertEqual(x, torch.fft.ifftshift(shifted)) + # Legacy fft tests def _test_fft_ifft_rfft_irfft(self, device, dtype): def _test_complex(sizes, signal_ndim, prepro_fn=lambda x: x): diff --git a/test/test_tensor_creation_ops.py b/test/test_tensor_creation_ops.py index eaaee2dab836..b0777c7fa12a 100644 --- a/test/test_tensor_creation_ops.py +++ b/test/test_tensor_creation_ops.py @@ -6,7 +6,7 @@ import torch from torch.testing._internal.common_utils import \ - (TestCase, run_tests, do_test_empty_full, TEST_NUMPY, suppress_warnings, + (TestCase, run_tests, do_test_empty_full, TEST_NUMPY, TEST_WITH_ROCM, suppress_warnings, torch_to_numpy_dtype_dict, slowTest) from torch.testing._internal.common_device_type import \ (instantiate_device_type_tests, deviceCountAtLeast, onlyOnCPUAndCUDA, @@ -1048,7 +1048,9 @@ def test_logspace_special_steps(self, device, dtype): self._test_logspace_base2(device, dtype, steps=steps) @dtypes(*torch.testing.get_all_dtypes(include_bool=False, include_half=False, include_complex=False)) - @dtypesIfCUDA(*torch.testing.get_all_dtypes(include_bool=False, include_half=True, include_complex=False)) + @dtypesIfCUDA(*((torch.testing.get_all_int_dtypes() + [torch.float32, torch.float16, torch.bfloat16]) + if TEST_WITH_ROCM + else torch.testing.get_all_dtypes(include_bool=False, include_half=True, include_complex=False))) def test_logspace(self, device, dtype): _from = random.random() to = _from + random.random() diff --git 
a/test/test_tensorexpr.py b/test/test_tensorexpr.py index 143c6dab91d2..739957569962 100644 --- a/test/test_tensorexpr.py +++ b/test/test_tensorexpr.py @@ -1046,18 +1046,18 @@ def easy(x, y): # FIXME: interp.elapsed_value() also increments due to simplifier assert llvm.elapsed_value() == 1 or interp.elapsed_value() > 1 - def test_unsqueeze(self): + def test_unsqueeze(self, N=256): def easy(x, y): a = torch.unsqueeze(x, 0) b = torch.unsqueeze(y, 0) return a + b - traced = torch.jit.trace(easy, (torch.ones(1024, 1024), torch.zeros(1024, 1024))) + traced = torch.jit.trace(easy, (torch.ones(N, N), torch.zeros(N, N))) llvm = LLVMCodeGenExecuted() interp = SimpleIREvalExecuted() - a = torch.rand(1024, 1024) + a = torch.rand(N, N) x = traced(a, a) npr = np.expand_dims(a, 0) npr = npr + npr diff --git a/test/test_torch.py b/test/test_torch.py index 0458e323b78f..7da38b211dc5 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -16684,7 +16684,9 @@ def _test_addmm_addmv(self, f, t, m, v, *, alpha=None, beta=None, transpose_out= @precisionOverride({torch.bfloat16: 1e-0, torch.half: 5e-4, torch.float: 1e-4, torch.double: 1e-8, torch.cfloat: 1e-4, torch.cdouble: 1e-8}) - @dtypesIfCUDA(*torch.testing.get_all_complex_dtypes(), *torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM)) + @dtypesIfCUDA(*torch.testing.get_all_complex_dtypes(), + *([torch.float32, torch.float64, torch.bfloat16] + if TEST_WITH_ROCM else torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM))) @dtypes(torch.bfloat16, torch.float, torch.double, torch.cfloat, torch.cdouble) @unittest.skipIf(not TEST_NUMPY, "Numpy not found") def test_addmv(self, device, dtype): diff --git a/test/test_xnnpack_integration.py b/test/test_xnnpack_integration.py index a40ec48f2f37..56c44b904b47 100644 --- a/test/test_xnnpack_integration.py +++ b/test/test_xnnpack_integration.py @@ -12,10 +12,12 @@ import io import itertools +from torch.testing._internal.common_utils import TEST_WITH_TSAN @unittest.skipUnless(torch.backends.xnnpack.enabled, " XNNPACK must be enabled for these tests." " Please build with USE_XNNPACK=1.") +@unittest.skipIf(TEST_WITH_TSAN, "TSAN fails with XNNPACK. Does not seem to have a good reason for failures.") class TestXNNPACKOps(TestCase): @given(batch_size=st.integers(0, 3), data_shape=hu.array_shapes(1, 3, 2, 64), @@ -161,6 +163,7 @@ def test_conv2d_transpose(self, @unittest.skipUnless(torch.backends.xnnpack.enabled, " XNNPACK must be enabled for these tests." " Please build with USE_XNNPACK=1.") +@unittest.skipIf(TEST_WITH_TSAN, "TSAN fails with XNNPACK. Does not seem to have a good reason for failures.") class TestXNNPACKSerDes(TestCase): @given(batch_size=st.integers(0, 3), data_shape=hu.array_shapes(1, 3, 2, 64), @@ -551,6 +554,7 @@ def forward(self, x): @unittest.skipUnless(torch.backends.xnnpack.enabled, " XNNPACK must be enabled for these tests." " Please build with USE_XNNPACK=1.") +@unittest.skipIf(TEST_WITH_TSAN, "TSAN fails with XNNPACK. Does not seem to have a good reason for failures.") class TestXNNPACKRewritePass(TestCase): @staticmethod def validate_transformed_module( @@ -911,6 +915,7 @@ def forward(self, x): @unittest.skipUnless(torch.backends.xnnpack.enabled, " XNNPACK must be enabled for these tests." 
" Please build with USE_XNNPACK=1.") +@unittest.skipIf(TEST_WITH_TSAN, "TSAN is not fork-safe since we're forking in a multi-threaded environment") class TestXNNPACKConv1dTransformPass(TestCase): @staticmethod def validate_transform_conv1d_to_conv2d( diff --git a/third_party/fbgemm b/third_party/fbgemm index 1d710393d5b7..fe9164007c33 160000 --- a/third_party/fbgemm +++ b/third_party/fbgemm @@ -1 +1 @@ -Subproject commit 1d710393d5b7588f5de3b83f51c22bbddf095229 +Subproject commit fe9164007c3392a12ea51a19b0f4e9f40d24f88d diff --git a/tools/autograd/templates/python_fft_functions.cpp b/tools/autograd/templates/python_fft_functions.cpp index 7d0186538c98..1dbdca565792 100644 --- a/tools/autograd/templates/python_fft_functions.cpp +++ b/tools/autograd/templates/python_fft_functions.cpp @@ -7,14 +7,27 @@ #include "torch/csrc/autograd/python_variable.h" #include "torch/csrc/autograd/utils/wrap_outputs.h" #include "torch/csrc/autograd/utils/python_arg_parsing.h" +#include "torch/csrc/autograd/generated/variable_factories.h" #include "torch/csrc/utils/python_arg_parser.h" #include "torch/csrc/utils/structseq.h" +#include "torch/csrc/utils/cuda_lazy_init.h" + +#include using at::Tensor; +using at::Device; +using at::Layout; using at::Scalar; -using at::MemoryFormat; -using at::Generator; +using at::ScalarType; +using at::Backend; +using at::OptionalDeviceGuard; +using at::DeviceGuard; +using at::TensorOptions; using at::IntArrayRef; +using at::Generator; +using at::TensorList; +using at::Dimname; +using at::DimnameList; using namespace torch::autograd::utils; diff --git a/tools/clang_format_all.py b/tools/clang_format_all.py index 710a21e33514..77ca68d92b0b 100755 --- a/tools/clang_format_all.py +++ b/tools/clang_format_all.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 """ -A script that runs clang-format on all C/C++ files in CLANG_FORMAT_WHITELIST. There is +A script that runs clang-format on all C/C++ files in CLANG_FORMAT_ALLOWLIST. There is also a diff mode which simply checks if clang-format would make any changes, which is useful for CI purposes. @@ -14,22 +14,22 @@ import sys from clang_format_utils import get_and_check_clang_format, CLANG_FORMAT_PATH -# Whitelist of directories to check. All files that in that directory +# Allowlist of directories to check. All files that in that directory # (recursively) will be checked. -# If you edit this, please edit the whitelist in clang_format_ci.sh as well. -CLANG_FORMAT_WHITELIST = ["torch/csrc/jit/", "test/cpp/jit/", "test/cpp/tensorexpr/"] +# If you edit this, please edit the allowlist in clang_format_ci.sh as well. +CLANG_FORMAT_ALLOWLIST = ["torch/csrc/jit/", "test/cpp/jit/", "test/cpp/tensorexpr/"] # Only files with names matching this regex will be formatted. CPP_FILE_REGEX = re.compile(".*\\.(h|cpp|cc|c|hpp)$") -def get_whitelisted_files(): +def get_allowlisted_files(): """ - Parse CLANG_FORMAT_WHITELIST and resolve all directories. - Returns the set of whitelist cpp source files. + Parse CLANG_FORMAT_ALLOWLIST and resolve all directories. + Returns the set of allowlist cpp source files. """ matches = [] - for dir in CLANG_FORMAT_WHITELIST: + for dir in CLANG_FORMAT_ALLOWLIST: for root, dirnames, filenames in os.walk(dir): for filename in filenames: if CPP_FILE_REGEX.match(filename): @@ -77,7 +77,7 @@ async def file_clang_formatted_correctly(filename, semaphore, verbose=False): async def run_clang_format(max_processes, diff=False, verbose=False): """ - Run clang-format to all files in CLANG_FORMAT_WHITELIST that match CPP_FILE_REGEX. 
+ Run clang-format to all files in CLANG_FORMAT_ALLOWLIST that match CPP_FILE_REGEX. """ # Check to make sure the clang-format binary exists. if not os.path.exists(CLANG_FORMAT_PATH): @@ -97,7 +97,7 @@ async def run_clang_format(max_processes, diff=False, verbose=False): # Format files in parallel. if diff: - for f in asyncio.as_completed([file_clang_formatted_correctly(f, semaphore, verbose) for f in get_whitelisted_files()]): + for f in asyncio.as_completed([file_clang_formatted_correctly(f, semaphore, verbose) for f in get_allowlisted_files()]): ok &= await f if ok: @@ -105,7 +105,7 @@ async def run_clang_format(max_processes, diff=False, verbose=False): else: print("Some files not formatted correctly") else: - await asyncio.gather(*[run_clang_format_on_file(f, semaphore, verbose) for f in get_whitelisted_files()]) + await asyncio.gather(*[run_clang_format_on_file(f, semaphore, verbose) for f in get_allowlisted_files()]) return ok @@ -134,7 +134,7 @@ def main(args): options = parse_args(args) # Get clang-format and make sure it is the right binary and it is in the right place. ok = get_and_check_clang_format(options.verbose) - # Invoke clang-format on all files in the directories in the whitelist. + # Invoke clang-format on all files in the directories in the allowlist. if ok: loop = asyncio.get_event_loop() ok = loop.run_until_complete(run_clang_format(options.max_processes, options.diff, options.verbose)) diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in index b0479c01f58a..9ccc5f7cb899 100644 --- a/torch/_C/__init__.pyi.in +++ b/torch/_C/__init__.pyi.in @@ -709,6 +709,8 @@ class EnumType(JitType): class TensorType(JitType): @classmethod def get(cls) -> TensorType: ... + @classmethod + def getInferred(cls) -> TensorType: ... # Defined in torch/csrc/jit/python/python_tree_views.cpp class SourceRange: diff --git a/torch/autograd/functional.py b/torch/autograd/functional.py index 2a1d0ef55fd9..58e780c87d1b 100644 --- a/torch/autograd/functional.py +++ b/torch/autograd/functional.py @@ -381,15 +381,15 @@ def jacobian(func, inputs, create_graph=False, strict=False): Defaults to ``False``. Returns: - Jacobian (Tensor or nested tuple of Tensors): if there are a single - input and output, this will be a single Tensor containing the - Jacobian for the linearized inputs and output. If one of the two is - a tuple, then the Jacobian will be a tuple of Tensors. If both of - them are tuples, then the Jacobian will be a tuple of tuple of - Tensors where ``Jacobian[i][j]`` will contain the Jacobian of the - ``i``\th output and ``j``\th input and will have as size the - concatenation of the sizes of the corresponding output and the - corresponding input. + Jacobian (Tensor or nested tuple of Tensors): if there is a single + input and output, this will be a single Tensor containing the + Jacobian for the linearized inputs and output. If one of the two is + a tuple, then the Jacobian will be a tuple of Tensors. If both of + them are tuples, then the Jacobian will be a tuple of tuple of + Tensors where ``Jacobian[i][j]`` will contain the Jacobian of the + ``i``\th output and ``j``\th input and will have as size the + concatenation of the sizes of the corresponding output and the + corresponding input. Example: @@ -476,12 +476,12 @@ def hessian(func, inputs, create_graph=False, strict=False): Defaults to ``False``. Returns: - Hessian (Tensor or a tuple of tuple of Tensors) if there are a single input, - this will be a single Tensor containing the Hessian for the input. 
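As a quick illustration of the return conventions the `jacobian` docstring above describes (a single Tensor for a single input/output pair, tuples otherwise), assuming only the documented `torch.autograd.functional.jacobian` signature:

    import torch
    from torch.autograd.functional import jacobian

    def f(x):
        return x.exp()                    # single input, single output

    J = jacobian(f, torch.randn(2))       # a single Tensor of shape (2, 2)

    def g(x, y):
        return x * y                      # tuple of inputs

    Jx, Jy = jacobian(g, (torch.randn(3), torch.randn(3)))  # tuple of Tensors, each (3, 3)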
- If it is a tuple, then the Hessian will be a tuple of tuples where - ``Hessian[i][j]`` will contain the Hessian of the ``i``\th input - and ``j``\th input with size the sum of the size of the ``i``\th input plus - the size of the ``j``\th input. + Hessian (Tensor or a tuple of tuple of Tensors): if there is a single input, + this will be a single Tensor containing the Hessian for the input. + If it is a tuple, then the Hessian will be a tuple of tuples where + ``Hessian[i][j]`` will contain the Hessian of the ``i``\th input + and ``j``\th input with size the sum of the size of the ``i``\th input plus + the size of the ``j``\th input. Example: @@ -660,7 +660,9 @@ def hvp(func, inputs, v=None, create_graph=False, strict=False): hvp for said inputs, which is the expected mathematical value. Defaults to ``False``. Returns: - func_output (tuple of Tensors or Tensor): output of ``func(inputs)`` + output (tuple): tuple with: + func_output (tuple of Tensors or Tensor): output of ``func(inputs)`` + hvp (tuple of Tensors or Tensor): result of the dot product with the same shape as the inputs. diff --git a/torch/csrc/api/include/torch/fft.h b/torch/csrc/api/include/torch/fft.h index 1c119ed75226..8a094ec9e235 100644 --- a/torch/csrc/api/include/torch/fft.h +++ b/torch/csrc/api/include/torch/fft.h @@ -166,4 +166,66 @@ inline Tensor ihfft(const Tensor& self, return torch::fft_ihfft(self, n, dim, norm); } +/// Computes the discrete Fourier Transform sample frequencies for a signal of size n. +/// +/// See https://pytorch.org/docs/master/fft.html#torch.fft.fftfreq +/// +/// Example: +/// ``` +/// auto frequencies = torch::fft::fftfreq(128, torch::kDouble); +/// ``` +inline Tensor fftfreq(int64_t n, double d, const TensorOptions& options={}) { + return torch::fft_fftfreq(n, d, options); +} + +inline Tensor fftfreq(int64_t n, const TensorOptions& options={}) { + return torch::fft_fftfreq(n, /*d=*/1.0, options); +} + +/// Computes the sample frequencies for torch.fft.rfft with a signal of size n. +/// +/// Like torch.fft.rfft, only the positive frequencies are included. +/// See https://pytorch.org/docs/master/fft.html#torch.fft.rfftfreq +/// +/// Example: +/// ``` +/// auto frequencies = torch::fft::rfftfreq(128, torch::kDouble); +/// ``` +inline Tensor rfftfreq(int64_t n, double d, const TensorOptions& options) { + return torch::fft_rfftfreq(n, d, options); +} + +inline Tensor rfftfreq(int64_t n, const TensorOptions& options) { + return torch::fft_rfftfreq(n, /*d=*/1.0, options); +} + +/// Reorders n-dimensional FFT output to have negative frequency terms first, by +/// a torch.roll operation. 
+/// +/// See https://pytorch.org/docs/master/fft.html#torch.fft.fftshift +/// +/// Example: +/// ``` +/// auto x = torch::randn({127, 4}); +/// auto centred_fft = torch::fft::fftshift(torch::fft::fftn(x)); +/// ``` +inline Tensor fftshift(const Tensor& x, c10::optional dim=c10::nullopt) { + return torch::fft_fftshift(x, dim); +} + +/// Inverse of torch.fft.fftshift +/// +/// See https://pytorch.org/docs/master/fft.html#torch.fft.ifftshift +/// +/// Example: +/// ``` +/// auto x = torch::randn({127, 4}); +/// auto shift = torch::fft::fftshift(x) +/// auto unshift = torch::fft::ifftshift(shift); +/// assert(torch::allclose(x, unshift)); +/// ``` +inline Tensor ifftshift(const Tensor& x, c10::optional dim=c10::nullopt) { + return torch::fft_ifftshift(x, dim); +} + }} // torch::fft diff --git a/torch/csrc/autograd/engine.cpp b/torch/csrc/autograd/engine.cpp index 5ddaf4a4855d..e952b0afc772 100644 --- a/torch/csrc/autograd/engine.cpp +++ b/torch/csrc/autograd/engine.cpp @@ -513,12 +513,10 @@ void GraphTask::exec_post_processing() { } void GraphTask::set_exception_without_signal(const std::shared_ptr& fn) { - std::unique_lock lock(mutex_); - if (!has_error_.load()) { + if (!has_error_.exchange(true)) { if (AnomalyMode::is_enabled() && fn) { fn->metadata()->print_stack(fn->name()); } - has_error_ = true; } } diff --git a/torch/csrc/cuda/nccl.cpp b/torch/csrc/cuda/nccl.cpp index 6cef307c7cce..780b129ab922 100644 --- a/torch/csrc/cuda/nccl.cpp +++ b/torch/csrc/cuda/nccl.cpp @@ -21,6 +21,10 @@ ncclComm_t* to_nccl_comm(torch::cuda::nccl::ncclComm_t* var) { return reinterpret_cast(var); } +ncclComm_t to_nccl_comm(torch::cuda::nccl::ncclComm_t var) { + return reinterpret_cast(var); +} + ncclUniqueId* to_nccl_unique_id(torch::cuda::nccl::ncclUniqueId* var) { return reinterpret_cast(var); } @@ -107,16 +111,20 @@ using namespace at; namespace detail { +static inline void NCCL_CHECK(ncclResult_t result) { + NCCL_CHECK(from_nccl_result(result)); +} + struct AutoNcclGroup { AutoNcclGroup() { (c10::cuda::CUDACachingAllocator::getFreeMutex())->lock(); #if defined(NCCL_MAJOR) && (NCCL_MAJOR >= 2) - NCCL_CHECK(from_nccl_result(ncclGroupStart())); + NCCL_CHECK(ncclGroupStart()); #endif } ~AutoNcclGroup() { #if defined(NCCL_MAJOR) && (NCCL_MAJOR >= 2) - NCCL_CHECK(from_nccl_result(ncclGroupEnd())); + NCCL_CHECK(ncclGroupEnd()); #endif (c10::cuda::CUDACachingAllocator::getFreeMutex())->unlock(); } @@ -133,8 +141,8 @@ struct NcclCommList { int ndevices; NcclCommList(const std::vector& devices) : comms(new ncclComm_t[devices.size()]), ndevices(devices.size()) { - NCCL_CHECK(from_nccl_result( - ncclCommInitAll(to_nccl_comm(comms.get()), devices.size(), devices.data()))); + NCCL_CHECK( + ncclCommInitAll(to_nccl_comm(comms.get()), devices.size(), devices.data())); } NcclCommList(NcclCommList&& foo) = default; ~NcclCommList() { @@ -326,7 +334,7 @@ void get_unique_id(ncclUniqueId& id) { #ifdef USE_NCCL using namespace torch::cuda::nccl::detail; - NCCL_CHECK(from_nccl_result(ncclGetUniqueId(to_nccl_unique_id(&id)))); + NCCL_CHECK(ncclGetUniqueId(to_nccl_unique_id(&id))); #else AT_ERROR("PyTorch built without NCCL support"); #endif @@ -337,11 +345,11 @@ ncclComm_t comm_init_rank(int nranks, const ncclUniqueId& comm_id, int rank) { using namespace torch::cuda::nccl::detail; ncclComm_t comm; ncclUniqueId id = comm_id; - NCCL_CHECK(from_nccl_result(ncclCommInitRank( + NCCL_CHECK(ncclCommInitRank( to_nccl_comm(&comm), nranks, *(to_nccl_unique_id(&id)), - rank))); + rank)); return comm; #else return nullptr; @@ -362,8 +370,7 @@ 
void comm_destroy(ncclComm_t comm) #ifdef USE_NCCL using namespace torch::cuda::nccl::detail; - NCCL_CHECK(from_nccl_result(ncclCommDestroy( - *(to_nccl_comm(&comm))))); + NCCL_CHECK(ncclCommDestroy(to_nccl_comm(comm))); #endif } @@ -420,8 +427,8 @@ void broadcast( count_max, ")"); ncclComm_t comm = comms[i]; - NCCL_CHECK(from_nccl_result(ncclBcast( - tensors[i].data_ptr(), numel, data_type, 0, *(to_nccl_comm(&comm)), stream))); + NCCL_CHECK(ncclBcast( + tensors[i].data_ptr(), numel, data_type, 0, to_nccl_comm(comm), stream)); } #else AT_ERROR("PyTorch built without NCCL support"); @@ -460,15 +467,15 @@ void reduce( : streams[i]->stream(); ncclComm_t comm = comms_ref[i]; - NCCL_CHECK(from_nccl_result(ncclReduce( + NCCL_CHECK(ncclReduce( inputs[i].data_ptr(), root == i ? output.data_ptr() : nullptr, count, data_type, to_nccl_red_op(op), root, - *(to_nccl_comm(&comm)), - stream))); + to_nccl_comm(comm), + stream)); } #else AT_ERROR("PyTorch built without NCCL support"); @@ -512,14 +519,14 @@ void all_reduce( : streams[i]->stream(); ncclComm_t comm = comms_ref[i]; - NCCL_CHECK(from_nccl_result(ncclAllReduce( + NCCL_CHECK(ncclAllReduce( inputs[i].data_ptr(), outputs[i].data_ptr(), count, data_type, to_nccl_red_op(op), - *(to_nccl_comm(&comm)), - stream))); + to_nccl_comm(comm), + stream)); } #else AT_ERROR("PyTorch built without NCCL support"); @@ -554,14 +561,14 @@ void reduce_scatter( : streams[i]->stream(); ncclComm_t comm = comms_ref[i]; - NCCL_CHECK(from_nccl_result(ncclReduceScatter( + NCCL_CHECK(ncclReduceScatter( inputs[i].data_ptr(), outputs[i].data_ptr(), count, data_type, to_nccl_red_op(op), - *(to_nccl_comm(&comm)), - stream))); + to_nccl_comm(comm), + stream)); } #else AT_ERROR("PyTorch built without NCCL support"); @@ -596,21 +603,21 @@ void all_gather( ncclComm_t comm = comms_ref[i]; #if defined(NCCL_MAJOR) && (NCCL_MAJOR >= 2) - NCCL_CHECK(from_nccl_result(ncclAllGather( + NCCL_CHECK(ncclAllGather( inputs[i].data_ptr(), outputs[i].data_ptr(), count, data_type, - *(to_nccl_comm(&comm)), - stream))); + to_nccl_comm(comm), + stream)); #else - NCCL_CHECK(from_nccl_result(ncclAllGather( + NCCL_CHECK(ncclAllGather( inputs[i].data_ptr(), count, data_type, outputs[i].data_ptr(), - *(to_nccl_comm(&comm)), - stream))); + to_nccl_comm(comm), + stream)); #endif } #else diff --git a/torch/csrc/distributed/c10d/c10d_frontend.h b/torch/csrc/distributed/c10d/c10d_frontend.h index 9ff4b69999c7..816c8d9fe473 100644 --- a/torch/csrc/distributed/c10d/c10d_frontend.h +++ b/torch/csrc/distributed/c10d/c10d_frontend.h @@ -1,86 +1,226 @@ #pragma once -#include -#include #include #include +#include +#include +#include +#include +#include #include #include -#include -#include namespace c10d { class Backend { - public: - // Maps to Backend.__new__ in Python. - static std::string get(std::string); + public: + // Maps to Backend.__new__ in Python. + static std::string get(std::string); - // TODO: How to support registering third_party backend? - static void registerBackend(); + // TODO: How to support registering third_party backend? + static void registerBackend(); - private: - // TODO: Should this be an enum list instead since this set doesn't - // change at all. - std::unordered_set registered_backends_; + private: + // TODO: Should this be an enum list instead since this set doesn't + // change at all. 
+ std::unordered_set registered_backends_; }; -class DistributedC10d{ - public: - void initProcessGroup( - const std::string& backend, - const std::string& init_method, - const std::chrono::milliseconds& timeout, - int64_t world_size, - int64_t rank, - std::shared_ptr store, - const std::string& group_name); - - void destroyProcessGroup(std::shared_ptr group); - int64_t getRank(std::shared_ptr group); - int64_t getWorldSize(std::shared_ptr group); - - ProcessGroup::Work isend(at::Tensor tensor, int64_t dst, std::shared_ptr group, c10::optional tag); - ProcessGroup::Work irecv(at::Tensor tensor, int64_t src, std::shared_ptr group, c10::optional tag); - - private: - DistributedC10d(){}; - - bool rankNotInGroup(std::shared_ptr group) const; - int64_t getGroupRank( - std::shared_ptr group, - const int64_t rank) const; - int64_t getGlobalRank( - std::shared_ptr group, - const int64_t global_rank) const; - void checkDefaultPg() const; - int64_t getGroupSize(std::shared_ptr group) const; - int64_t getBackend(std::shared_ptr group); - - std::string backend_; - // TODO: Ask Alex what kind of equality we need. It determine whether we - // need to use ProcessGroup or ProcesGroup* as key. - std::unordered_map< - std::shared_ptr, - std::pair, std::shared_ptr>> - pg_map_; - - // Note, this is different mapping relationship than original Python - // implementation. - std::unordered_map, std::string> pg_names_; - - // Value is global_rank:group_rank mapping. - std::unordered_map, std::vector> - pg_group_ranks_; - - std::shared_ptr default_pg_; - - // Default value should be "env://" - std::string default_pg_init_method_; - - int64_t group_count_; +class DistributedC10d { + public: + void initProcessGroup( + const std::string& backend, + const std::string& init_method, + const std::chrono::milliseconds& timeout, + int64_t world_size, + int64_t rank, + std::shared_ptr store, + const std::string& group_name); + + void destroyProcessGroup(std::shared_ptr group); + int64_t getRank(std::shared_ptr group); + int64_t getWorldSize(std::shared_ptr group); + + ProcessGroup::Work isend( + at::Tensor tensor, + int64_t dst, + std::shared_ptr group, + c10::optional tag); + + ProcessGroup::Work irecv( + at::Tensor tensor, + int64_t src, + std::shared_ptr group, + c10::optional tag); + + ProcessGroup::Work send( + at::Tensor tensor, + int64_t dst, + std::shared_ptr group, + c10::optional tag); + + ProcessGroup::Work recv( + at::Tensor tensor, + int64_t src, + std::shared_ptr group, + c10::optional tag); + + c10::optional broadcastMultiGPU( + std::vector tensor_list, + int64_t src, + std::shared_ptr group, + bool async_op, + int64_t src_tensor); + + c10::optional broadcast( + at::Tensor tensor, + int64_t src, + std::shared_ptr group, + bool async_op); + + c10::optional allReduceMultiGPU( + std::vector& tensor_list, + ReduceOp op, + std::shared_ptr group, + bool async_op); + + c10::optional allReduce( + at::Tensor tensor, + ReduceOp op, + std::shred_ptr group, + bool async_op); + + c10::optional allReduceCoalesced( + at::Tensor tensor, + ReduceOp op, + std::shred_ptr group, + bool async_op); + + c10::optional reduceMultiGPU( + std::vector& tensor_list, + int64_t dst, + ReduceOp op, + std::shared_ptr group, + bool async_op, + int64_t dst_tensor); + + c10::optional reduce( + at::Tensor tensor, + int64_t dst, + ReduceOp op, + std::shared_ptr& group, + bool async_op); + + c10::optional allGatherMultiGPU( + std::vector>& output_tensor_lists, + const std::vector& input_tensor_list, + std::shared_ptr group, + bool async_op); + + 
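The `DistributedC10d` methods declared in this header mirror the collectives already exposed from Python via `torch.distributed`. For orientation, a minimal Python-side equivalent of one of them; this assumes a Gloo build, one launched process per rank, and the usual `env://` rendezvous variables:

    import torch
    import torch.distributed as dist

    # Requires RANK, WORLD_SIZE, MASTER_ADDR and MASTER_PORT in the environment.
    dist.init_process_group(backend="gloo", init_method="env://")
    t = torch.ones(4) * dist.get_rank()
    dist.all_reduce(t, op=dist.ReduceOp.SUM)   # every rank now holds the sum over ranks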
// TODO TODO following APIs take python objects and unpickle them, how do we support these? + // ProcessGroup::Work allGatherObject() + // ProcessGroup::Work gatherObject() + // ProcessGroup::Work broadcastObjectList() + + c10::optional allGather( + std::vector& tensor_list, + at::Tensor tensor, + std::shared_ptr group, + bool async_op); + + c10::optional allGatherCoalesced( + std::vector>& output_tensor_lists, + std::vector& input_tensor_list, + std::shared_ptr group, + bool async_op); + + c10::optional gather( + at::Tensor tensor, + std::vector& gather_list, + int64_t dst, + std::shared_ptr group, + bool async_op); + + c10::optional scatter( + at::Tensor tensor, + std::vector& scatter_list, + int64_t dst, + std::shared_ptr group, + bool async_op); + + ProcessGroup::Work reduceScatterMultiGPU( + std::vector& output_tensor_list, + const std::vector>& input_tensor_lists, + ReduceOp op, + std::shared_ptr group, + bool async_op); + + ProcessGroup::Work reduceScatter( + at::Tensor output, + const std::vector& input_list, + ReduceOp op, + std::shared_ptr group, + bool async_op); + + ProcessGroup::Work allToAllSingle( + at::Tensor output, + at::Tensor input, + const std::vector& output_split_sizes, + const std::vector& input_split_sizes, + std::shared_ptr group, + bool async_op); + + ProcessGroup::Work allToAll( + std::vector& output_tensor_list, + const std::vector& input_tensor_list, + std::shared_ptr group, + bool async_op); + + ProcessGroup::Work barrier( + std::shared_ptr group, + bool async_op); + + std::shared_ptr newGroup( + std::vector ranks, + std::chrono::milliseconds timeout, + Backend backend); + + private: + DistributedC10d(){}; + + bool rankNotInGroup(std::shared_ptr group) const; + int64_t getGroupRank(std::shared_ptr group, const int64_t rank) + const; + int64_t getGlobalRank( + std::shared_ptr group, + const int64_t global_rank) const; + void checkDefaultPg() const; + int64_t getGroupSize(std::shared_ptr group) const; + int64_t getBackend(std::shared_ptr group); + + std::string backend_; + // TODO: Ask Alex what kind of equality we need. It determine whether we + // need to use ProcessGroup or ProcesGroup* as key. + std::unordered_map< + std::shared_ptr, + std::pair, std::shared_ptr>> + pg_map_; + + // Note, this is different mapping relationship than original Python + // implementation. + std::unordered_map, std::string> pg_names_; + + // Value is global_rank:group_rank mapping. + std::unordered_map, std::vector> + pg_group_ranks_; + + std::shared_ptr default_pg_; + + // Default value should be "env://" + std::string default_pg_init_method_; + + int64_t group_count_; }; - } // namespace c10d diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index 38a1811692c2..d15ea9d23412 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -949,6 +949,12 @@ that adds a prefix to each key inserted to the store. 
.def(py::init<>()) .def_readwrite("is_high_priority", &::c10d::ProcessGroupNCCL::Options::isHighPriorityStream) .def_readwrite("op_timeout", &::c10d::ProcessGroupNCCL::Options::opTimeout); + processGroupNCCL.def_static("_group_start", []() { + ::c10d::ProcessGroupNCCL::groupStart(); + }); + processGroupNCCL.def_static("_group_end", []() { + ::c10d::ProcessGroupNCCL::groupEnd(); + }); #endif #ifdef USE_C10D_MPI diff --git a/torch/csrc/jit/codegen/cuda/executor.cpp b/torch/csrc/jit/codegen/cuda/executor.cpp index f33079bcbab5..a0df3c784778 100644 --- a/torch/csrc/jit/codegen/cuda/executor.cpp +++ b/torch/csrc/jit/codegen/cuda/executor.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/torch/csrc/jit/codegen/cuda/executor_utils.cpp b/torch/csrc/jit/codegen/cuda/executor_utils.cpp index 9670968b8fe1..af4e127cc548 100644 --- a/torch/csrc/jit/codegen/cuda/executor_utils.cpp +++ b/torch/csrc/jit/codegen/cuda/executor_utils.cpp @@ -1,5 +1,6 @@ #include #include +#include #include diff --git a/torch/csrc/jit/codegen/cuda/executor_utils.h b/torch/csrc/jit/codegen/cuda/executor_utils.h index 76b8a9a145f1..b306cf04da0a 100644 --- a/torch/csrc/jit/codegen/cuda/executor_utils.h +++ b/torch/csrc/jit/codegen/cuda/executor_utils.h @@ -1,11 +1,12 @@ #pragma once #include -#include #include #include +#include + #include #include diff --git a/torch/csrc/jit/python/python_ir.cpp b/torch/csrc/jit/python/python_ir.cpp index c5889144bd1f..78d11e79eb03 100644 --- a/torch/csrc/jit/python/python_ir.cpp +++ b/torch/csrc/jit/python/python_ir.cpp @@ -747,7 +747,8 @@ void initPythonIRBindings(PyObject* module_) { py::class_>(m, "FloatType") .def_static("get", &FloatType::get); py::class_>(m, "TensorType") - .def_static("get", &TensorType::get); + .def_static("get", &TensorType::get) + .def_static("getInferred", &TensorType::getInferred); py::class_>(m, "BoolType") .def_static("get", &BoolType::get); py::class_>(m, "StringType") diff --git a/torch/distributed/__init__.py b/torch/distributed/__init__.py index ba5ec8bdb5fc..44b7876c4787 100644 --- a/torch/distributed/__init__.py +++ b/torch/distributed/__init__.py @@ -25,3 +25,7 @@ def is_available(): # this. from .distributed_c10d import _backend + + # TODO: remove this once CI issue is resolved + # https://github.com/pytorch/pytorch/issues/42517 + from .distributed_c10d import _P2POp, _batch_isend_irecv diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py index ae4338cd28fc..a125d8a1204b 100644 --- a/torch/distributed/distributed_c10d.py +++ b/torch/distributed/distributed_c10d.py @@ -1,6 +1,7 @@ import pickle import torch import warnings +import contextlib from torch._six import string_classes from datetime import timedelta @@ -159,8 +160,7 @@ class GroupMember(object): def _rank_not_in_group(group): """ - Helper that checks if the current process's rank is not in a given group - + Helper that checks if the current process's rank is not in a given group. """ if group == GroupMember.WORLD: return False @@ -170,8 +170,7 @@ def _rank_not_in_group(group): def _get_group_rank(group, rank): """ Helper that gets a given group's local rank in the group from a given global - rank - + rank. """ if group is GroupMember.WORLD: raise RuntimeError("group.WORLD does not have local rank to global " @@ -188,8 +187,7 @@ def _get_group_rank(group, rank): def _get_global_rank(group, group_rank): """ Helper that gets a given group's global rank from a given local rank in the - group - + group. 
""" if group is GroupMember.WORLD: raise RuntimeError("group.WORLD does not have local rank to global " @@ -204,8 +202,7 @@ def _get_global_rank(group, group_rank): def _check_default_pg(): """ Helper that checks if the default ProcessGroup has been initialized, with - assertion - + assertion. """ assert _default_pg is not None, \ "Default process group is not initialized" @@ -213,8 +210,7 @@ def _check_default_pg(): def _get_group_size(group): """ - Helper that gets a given group's world size - + Helper that gets a given group's world size. """ if group is GroupMember.WORLD: _check_default_pg() @@ -227,7 +223,6 @@ def _get_group_size(group): def _check_single_tensor(param, param_name): """ Helper to check that the parameter ``param_name`` is a single tensor. - """ if not isinstance(param, torch.Tensor): raise RuntimeError("Invalid function argument. Expected parameter `{}` " @@ -237,7 +232,6 @@ def _check_single_tensor(param, param_name): def _check_tensor_list(param, param_name): """ Helper to check that the parameter ``param_name`` is a list of tensors. - """ if not isinstance(param, list) or \ not all(isinstance(p, torch.Tensor) for p in param): @@ -245,10 +239,34 @@ def _check_tensor_list(param, param_name): "to be of type List[torch.Tensor].".format(param_name)) +def _check_op(op): + """ + Helper to check that the ``op`` is either isend or irecv. + """ + if op not in [isend, irecv]: + raise RuntimeError("Invalid ``op``. Expected ``op`` " + "to be of type ``torch.distributed.isend`` or " + "``torch.distributed.irecv``.") + +def _check_p2p_op_list(p2p_op_list): + """ + Helper to check that the ``p2p_op_list`` is a list of _P2POp instances and + all ops use the same backend. + """ + if not isinstance(p2p_op_list, list) or \ + not all(isinstance(p2p_op, _P2POp) for p2p_op in p2p_op_list): + raise RuntimeError("Invalid ``p2p_op_list``. Each op is expected to " + "to be of type ``torch.distributed._P2POp``.") + + + backend = get_backend(p2p_op_list[0].group) + if not all(backend == get_backend(p2p_op.group) for p2p_op in p2p_op_list): + raise RuntimeError("All groups need to use the same backend.") + + def is_mpi_available(): """ Checks if the MPI backend is available. - """ return _MPI_AVAILABLE @@ -256,7 +274,6 @@ def is_mpi_available(): def is_nccl_available(): """ Checks if the NCCL backend is available. - """ return _NCCL_AVAILABLE @@ -264,7 +281,6 @@ def is_nccl_available(): def is_gloo_available(): """ Checks if the Gloo backend is available. - """ return _GLOO_AVAILABLE @@ -272,7 +288,6 @@ def is_gloo_available(): def is_initialized(): """ Checking if the default process group has been initialized - """ return _default_pg is not None @@ -280,7 +295,6 @@ def is_initialized(): def _get_default_group(): """ Getting the default process group created by init_process_group - """ if not is_initialized(): raise RuntimeError("Default process group has not been initialized, " @@ -291,7 +305,6 @@ def _get_default_group(): def _get_default_store(): """ Getting the default store created by init_process_group - """ if not is_initialized(): raise RuntimeError("Default process group has not been initialized, " @@ -757,6 +770,94 @@ def recv(tensor, return src +class _P2POp(object): + """ + A class to build point-to-point operations for ``_batch_isend_irecv``. + + This class builds the type of P2P operation, communication buffer, peer rank, + Process Group group, and tag. Instances of this class will be passed to + ``_batch_isend_irecv`` for point-to-point communications. 
+ + Arguments: + op (callable): A function to send data to or receive data from a peer process. + The type of ``op`` is either ``torch.distributed.isend`` or + ``torch.distributed.irecv``. + tensor (Tensor): Tensor to send or receive. + peer (int): Destination or source rank. + group (ProcessGroup, optional): The process group to work on. + tag (int, optional): Tag to match send with recv. + """ + def __init__(self, op, tensor, peer, group=group.WORLD, tag=0): + self.op = op + self.tensor = tensor + self.peer = peer + self.group = group + self.tag = tag + + def __new__(cls, op, tensor, peer, group=group.WORLD, tag=0): + _check_op(op) + _check_single_tensor(tensor, "tensor") + return object.__new__(cls) + + +@contextlib.contextmanager +def _batch_p2p_manager(backend): + if backend == Backend.NCCL: + ProcessGroupNCCL._group_start() + try: + yield + finally: + if backend == Backend.NCCL: + ProcessGroupNCCL._group_end() + + +def _batch_isend_irecv(p2p_op_list): + """ + Send or Receive a batch of tensors asynchronously and return a list of requests. + + Process each of the operations in p2p_op_list and return the corresponding + requests. NCCL and Gloo backend are currently supported. + + Arguments: + p2p_op_list: A list of point-to-point operations(type of each operator is + ``torch.distributed._P2POp``). The order of the isend/irecv in the list + matters and it needs to match with corresponding isend/irecv on the + remote end. + + Returns: + A list of distributed request objects returned by calling the corresponding + op in the op_list. + + Examples: + >>> send_tensor = torch.arange(2) + 2 * rank + >>> recv_tensor = torch.randn(2) + >>> send_op = dist._P2POp(dist.isend, send_tensor, (rank + 1)%world_size) + >>> recv_op = dist._P2POp(dist.irecv, recv_tensor, (rank + 1)%world_size) + >>> reqs = _batch_isend_irecv([send_op, recv_op]) + >>> for req in reqs: + >>> req.wait() + >>> recv_tensor + tensor([2, 3]) # Rank 0 + tensor([0, 1]) # Rank 1 + """ + _check_p2p_op_list(p2p_op_list) + backend = get_backend(p2p_op_list[0].group) + reqs = [] + with _batch_p2p_manager(backend): + for p2p_op in p2p_op_list: + op = p2p_op.op + tensor = p2p_op.tensor + peer = p2p_op.peer + curr_group = p2p_op.group + tag = p2p_op.tag + + ret = op(tensor, peer, curr_group, tag) + + if ret is not None: + reqs.append(ret) + return reqs + + def broadcast_multigpu(tensor_list, src, group=group.WORLD, diff --git a/torch/distributions/categorical.py b/torch/distributions/categorical.py index 01f3dd520174..319d2dd01b66 100644 --- a/torch/distributions/categorical.py +++ b/torch/distributions/categorical.py @@ -16,14 +16,14 @@ class Categorical(Distribution): Samples are integers from :math:`\{0, \ldots, K-1\}` where `K` is ``probs.size(-1)``. - If :attr:`probs` is 1D with length-`K`, each element is the relative + If :attr:`probs` is 1-dimensional with length-`K`, each element is the relative probability of sampling the class at that index. - If :attr:`probs` is 2D, it is treated as a batch of relative probability - vectors. + If :attr:`probs` is N-dimensional, the first N-1 dimensions are treated as a batch of + relative probability vectors. .. note:: :attr:`probs` must be non-negative, finite and have a non-zero sum, - and it will be normalized to sum to 1. + and it will be normalized to sum to 1 along the last dimension. 
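A short sketch of the batched-`probs` behaviour the `Categorical` note above describes: each row along the last dimension is renormalized independently.

    import torch
    from torch.distributions import Categorical

    probs = torch.tensor([[1.0, 1.0, 2.0],
                          [4.0, 0.0, 0.0]])   # a batch of two 3-way distributions
    d = Categorical(probs)
    print(d.probs)      # each row renormalized to sum to 1 along the last dimension
    print(d.sample())   # one draw per batch row, shape (2,)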
See also: :func:`torch.multinomial` diff --git a/torch/fft/__init__.py b/torch/fft/__init__.py index 3e4bcc35464b..b3ffdb24b4a0 100644 --- a/torch/fft/__init__.py +++ b/torch/fft/__init__.py @@ -2,6 +2,12 @@ import torch from torch._C import _add_docstr, _fft # type: ignore +from torch._torch_docs import factory_common_args + +__all__ = ['fft', 'ifft', 'fftn', 'ifftn', + 'rfft', 'irfft', 'rfftn', 'irfftn', 'hfft', 'ihfft', + 'fftfreq', 'rfftfreq', 'fftshift', 'ifftshift', + 'Tensor'] Tensor = torch.Tensor @@ -549,3 +555,180 @@ tensor([ 2.0000+-0.0000j, -0.5000-0.6882j, -0.5000-0.1625j, -0.5000+0.1625j, -0.5000+0.6882j]) """) + +fftfreq = _add_docstr(_fft.fft_fftfreq, r""" +fftfreq(n, d=1.0, *, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + +Computes the discrete Fourier Transform sample frequencies for a signal of size :attr:`n`. + +Note: + By convention, :func:`~torch.fft.fft` returns positive frequency terms + first, followed by the negative frequencies in reverse order, so that + ``f[-i]`` for all :math:`0 < i \leq n/2`` in Python gives the negative + frequency terms. For an FFT of length :attr:`n` and with inputs spaced in + length unit :attr:`d`, the frequencies are:: + + f = [0, 1, ..., (n - 1) // 2, -(n // 2), ..., -1] / (d * n) + +Note: + For even lengths, the Nyquist frequency at ``f[n/2]`` can be thought of as + either negative or positive. :func:`~torch.fft.fftfreq` follows NumPy's + convention of taking it to be negative. + +Args: + n (int): the FFT length + d (float, optional): The sampling length scale. + The spacing between individual samples of the FFT input. + The default assumes unit spacing, dividing that result by the actual + spacing gives the result in physical frequency units. + +Keyword Args: + {dtype} + {layout} + {device} + {requires_grad} + +Example: + + >>> import torch.fft + >>> torch.fft.fftfreq(5) + tensor([ 0.0000, 0.2000, 0.4000, -0.4000, -0.2000]) + + For even input, we can see the Nyquist frequency at ``f[2]`` is given as + negative: + + >>> torch.fft.fftfreq(4) + tensor([ 0.0000, 0.2500, -0.5000, -0.2500]) + +""".format(**factory_common_args)) + +rfftfreq = _add_docstr(_fft.fft_rfftfreq, r""" +rfftfreq(n, d=1.0, *, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor + +Computes the sample frequencies for :func:`~torch.fft.rfft` with a signal of size :attr:`n`. + +Note: + :func:`~torch.fft.rfft` returns Hermitian one-sided output, so only the + positive frequency terms are returned. For a real FFT of length :attr:`n` + and with inputs spaced in length unit :attr:`d`, the frequencies are:: + + f = torch.arange((n + 1) // 2) / (d * n) + +Note: + For even lengths, the Nyquist frequency at ``f[n/2]`` can be thought of as + either negative or positive. Unlike :func:`~torch.fft.fftfreq`, + :func:`~torch.fft.rfftfreq` always returns it as positive. + +Args: + n (int): the real FFT length + d (float, optional): The sampling length scale. + The spacing between individual samples of the FFT input. + The default assumes unit spacing, dividing that result by the actual + spacing gives the result in physical frequency units. 
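For instance, under the `d` convention documented above, 8 samples spaced 0.5 seconds apart give frequencies in Hz; `rfftfreq` keeps only the non-negative half, with the Nyquist term positive (reading seconds/Hz into `d` is just an illustration of "physical frequency units"):

    import torch

    torch.fft.fftfreq(8, d=0.5)
    # tensor([ 0.0000,  0.2500,  0.5000,  0.7500, -1.0000, -0.7500, -0.5000, -0.2500])

    torch.fft.rfftfreq(8, d=0.5)
    # tensor([0.0000, 0.2500, 0.5000, 0.7500, 1.0000])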
+ +Keyword Args: + {dtype} + {layout} + {device} + {requires_grad} + +Example: + + >>> import torch.fft + >>> torch.fft.rfftfreq(5) + tensor([ 0.0000, 0.2000, 0.4000]) + + >>> torch.fft.rfftfreq(4) + tensor([ 0.0000, 0.2500, 0.5000]) + + Compared to the output from :func:`~torch.fft.fftfreq`, we see that the + Nyquist frequency at ``f[2]`` has changed sign: + >>> torch.fft.fftfreq(4) + tensor([ 0.0000, 0.2500, -0.5000, -0.2500]) + +""".format(**factory_common_args)) + +fftshift = _add_docstr(_fft.fft_fftshift, r""" +fftshift(input, dim=None) -> Tensor + +Reorders n-dimensional FFT data, as provided by :func:`~torch.fft.fftn`, to have +negative frequency terms first. + +Note: + By convention, the FFT returns positive frequency terms first, followed by + the negative frequencies in reverse order, so that ``f[-i]`` for all + :math:`0 < i \leq n/2` in Python gives the negative frequency terms. + :func:`~torch.fft.fftshift` rearranges all frequencies into ascending order + from negative to positive with the zero-frequency term in the center. + +Note: + For even lengths, the Nyquist frequency at ``f[n/2]`` can be thought of as + either negative or positive. :func:`~torch.fft.fftshift` always puts the + Nyquist term at the 0-index. This is the same convention used by + :func:`~torch.fft.fftfreq`. + +Args: + input (Tensor): the tensor in FFT order + dim (int, Tuple[int], optional): The dimensions to rearrange. + Only dimensions specified here will be rearranged, any other dimensions + will be left in their original order. + Default: All dimensions of :attr:`input`. + +Example: + + >>> import torch.fft + >>> f = torch.fft.fftfreq(4) + >>> f + tensor([ 0.0000, 0.2500, -0.5000, -0.2500]) + + >>> torch.fftshift(f) + tensor([-0.5000, -0.2500, 0.0000, 0.2500]) + + Also notice that the Nyquist frequency term at ``f[2]`` was moved to the + beginning of the tensor. + + This also works for multi-dimensional transforms: + >>> x = torch.fft.fftfreq(5, d=1/5) + 0.1 * torch.fft.fftfreq(5, d=1/5).unsqueeze(1) + >>> x + tensor([[ 0.0000, 1.0000, 2.0000, -2.0000, -1.0000], + [ 0.1000, 1.1000, 2.1000, -1.9000, -0.9000], + [ 0.2000, 1.2000, 2.2000, -1.8000, -0.8000], + [-0.2000, 0.8000, 1.8000, -2.2000, -1.2000], + [-0.1000, 0.9000, 1.9000, -2.1000, -1.1000]]) + + >>> torch.fft.fftshift(x) + tensor([[-2.2000, -1.2000, -0.2000, 0.8000, 1.8000], + [-2.1000, -1.1000, -0.1000, 0.9000, 1.9000], + [-2.0000, -1.0000, 0.0000, 1.0000, 2.0000], + [-1.9000, -0.9000, 0.1000, 1.1000, 2.1000], + [-1.8000, -0.8000, 0.2000, 1.2000, 2.2000]]) + +""") + +ifftshift = _add_docstr(_fft.fft_ifftshift, r""" +ifftshift(input, dim=None) -> Tensor + +Inverse of :func:`~torch.fft.fftshift`. + +Args: + input (Tensor): the tensor in FFT order + dim (int, Tuple[int], optional): The dimensions to rearrange. + Only dimensions specified here will be rearranged, any other dimensions + will be left in their original order. + Default: All dimensions of :attr:`input`. 
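As a quick check of the inverse relationship just described, shifting and then unshifting with the fully-qualified `torch.fft.fftshift` / `torch.fft.ifftshift` round-trips exactly:

    import torch

    f = torch.fft.fftfreq(4)                 # tensor([ 0.0000,  0.2500, -0.5000, -0.2500])
    shifted = torch.fft.fftshift(f)          # tensor([-0.5000, -0.2500,  0.0000,  0.2500])
    assert torch.equal(torch.fft.ifftshift(shifted), f)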
+ifftshift = _add_docstr(_fft.fft_ifftshift, r"""
+ifftshift(input, dim=None) -> Tensor
+
+Inverse of :func:`~torch.fft.fftshift`.
+
+Args:
+    input (Tensor): the tensor in FFT order
+    dim (int, Tuple[int], optional): The dimensions to rearrange.
+        Only dimensions specified here will be rearranged; any other dimensions
+        will be left in their original order.
+        Default: All dimensions of :attr:`input`.
+
+Example:
+
+    >>> import torch.fft
+    >>> f = torch.fft.fftfreq(5)
+    >>> f
+    tensor([ 0.0000,  0.2000,  0.4000, -0.4000, -0.2000])
+
+    A round-trip through :func:`~torch.fft.fftshift` and
+    :func:`~torch.fft.ifftshift` gives the same result:
+
+    >>> shifted = torch.fft.fftshift(f)
+    >>> torch.fft.ifftshift(shifted)
+    tensor([ 0.0000,  0.2000,  0.4000, -0.4000, -0.2000])
+
+""")
diff --git a/torch/functional.py b/torch/functional.py
index 8eecf3643035..1a72aaf18a30 100644
--- a/torch/functional.py
+++ b/torch/functional.py
@@ -884,7 +884,7 @@ def tensordot(a, b, dims=2):
     Args:
         a (Tensor): Left tensor to contract
         b (Tensor): Right tensor to contract
-        dims (int or tuple of two lists of integers): number of dimensions to
+        dims (int or Tuple[List[int], List[int]] containing two lists): number of dimensions to
            contract or explicit lists of dimensions for :attr:`a` and
            :attr:`b` respectively
@@ -919,6 +919,12 @@ def tensordot(a, b, dims=2):
                 [ 3.3161,  0.0704,  5.0187, -0.4079, -4.3126,  4.8744],
                 [ 0.8223,  3.9445,  3.2168, -0.2400,  3.4117,  1.7780]])

+        >>> a = torch.randn(3, 5, 4, 6)
+        >>> b = torch.randn(6, 4, 5, 3)
+        >>> torch.tensordot(a, b, dims=([2, 1, 3], [1, 2, 0]))
+        tensor([[  7.7193,  -2.4867, -10.3204],
+                [  1.5513, -14.4737,  -6.5113],
+                [ -0.2850,   4.2573,  -3.5997]])
     """
     if not torch.jit.is_scripting():
         if (type(a) is not Tensor or type(b) is not Tensor) and has_torch_function((a, b)):
diff --git a/torch/fx/__init__.py b/torch/fx/__init__.py
index 30e65c191a30..792a905432a5 100644
--- a/torch/fx/__init__.py
+++ b/torch/fx/__init__.py
@@ -85,6 +85,6 @@ def forward(self, x):
 from .graph_module import GraphModule
 from .symbolic_trace import symbolic_trace, Tracer
-from .graph import Graph, map_arg
-from .node import Node
+from .graph import Graph
+from .node import Node, map_arg
 from .proxy import Proxy
diff --git a/torch/fx/experimental/GraphManipulation.py b/torch/fx/experimental/GraphManipulation.py
index 10a0c86e2249..0c5d18aa4fb2 100644
--- a/torch/fx/experimental/GraphManipulation.py
+++ b/torch/fx/experimental/GraphManipulation.py
@@ -1,8 +1,8 @@
 from typing import Dict, List
 from torch.fx.graph_module import GraphModule
 from typing import Any
-from torch.fx.node import Node, Target
-from torch.fx.graph import Graph, map_arg
+from torch.fx.node import Node, Target, map_arg
+from torch.fx.graph import Graph

 """find_use is used to find out if the node is another node's arg or kwargs."""
diff --git a/torch/fx/experimental/shape_prop.py b/torch/fx/experimental/shape_prop.py
index 01374727a447..52264796c7d4 100644
--- a/torch/fx/experimental/shape_prop.py
+++ b/torch/fx/experimental/shape_prop.py
@@ -15,7 +15,7 @@ def propagate(self, *args):
         env : Dict[str, Node] = {}

         def load_arg(a):
-            return torch.fx.graph.map_arg(a, lambda n: env[n.name])
+            return torch.fx.node.map_arg(a, lambda n: env[n.name])

         def fetch_attr(target : str):
             target_atoms = target.split('.')
diff --git a/torch/fx/graph.py b/torch/fx/graph.py
index f53f96db0174..ed7618372b57 100644
--- a/torch/fx/graph.py
+++ b/torch/fx/graph.py
@@ -1,4 +1,4 @@
-from .node import Node, Argument, Target
+from .node import Node, Argument, Target, map_arg
 from typing import Callable, Any, List, Dict, Optional, Tuple, Set
 import builtins
@@ -52,23 +52,22 @@ def _format_target(base: str, target: str) -> str:
         r = f'{r}.{e}'
     return r

-def map_arg(a: Argument, fn: Callable[[Node], Argument]) -> Argument:
-    """ apply fn to each Node appearing arg. arg may be a list, tuple, slice, or dict with string keys.
""" - if isinstance(a, (tuple, list)): - return type(a)(map_arg(elem, fn) for elem in a) - elif isinstance(a, dict): - return {k: map_arg(v, fn) for k, v in a.items()} - elif isinstance(a, slice): - return slice(map_arg(a.start, fn), map_arg(a.stop, fn), map_arg(a.step, fn)) - elif isinstance(a, Node): - return fn(a) - else: - return a +class insert_before: + def __init__(self, n : Node): + self.n = n + + def __enter__(self): + self.orig_insert_point = self.n.graph._insert_point + self.n.graph._insert_point = self.n + + def __exit__(self, type, value, tb): + self.n.graph._insert_point = self.orig_insert_point class Graph: def __init__(self): self._nodes : List[Node] = [] self._used_names : Dict[str, int] = {} # base name -> number + self._insert_point : Optional[Node] = None @property def nodes(self): @@ -105,9 +104,38 @@ def create_node(self, op: str, target: Target, self._mark_uses(kwargs) sanitized_name = self._register_name_used(name) if name is not None else self._name(target) n = Node(self, sanitized_name, op, target, args, kwargs) - self._nodes.append(n) + if self._insert_point is not None: + before_idx = self._nodes.index(self._insert_point) + self._nodes.insert(before_idx, n) + else: + self._nodes.append(n) return n + def move_node_before(self, to_move : Node, before : Node): + """ + Move node `to_move` before `before` in the Graph. Both `Node` arguments + must be present in this graph. + """ + # TODO: Computationally inefficient + if to_move.graph != self or before.graph != self: + raise RuntimeError('Node arguments must belong to this Graph!') + node_idx = self._nodes.index(to_move) + before_idx = self._nodes.index(before) + self._nodes.insert(before_idx, self._nodes.pop(node_idx)) + + + def erase_node(self, to_erase : Node): + """ + Erases the node `to_erase` from the `Graph`. Throws an exception if + there are still uses of that node in the `Graph`. + """ + if to_erase.uses > 0: + raise RuntimeError(f'Tried to erase Node {to_erase} but it still had {to_erase.uses} uses in the graph!') + + node_indices = [i for i, n in enumerate(self._nodes) if n == to_erase] + for idx in reversed(node_indices): + self._nodes.pop(idx) + # sugar for above when you know the op def placeholder(self, name: str) -> Node: return self.create_node('placeholder', name) @@ -202,6 +230,7 @@ def _register_name_used(self, op : str) -> str: def python_code(self, root_module: str) -> str: free_vars: List[str] = [] + modules_used : Set[str] = set() body: List[str] = [] for node in self._nodes: if node.op == 'placeholder': @@ -225,6 +254,9 @@ def python_code(self, root_module: str) -> str: body.append(f'{node.name} = {magic_methods[node.target.__name__].format(*(repr(a) for a in node.args))}\n') continue qualified_name = _qualified_name(node.target) + if '.' 
in qualified_name: + module_name = qualified_name.split('.', maxsplit=1)[0] + modules_used.add(module_name) if qualified_name == 'getattr' and \ isinstance(node.args, tuple) and \ isinstance(node.args[1], str) and \ @@ -247,9 +279,12 @@ def python_code(self, root_module: str) -> str: continue raise NotImplementedError(f'node: {node.op} {node.target}') + import_block = '\n'.join(f'import {name}' for name in sorted(modules_used)) + code = ''.join(body) code = '\n'.join(' ' + line for line in code.split('\n')) + '\n' fn_code = f"""\ +{import_block} def forward(self, {', '.join(free_vars)}): {code} """ diff --git a/torch/fx/graph_module.py b/torch/fx/graph_module.py index bbc5c26a8182..6f72a29be184 100644 --- a/torch/fx/graph_module.py +++ b/torch/fx/graph_module.py @@ -28,9 +28,7 @@ def patched_getline(*args, **kwargs): linecache.getlines = patched_getline def _forward_from_src(src : str): - gbls: Dict[str, Any] = { - 'torch': torch - } + gbls: Dict[str, Any] = {} exec_with_source(src, gbls) return gbls['forward'] @@ -107,7 +105,9 @@ class GraphModule(torch.nn.Module): forward : The Python method generated from `graph` Note that when `graph` is reassigned, `code` and `forward` will be automatically - regenerated. + regenerated. However, if you edit the contents of the `graph` without reassigning + the `graph` attribute itself, you must call `recompile()` to update the generated + code. """ def __new__(cls: 'Type[GraphModule]', *args, **kwargs): # each instance of a graph module needs its own forward method @@ -174,6 +174,14 @@ def graph(self): @graph.setter def graph(self, val) -> None: self._graph = val + self.recompile() + + def recompile(self) -> None: + """ + Recompile this GraphModule from its `graph` attribute. This should be + called after editing the contained `graph`, otherwise the generated + code of this `GraphModule` will be out of date. + """ self.code = self._graph.python_code(root_module='self') cls = type(self) cls.forward = _forward_from_src(self.code) diff --git a/torch/fx/node.py b/torch/fx/node.py index 7bf57cff4dae..53abead5f044 100644 --- a/torch/fx/node.py +++ b/torch/fx/node.py @@ -5,7 +5,6 @@ if TYPE_CHECKING: from .graph import Graph - BaseArgumentTypes = Union[str, int, float, bool, torch.dtype, torch.Tensor] base_types = BaseArgumentTypes.__args__ # type: ignore @@ -35,5 +34,63 @@ def __init__(self, graph: 'Graph', name: str, op: str, target: Target, self.kwargs = kwargs self.uses = 0 + def find_uses(self) -> List['Node']: + """ + Find all nodes that use the value produced by `self`. The complexity of + this function is linear in the number of nodes * number of arguments to + each node. + + Note that len(find_uses()) is not necessarily equal to attribute `uses`. + This node could be used multiple times in the same `Node`. In that case, + the user node would appear once in the return value here, but `uses` would + account for the total number of times this Node is used by the user node. + e.g. 
a node for `x + x` would have two uses for the `x` node, but the + `x + x` node would appear once in the return from `find_uses` + """ + use_nodes : List[Node] = [] + for node in self.graph._nodes: + def record_use(arg_node : Node) -> None: + if arg_node == self and (len(use_nodes) == 0 or use_nodes[-1] != node): + use_nodes.append(node) + map_arg(node.args, record_use) + map_arg(node.kwargs, record_use) + return use_nodes + def __repr__(self) -> str: return self.name + + def replace_all_uses_with(self, replace_with : 'Node') -> List['Node']: + """ + Replace all uses of `self` in the Graph with the Node `replace_with`. + Returns the list of nodes on which this change was made. + """ + use_nodes : List[Node] = self.find_uses() + for use_node in use_nodes: + def maybe_replace_node(n : Node) -> Node: + if n == self: + self.uses -= 1 + return replace_with + else: + return n + new_args = map_arg(use_node.args, maybe_replace_node) + assert isinstance(new_args, tuple) + use_node.args = new_args + new_kwargs = map_arg(use_node.kwargs, maybe_replace_node) + assert isinstance(new_kwargs, dict) + use_node.kwargs = new_kwargs + + return use_nodes + + +def map_arg(a: Argument, fn: Callable[[Node], Argument]) -> Argument: + """ apply fn to each Node appearing arg. arg may be a list, tuple, slice, or dict with string keys. """ + if isinstance(a, (tuple, list)): + return type(a)(map_arg(elem, fn) for elem in a) + elif isinstance(a, dict): + return {k: map_arg(v, fn) for k, v in a.items()} + elif isinstance(a, slice): + return slice(map_arg(a.start, fn), map_arg(a.stop, fn), map_arg(a.step, fn)) + elif isinstance(a, Node): + return fn(a) + else: + return a diff --git a/torch/jit/annotations.py b/torch/jit/annotations.py index d9fce627e52d..81ceea5f58df 100644 --- a/torch/jit/annotations.py +++ b/torch/jit/annotations.py @@ -271,7 +271,7 @@ def get_enum_value_type(e: Type[enum.Enum], loc): def try_ann_to_type(ann, loc): if ann is None: - return TensorType.get() + return TensorType.getInferred() if inspect.isclass(ann) and issubclass(ann, torch.Tensor): return TensorType.get() if is_tuple(ann): diff --git a/torch/jit/frontend.py b/torch/jit/frontend.py index 952f44161c3f..36dccd04b7e3 100644 --- a/torch/jit/frontend.py +++ b/torch/jit/frontend.py @@ -686,11 +686,10 @@ def build_SliceExpr(ctx, base, slice_expr): return SliceExpr(base.range(), lower, upper, step) def build_Index(ctx, base, index_expr): - if isinstance(index_expr.value, ast.Tuple) or \ - isinstance(index_expr.value, ast.List): + if isinstance(index_expr.value, ast.Tuple): raise NotSupportedError(base.range(), "slicing multiple dimensions with " - "sequences not supported yet") + "tuples not supported yet") return build_expr(ctx, index_expr.value) def build_ExtSlice(ctx, base, extslice): diff --git a/torch/lib/c10d/ProcessGroupNCCL.cpp b/torch/lib/c10d/ProcessGroupNCCL.cpp index 23c1731380f5..6e45b8594f9b 100644 --- a/torch/lib/c10d/ProcessGroupNCCL.cpp +++ b/torch/lib/c10d/ProcessGroupNCCL.cpp @@ -107,6 +107,13 @@ std::string getKeyFromDevices(const std::vector& devices) { return deviceList; } +std::string getKeySendRecv(int myRank, int peer) { + int lowRank = myRank < peer ? myRank : peer; + int highRank = myRank < peer ? 
peer : myRank; + std::string sendRecvPair = std::to_string(lowRank) + ":" + std::to_string(highRank); + return sendRecvPair; +} + // Get the list of devices from list of tensors std::vector getDeviceList(const std::vector& tensors) { std::vector res; @@ -232,6 +239,7 @@ const int64_t ProcessGroupNCCL::kWorkCleanupThreadSleepMillis = 1000; constexpr int64_t kWaitForAbortCommStoreKey = 1000; constexpr int64_t kSynchronizeBusyWaitMillis = 10; const int64_t ProcessGroupNCCL::kProcessGroupNCCLOpTimeoutMillis = 10 * 1000; +thread_local uint64_t ProcessGroupNCCL::ncclActiveGroupCounter_ = 0; ProcessGroupNCCL::WorkNCCL::WorkNCCL(const std::vector& devices) : devices_(devices), workStartTime_(std::chrono::steady_clock::now()) { @@ -450,6 +458,8 @@ ProcessGroupNCCL::ProcessGroupNCCL( opTimeout_(options.opTimeout), futureNCCLCallbackStreams_(c10::cuda::device_count()), isHighPriorityStream_(options.isHighPriorityStream) { + TORCH_CHECK(at::cuda::getNumGPUs() != 0, + "ProcessGroupNCCL is only supported with GPUs, no GPUs found!"); try { parseNcclBlockingWait(); } catch (std::exception& e) { @@ -715,7 +725,9 @@ void ProcessGroupNCCL::broadcastUniqueNCCLID(ncclUniqueId* ncclID) { std::vector>& ProcessGroupNCCL::getNCCLComm( const std::string& devicesKey, - const std::vector& devices) { + const std::vector& devices, + NCCLCommType commType, + int p2pRank) { // Sanity check if (devicesKey.empty()) { throw std::runtime_error( @@ -742,7 +754,8 @@ std::vector>& ProcessGroupNCCL::getNCCLComm( // Create the unique NCCL ID and broadcast it ncclUniqueId ncclID; - if (rank_ == 0) { + // For point-to-point communication, lower rank of the two will get unique id. + if (rank_ == 0 || (commType != NCCLCommType::COLL && p2pRank == 0)) { C10D_NCCL_CHECK(ncclGetUniqueId(&ncclID)); } @@ -754,13 +767,41 @@ std::vector>& ProcessGroupNCCL::getNCCLComm( std::vector streamVal; streamVal.reserve(devices.size()); - // Create the NCCL communicators for each GPU + // [Group Start/End Note] This is used to ensure that nccl communicator will be created + // before communication primitives are called. Let's look at this example: + // Using the batch_isend_irecv to send a tensor to a target process. On the sender side, + // the corresponding underlying NCCL calls will look like + // ncclGroupStart() // This is in batch_isend_irecv + // ncclGroupStart() // This is [Note 1] + // ncclCommInitRank() // Inside NCCLComm::create + // ncclSend() + // ncclGroupEnd() // This is [Note 2] + // ncclGroupEnd() // This is in batch_isend_irecv + // With this pattern, the nccl communicator will be created in the last ncclGroupEnd + // which means when ncclSend is processed, the passed communicator argument is NULL which will + // lead to runtime error. So we need to "close" all active nccl groups to ensure + // nccl communicator is actually created before encountering any communication calls. + // This is why we need the following for loop. 
+ for (size_t i = 0; i < ncclActiveGroupCounter_; ++i) { + C10D_NCCL_CHECK(ncclGroupEnd()); + } + + // [Note 1] Create the NCCL communicators for each GPU C10D_NCCL_CHECK(ncclGroupStart()); for (size_t i = 0; i < devices.size(); ++i) { // GPU world size and GPU rank - int numRanks = getSize() * devices.size(); - int rank = getRank() * devices.size() + i; + int numRanks, rank; + + if (commType == NCCLCommType::COLL) { + numRanks = getSize() * devices.size(); + rank = getRank() * devices.size() + i; + } else { + // For point-to-point operation, there are only 2 processes involved so + // the GPU rank is either 0 or 1. + numRanks = 2; + rank = p2pRank; + } // Get the device index int deviceIndex = devices[i].index(); @@ -779,8 +820,14 @@ std::vector>& ProcessGroupNCCL::getNCCLComm( } } + // [Note 2 ] C10D_NCCL_CHECK(ncclGroupEnd()); + // See [Group Start/End Note] + for (size_t i = 0; i < ncclActiveGroupCounter_; ++i) { + C10D_NCCL_CHECK(ncclGroupStart()); + } + ncclStreams_.emplace(devicesKey, std::move(streamVal)); // Note: these events are created with the (default) cudaEventDisableTiming @@ -1008,6 +1055,77 @@ std::shared_ptr ProcessGroupNCCL::collective( return work; } +template +std::shared_ptr ProcessGroupNCCL::pointToPoint( + std::vector& tensors, + Fn fn, + int peer, + NCCLCommType commType, + PreProcess pre, + PostProcess post) { + const auto devices = getDeviceList(tensors); + const auto key = getKeySendRecv(rank_, peer); + int p2pRank = rank_ < peer ? 0 : 1; + auto& ncclComms = getNCCLComm(key, devices, commType, p2pRank); + + // First let NCCL streams wait for input tensors allocation streams + syncStreams(devices, ncclEvents_[key], ncclStreams_[key]); + + // Work itself will create the CUDA events on all GPUs of tensors + auto work = initWork(devices); + + if (commType == NCCLCommType::RECV) { + // Store references to outputs and futureNCCLCallbackStream to be used by + // WorkNCCL::getFuture. + work->outputs_ = std::make_shared>(tensors); + work->futureNCCLCallbackStreams_ = futureNCCLCallbackStreams_; + } + + at::cuda::OptionalCUDAGuard gpuGuard; + + pre(ncclStreams_[key]); + + for (size_t i = 0; i < tensors.size(); ++i) { + gpuGuard.set_index(devices[i].index()); + at::cuda::CUDAStream& ncclStream = ncclStreams_[key][i]; + + // Both send tensor and recv tensor are created on a worker stream and used in + // different ncclStreams. Hence, both must record the ncclStream to + // prevent being freed before the collective finishes. + // + // See [Sync Streams]. + c10::cuda::CUDACachingAllocator::recordStream( + tensors[i].storage().data_ptr(), ncclStream); + } + + { + AutoNcclGroup nccl_group_guard; + for (size_t i = 0; i < tensors.size(); ++i) { + gpuGuard.set_index(devices[i].index()); + at::cuda::CUDAStream& ncclStream = ncclStreams_[key][i]; + // For point-to-point communication, NCCL ranks can only + // be 0 or 1. 
+ int p2pTargetRank = 1 - p2pRank; + C10D_NCCL_CHECK( + fn(tensors[i], ncclComms[i]->getNcclComm(), ncclStream, p2pTargetRank)); + } + } + + post(ncclStreams_[key]); + + // Event should only be recorded after the ncclGroupEnd() + for (size_t i = 0; i < tensors.size(); ++i) { + at::cuda::CUDAStream& ncclStream = ncclStreams_[key][i]; + (*work->cudaEvents_)[i].record(ncclStream); + work->ncclComms_[i] = ncclComms[i]; + work->blockingWait_ = blockingWait_; + work->opTimeout_ = opTimeout_; + work->store_ = store_; + } + + return work; +} + template std::shared_ptr ProcessGroupNCCL::collective( std::vector& inputs, @@ -1021,6 +1139,21 @@ std::shared_ptr ProcessGroupNCCL::collective( [](std::vector&) {}); } +template +std::shared_ptr ProcessGroupNCCL::pointToPoint( + std::vector& tensor, + Fn fn, + int peer, + NCCLCommType type) { + return pointToPoint( + tensor, + fn, + peer, + type, + [](std::vector&) {}, + [](std::vector&) {}); +} + std::shared_ptr ProcessGroupNCCL::allreduce( std::vector& tensors, const AllreduceOptions& opts) { @@ -1296,6 +1429,54 @@ std::shared_ptr ProcessGroupNCCL::alltoall_base( }); } } + +std::shared_ptr ProcessGroupNCCL::send( + std::vector& tensors, + int dstRank, + int /* unused */) { + check_gpu_tensors(tensors); + auto ret = pointToPoint( + tensors, + [&](at::Tensor& input, + ncclComm_t comm, + at::cuda::CUDAStream& stream, + int dst) { + return ncclSend( + input.data_ptr(), + input.numel(), + getNcclDataType(input.scalar_type()), + dst, + comm, + stream.stream()); + }, + dstRank, + NCCLCommType::SEND); + return ret; +} + +std::shared_ptr ProcessGroupNCCL::recv( + std::vector& tensors, + int srcRank, + int /* unused */) { + check_gpu_tensors(tensors); + auto ret= pointToPoint( + tensors, + [&](at::Tensor& output, + ncclComm_t comm, + at::cuda::CUDAStream& stream, + int src) { + return ncclRecv( + output.data_ptr(), + output.numel(), + getNcclDataType(output.scalar_type()), + src, + comm, + stream.stream()); + }, + srcRank, + NCCLCommType::RECV); + return ret; +} #else std::shared_ptr ProcessGroupNCCL::alltoall_base( at::Tensor& /* unused */, @@ -1306,7 +1487,37 @@ std::shared_ptr ProcessGroupNCCL::alltoall_base( throw std::runtime_error( "ProcessGroupNCCL only supports alltoall* for NCCL lib version >= 2.7.0"); } + +std::shared_ptr ProcessGroupNCCL::send( + std::vector& /* unused */, + int /* unused */, + int /* unused */) { + throw std::runtime_error( + "ProcessGroupNCCL only supports send for NCCL lib version >= 2.7.0"); +} + +std::shared_ptr ProcessGroupNCCL::recv( + std::vector& /* unused */, + int /* unused */, + int /* unused */) { + throw std::runtime_error( + "ProcessGroupNCCL only supports recv for NCCL lib version >= 2.7.0"); +} +#endif + +void ProcessGroupNCCL::groupStart() { +#if defined(NCCL_MAJOR) && (NCCL_MAJOR >= 2) + C10D_NCCL_CHECK(ncclGroupStart()); #endif + ++ncclActiveGroupCounter_; +} + +void ProcessGroupNCCL::groupEnd() { +#if defined(NCCL_MAJOR) && (NCCL_MAJOR >= 2) + C10D_NCCL_CHECK(ncclGroupEnd()); +#endif + --ncclActiveGroupCounter_; +} std::shared_ptr ProcessGroupNCCL::alltoall( std::vector& /* unused */, @@ -1329,24 +1540,10 @@ std::shared_ptr ProcessGroupNCCL::scatter( throw std::runtime_error("ProcessGroupNCCL does not support scatter"); } -std::shared_ptr ProcessGroupNCCL::send( - std::vector& /* unused */, - int /* unused */, - int /* unused */) { - throw std::runtime_error("ProcessGroupNCCL does not support send"); -} - -std::shared_ptr ProcessGroupNCCL::recv( - std::vector& /* unused */, - int /* unused */, - int /* unused */) { - 
throw std::runtime_error("ProcessGroupNCCL does not support recv"); -} - std::shared_ptr ProcessGroupNCCL::recvAnysource( std::vector& /* unused */, int /* unused */) { - throw std::runtime_error("ProcessGroupNCCL does not support recv"); + throw std::runtime_error("ProcessGroupNCCL does not support recvAnysource"); } std::shared_ptr ProcessGroupNCCL::allgather_base( diff --git a/torch/lib/c10d/ProcessGroupNCCL.hpp b/torch/lib/c10d/ProcessGroupNCCL.hpp index 06b144438455..b8b3d5aabd35 100644 --- a/torch/lib/c10d/ProcessGroupNCCL.hpp +++ b/torch/lib/c10d/ProcessGroupNCCL.hpp @@ -23,6 +23,13 @@ constexpr const char* NCCL_BLOCKING_WAIT = "NCCL_BLOCKING_WAIT"; // Handling with NCCL. constexpr const char* NCCL_ASYNC_ERROR_HANDLING = "NCCL_ASYNC_ERROR_HANDLING"; +// NCCL Commmunication type +enum class NCCLCommType : std::uint8_t { + SEND = 0, + RECV, + COLL, +}; + // ProcessGroupNCCL implements NCCL bindings for c10d. // // All functions of the class are expected to be called in the same order @@ -420,6 +427,20 @@ class ProcessGroupNCCL : public ProcessGroup { std::vector& inputTensors, const AllToAllOptions& opts = AllToAllOptions()) override; + std::shared_ptr send( + std::vector& tensors, + int dstRank, + int tag) override; + + std::shared_ptr recv( + std::vector& tensors, + int srcRank, + int tag) override; + + static void groupStart(); + + static void groupEnd(); + // Unsupported Ops std::shared_ptr gather( std::vector>& outputTensors, @@ -431,16 +452,6 @@ class ProcessGroupNCCL : public ProcessGroup { std::vector>& inputTensors, const ScatterOptions& opts = ScatterOptions()) override; - std::shared_ptr send( - std::vector& tensors, - int dstRank, - int tag) override; - - std::shared_ptr recv( - std::vector& tensors, - int srcRank, - int tag) override; - std::shared_ptr recvAnysource( std::vector& tensors, int tag) override; @@ -455,7 +466,9 @@ class ProcessGroupNCCL : public ProcessGroup { // a new set of NCCL communicators as a cache entry std::vector>& getNCCLComm( const std::string& devicesKey, - const std::vector& devices); + const std::vector& devices, + NCCLCommType commType = NCCLCommType::COLL, + int p2pRank = 0); // Wrapper method which can be overridden for tests. virtual std::exception_ptr checkForNCCLErrors( @@ -484,6 +497,24 @@ class ProcessGroupNCCL : public ProcessGroup { PreProcess pre, PostProcess post); + // Helper that encapsulates work shared across point-to-point communication + // primitives. It is the same structure as the helper used for collective + // communicaiton primitives. + template + std::shared_ptr pointToPoint( + std::vector& tensor, + Fn fn, + int peer, + NCCLCommType commType); + template + std::shared_ptr pointToPoint( + std::vector& tensor, + Fn fn, + int peer, + NCCLCommType commType, + PreProcess pre, + PostProcess post); + // Checks for NCCL errors on each of the communicators and returns an // appropriate exception_ptr (nullptr if no errors). static std::exception_ptr checkForNCCLErrorsInternal( @@ -525,6 +556,8 @@ class ProcessGroupNCCL : public ProcessGroup { uint64_t ncclCommCounter_{0}; // The NCCL communicator that the process group has cached. + // + // For collective operations: // The key is a list of GPU devices that an operation is operating on // The GPU devices are stored in a device sequence and the cache NCCL // communicator is associated with this GPU device sequence @@ -543,6 +576,13 @@ class ProcessGroupNCCL : public ProcessGroup { // "0,4,5,6,7,1,2,3" // // Note that the order of the device for the tensor list matters. 
+  //
+  // For point-to-point operations:
+  // The key is a string of my current rank and the peer process rank.
+  // e.g., if process 1 and process 2 are involved in a point-to-point communication,
+  // the key will be "1:2" on both processes.
+  // Note: this is for the scenario where there is only 1 GPU per process.
+  // When it comes to multiple GPUs per process, this part may need to be redesigned.
   std::unordered_map<std::string, std::vector<std::shared_ptr<NCCLComm>>> devNCCLCommMap_;
@@ -639,6 +679,11 @@ class ProcessGroupNCCL : public ProcessGroup {

   // Schedule NCCL operations on high priority CUDA streams.
   bool isHighPriorityStream_ = false;
+
+  // The number of active ncclGroupStart() calls. This counter will be increased
+  // by 1 when ncclGroupStart() is called and decreased by 1 when ncclGroupEnd()
+  // is called.
+  static thread_local uint64_t ncclActiveGroupCounter_;
 };

 } // namespace c10d
diff --git a/torch/nn/modules/module.py b/torch/nn/modules/module.py
index 30e732e6d859..2facc5e0c6eb 100644
--- a/torch/nn/modules/module.py
+++ b/torch/nn/modules/module.py
@@ -349,7 +349,7 @@ def add_module(self, name: str, module: Optional['Module']) -> None:
         elif hasattr(self, name) and name not in self._modules:
             raise KeyError("attribute '{}' already exists".format(name))
         elif '.' in name:
-            raise KeyError("module name can't contain \".\"")
+            raise KeyError("module name can't contain \".\", got: {}".format(name))
         elif name == '':
             raise KeyError("module name can't be empty string \"\"")
         self._modules[name] = module
diff --git a/torch/nn/modules/sparse.py b/torch/nn/modules/sparse.py
index b6997ca7701a..f063ffa2e8eb 100644
--- a/torch/nn/modules/sparse.py
+++ b/torch/nn/modules/sparse.py
@@ -186,11 +186,11 @@ class EmbeddingBag(Module):
     r"""Computes sums or means of 'bags' of embeddings, without instantiating the
     intermediate embeddings.

-    For bags of constant length and no :attr:`per_sample_weights`, this class
+    For bags of constant length, no :attr:`per_sample_weights`, and 2D inputs, this class

-        * with ``mode="sum"`` is equivalent to :class:`~torch.nn.Embedding` followed by ``torch.sum(dim=0)``,
-        * with ``mode="mean"`` is equivalent to :class:`~torch.nn.Embedding` followed by ``torch.mean(dim=0)``,
-        * with ``mode="max"`` is equivalent to :class:`~torch.nn.Embedding` followed by ``torch.max(dim=0)``.
+        * with ``mode="sum"`` is equivalent to :class:`~torch.nn.Embedding` followed by ``torch.sum(dim=1)``,
+        * with ``mode="mean"`` is equivalent to :class:`~torch.nn.Embedding` followed by ``torch.mean(dim=1)``,
+        * with ``mode="max"`` is equivalent to :class:`~torch.nn.Embedding` followed by ``torch.max(dim=1)``.

     However, :class:`~torch.nn.EmbeddingBag` is much more time and memory efficient than using a chain of these
     operations.
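As an illustrative aside (not part of the patch), the corrected equivalence in the EmbeddingBag docstring above can be checked with a small script; the sizes and indices here are arbitrary:

    import torch
    import torch.nn as nn

    # Arbitrary example sizes; 2D input of shape (batch, bag_size).
    embedding = nn.Embedding(10, 3)
    bag = nn.EmbeddingBag(10, 3, mode='sum')
    bag.weight = embedding.weight  # share weights so the outputs are comparable

    indices = torch.tensor([[1, 2, 4, 5], [4, 3, 2, 9]])
    # EmbeddingBag with mode="sum" on 2D input matches Embedding followed by sum over dim=1.
    assert torch.allclose(embedding(indices).sum(dim=1), bag(indices))
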
diff --git a/torch/quantization/fx/fuse.py b/torch/quantization/fx/fuse.py index 0c7e1f90f47a..852de812e39d 100644 --- a/torch/quantization/fx/fuse.py +++ b/torch/quantization/fx/fuse.py @@ -1,11 +1,9 @@ from torch.fx import ( - GraphModule + GraphModule, + map_arg ) -from torch.fx.graph import ( - Graph, - map_arg, -) +from torch.fx.graph import Graph from .pattern_utils import ( is_match, diff --git a/torch/quantization/fx/quantize.py b/torch/quantization/fx/quantize.py index 77a3a47e9dc2..74dee6ea3cf3 100644 --- a/torch/quantization/fx/quantize.py +++ b/torch/quantization/fx/quantize.py @@ -3,12 +3,12 @@ GraphModule, Proxy, symbolic_trace, + map_arg ) from torch.fx.graph import ( Graph, Node, - map_arg, ) from torch.quantization import ( diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_internal/common_distributed.py index b2cd30c66812..4f36b31a23d0 100644 --- a/torch/testing/_internal/common_distributed.py +++ b/torch/testing/_internal/common_distributed.py @@ -130,6 +130,17 @@ def requires_mpi(): "c10d was not compiled with the MPI backend", ) +def skip_if_rocm_single_process(func): + """Skips a test for ROCm in a single process environment""" + func.skip_if_rocm = True + + @wraps(func) + def wrapper(*args, **kwargs): + if not TEST_WITH_ROCM: + return func(*args, **kwargs) + raise unittest.SkipTest("Test skipped for ROCm") + + return wrapper def skip_if_rocm(func): """Skips a test for ROCm""" diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 46ba17f61d8f..f26e6c75d37e 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -373,6 +373,9 @@ def sample_inputs(self, device, dtype, requires_grad=False): SkipInfo('TestUnaryUfuncs', 'test_reference_numerics', device_type='cpu', dtypes=[torch.cfloat, torch.cdouble], active_if=(IS_MACOS or IS_WINDOWS)), + SkipInfo('TestUnaryUfuncs', 'test_reference_numerics', + device_type='cuda', dtypes=[torch.float64], + active_if=TEST_WITH_ROCM), )), UnaryUfuncInfo('tanh', ref=np.tanh, diff --git a/torch/testing/_internal/common_quantization.py b/torch/testing/_internal/common_quantization.py index ccbda8232952..468fd9cfdc81 100644 --- a/torch/testing/_internal/common_quantization.py +++ b/torch/testing/_internal/common_quantization.py @@ -653,12 +653,6 @@ def checkGraphModeFxOp(self, model, inputs, quant_type, result = qgraph(*inputs) result_debug = qgraph_debug(*inputs) - # numeric match for debug option for dynamic - # quantized op is not needed right now - if quant_type != QuantType.DYNAMIC: - self.assertEqual((result - result_debug).abs().max(), 0), \ - 'Expecting debug and non-debug option to produce identical result' - qgraph_to_check = qgraph_debug if debug else qgraph if print_debug_info: print() diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py index 44caea6687f0..9c9d27bf195b 100644 --- a/torch/testing/_internal/common_utils.py +++ b/torch/testing/_internal/common_utils.py @@ -147,7 +147,7 @@ def _get_test_report_path(): help='whether to run each test in a subprocess') parser.add_argument('--seed', type=int, default=1234) parser.add_argument('--accept', action='store_true') -parser.add_argument('--ge_config', type=str) +parser.add_argument('--jit_executor', type=str) parser.add_argument('--repeat', type=int, default=1) parser.add_argument('--test_bailouts', action='store_true') parser.add_argument('--save-xml', nargs='?', 
type=str, @@ -158,11 +158,11 @@ def _get_test_report_path(): parser.add_argument('--run-parallel', type=int, default=1) args, remaining = parser.parse_known_args() -if args.ge_config == 'legacy': +if args.jit_executor == 'legacy': GRAPH_EXECUTOR = ProfilingMode.LEGACY -elif args.ge_config == 'profiling': +elif args.jit_executor == 'profiling': GRAPH_EXECUTOR = ProfilingMode.PROFILING -elif args.ge_config == 'simple': +elif args.jit_executor == 'simple': GRAPH_EXECUTOR = ProfilingMode.SIMPLE else: # infer flags based on the default settings diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py index 01cddee92365..235e88f3c823 100644 --- a/torch/testing/_internal/distributed/distributed_test.py +++ b/torch/testing/_internal/distributed/distributed_test.py @@ -34,6 +34,7 @@ skip_if_lt_x_gpu, skip_if_no_gpu, require_n_gpus_for_nccl_backend, + requires_nccl_version, ) from torch._utils_internal import TEST_MASTER_ADDR as MASTER_ADDR from torch._utils_internal import TEST_MASTER_PORT as MASTER_PORT @@ -210,10 +211,13 @@ def _lock(): lf.close() -def _build_tensor(size, value=None, dtype=torch.float): +def _build_tensor(size, value=None, dtype=torch.float, device_id=None): if value is None: value = size - return torch.empty(size, size, size, dtype=dtype).fill_(value) + if device_id is None: + return torch.empty(size, size, size, dtype=dtype).fill_(value) + else: + return torch.empty(size, size, size, dtype=dtype).fill_(value).cuda(device_id) def _build_multidim_tensor(dim, dim_size, value=None): @@ -585,6 +589,182 @@ def test_backend_group(self): def test_backend_full_group(self): self._test_group_override_backend(self._init_full_group_test) + # NCCL Batch SEND RECV + @skip_if_no_gpu + @unittest.skip("NCCL P2P is not enabled for OSS builds") + @unittest.skipIf(BACKEND != "nccl", "NCCL Batch Send Recv Only") + @requires_nccl_version(2700, "Need NCCL 2.7+ for send/recv") + def test_batch_isend_irecv_nccl(self): + self._barrier() + rank = dist.get_rank() + rank_to_GPU = self._init_multigpu_helper() + device_id = rank_to_GPU[rank][0] + p2p_op_list = [] + + for val in ["1", "0"]: + os.environ["NCCL_BLOCKING_WAIT"] = val + for src in range(0, dist.get_world_size()): + send_tensor = _build_tensor(rank + 1, device_id=device_id) + recv_tensor = _build_tensor(src + 1, value=-1, device_id=device_id) + recv_op = dist._P2POp(dist.irecv, recv_tensor, src) + p2p_op_list.append(recv_op) + send_op = dist._P2POp(dist.isend, send_tensor, src) + p2p_op_list.append(send_op) + + reqs = dist._batch_isend_irecv(p2p_op_list) + for req in reqs: + req.wait() + + self._barrier() + + # GLOO Batch SEND RECV CPU + @unittest.skipIf(BACKEND != "gloo", "GLOO Batch Send Recv CPU") + def test_batch_isend_irecv_gloo(self): + self._barrier() + rank = dist.get_rank() + p2p_op_list = [] + + for src in range(0, dist.get_world_size()): + if src == rank: + continue + send_tensor = _build_tensor(rank + 1) + recv_tensor = _build_tensor(src + 1, value=-1) + recv_op = dist._P2POp(dist.irecv, recv_tensor, src) + p2p_op_list.append(recv_op) + send_op = dist._P2POp(dist.isend, send_tensor, src) + p2p_op_list.append(send_op) + + reqs = dist._batch_isend_irecv(p2p_op_list) + for req in reqs: + req.wait() + + self._barrier() + + # GLOO Batch SEND RECV CPU with provided tags + @unittest.skipIf(BACKEND != "gloo", "GLOO Batch Send Recv CPU") + def test_batch_isend_irecv_gloo_tags(self): + self._barrier() + rank = dist.get_rank() + p2p_op_list = [] + + for src in range(0, 
dist.get_world_size()): + if src == rank: + continue + send_tensor = _build_tensor(rank + 1) + recv_tensor = _build_tensor(src + 1, value=-1) + recv_op = dist._P2POp(dist.irecv, recv_tensor, src, tag=src) + p2p_op_list.append(recv_op) + send_op = dist._P2POp(dist.isend, send_tensor, src, tag=rank) + p2p_op_list.append(send_op) + + reqs = dist._batch_isend_irecv(p2p_op_list) + for req in reqs: + req.wait() + + self._barrier() + + # NCCL Batch SEND RECV Tensor Error + @unittest.skip("NCCL P2P is not enabled for OSS builds") + @unittest.skipIf(BACKEND != "nccl", "NCCL Batch Send Recv Only") + @requires_nccl_version(2700, "Need NCCL 2.7+ for send/recv") + def test_batch_isend_irecv_tensor_err(self): + self._barrier() + rank = dist.get_rank() + if rank == 0: + rank_to_GPU = self._init_multigpu_helper() + device_id = rank_to_GPU[rank][0] + with self.assertRaisesRegex( + RuntimeError, "Tensors must be CUDA and dense" + ): + send_tensor = _build_tensor(rank + 1) + send_op = dist._P2POp(dist.isend, send_tensor, 1) + req = dist._batch_isend_irecv([send_op]) + req.wait() + + # NCCL Batch SEND RECV Op Error + @unittest.skip("NCCL P2P is not enabled for OSS builds") + @unittest.skipIf(BACKEND != "nccl", "NCCL Batch Send Recv Only") + @requires_nccl_version(2700, "Need NCCL 2.7+ for send/recv") + def test_batch_isend_irecv_op_err(self): + self._barrier() + rank = dist.get_rank() + if rank == 0: + rank_to_GPU = self._init_multigpu_helper() + device_id = rank_to_GPU[rank][0] + with self.assertRaisesRegex( + RuntimeError, "^Invalid ``op``" + ): + send_tensor = _build_tensor(rank + 1, device_id=device_id) + send_op = dist._P2POp(dist.broadcast, send_tensor, 1) + req = dist._batch_isend_irecv([send_op]) + req.wait() + + # NCCL Batch SEND RECV p2p_op_list Error + @unittest.skip("NCCL P2P is not enabled for OSS builds") + @unittest.skipIf(BACKEND != "nccl", "NCCL Batch Send Recv Only") + @requires_nccl_version(2700, "Need NCCL 2.7+ for send/recv") + def test_batch_isend_irecv_op_list_err(self): + self._barrier() + rank = dist.get_rank() + if rank == 0: + rank_to_GPU = self._init_multigpu_helper() + device_id = rank_to_GPU[rank][0] + with self.assertRaisesRegex( + RuntimeError, "^Invalid ``p2p_op_list``" + ): + send_tensor = _build_tensor(rank + 1) + req = dist._batch_isend_irecv([1, 2]) + req.wait() + + # NCCL Batch SEND RECV Mixed Backend Error + @unittest.skip("NCCL P2P is not enabled for OSS builds") + @unittest.skipIf(BACKEND != "nccl", "NCCL Batch Send Recv Only") + @requires_nccl_version(2700, "Need NCCL 2.7+ for send/recv") + def test_batch_isend_irecv_mixed_backend_err(self): + self._barrier() + rank = dist.get_rank() + rank_to_GPU = self._init_multigpu_helper() + device_id = rank_to_GPU[rank][0] + group_gloo = dist.new_group(ranks=[0, 1], backend="gloo") + group_nccl = dist.new_group(ranks=[0, 1], backend="nccl") + if rank == 0: + with self.assertRaisesRegex( + RuntimeError, "All groups need to use the same backend" + ): + send_tensor = _build_tensor(rank + 1) + send_op_gloo = dist._P2POp(dist.isend, send_tensor, 1, group_gloo) + send_op_nccl = dist._P2POp(dist.isend, send_tensor, 1, group_nccl) + req = dist._batch_isend_irecv([send_op_gloo, send_op_nccl]) + req.wait() + + # NCCL SEND RECV + @unittest.skip("NCCL P2P is not enabled for OSS builds") + @skip_if_no_gpu + @unittest.skipIf(BACKEND != "nccl", "NCCL Send Recv Only") + @requires_nccl_version(2700, "Need NCCL 2.7+ for send/recv") + def test_send_recv_nccl(self): + rank = dist.get_rank() + rank_to_GPU = self._init_multigpu_helper() + device_id = 
rank_to_GPU[rank][0] + + tensor = _build_tensor(rank + 1, device_id=device_id) + + for src in range(0, dist.get_world_size()): + if src == rank: + # Send mode + for dst in range(0, dist.get_world_size()): + if dst == rank: + continue + dist.send(tensor, dst) + else: + # Recv mode + expected_tensor = _build_tensor(src + 1) + output_tensor = _build_tensor(src + 1, value=-1, device_id=device_id) + dist.recv(output_tensor, src) + self.assertEqual(output_tensor, expected_tensor) + + self._barrier() + # SEND RECV @unittest.skipIf(BACKEND == "nccl", "Nccl does not support send/recv") def test_send_recv(self):