diff --git a/.circleci/scripts/binary_linux_test.sh b/.circleci/scripts/binary_linux_test.sh
index 0b2e60b48f8e..26cc77c8ff9c 100755
--- a/.circleci/scripts/binary_linux_test.sh
+++ b/.circleci/scripts/binary_linux_test.sh
@@ -51,7 +51,14 @@ if [[ "$PACKAGE_TYPE" == conda ]]; then
     else
       cu_ver="${DESIRED_CUDA:2:2}.${DESIRED_CUDA:4}"
     fi
-    retry conda install \${EXTRA_CONDA_FLAGS} -yq -c nvidia -c pytorch "cudatoolkit=\${cu_ver}"
+    (
+      # For some reason conda likes to re-activate the conda environment when attempting this install
+      # which means that a deactivate is run and some variables might not exist when that happens,
+      # namely CONDA_MKL_INTERFACE_LAYER_BACKUP from libblas so let's just ignore unbound variables when
+      # it comes to the conda installation commands
+      set +u
+      retry conda install \${EXTRA_CONDA_FLAGS} -yq -c nvidia -c pytorch "cudatoolkit=\${cu_ver}"
+    )
   fi
 elif [[ "$PACKAGE_TYPE" != libtorch ]]; then
   pip install "\$pkg"
diff --git a/.github/workflows/update_s3_htmls.yml b/.github/workflows/update_s3_htmls.yml
index 92f9a66a0fd8..f2320ce2fcbf 100644
--- a/.github/workflows/update_s3_htmls.yml
+++ b/.github/workflows/update_s3_htmls.yml
@@ -9,6 +9,7 @@ on:
 jobs:
   update-html:
     runs-on: ubuntu-latest
+    if: ${{ github.repository_owner == 'pytorch' }}
     strategy:
       matrix:
         prefix: ["whl", "whl/test", "whl/nightly"]
diff --git a/.jenkins/pytorch/README.md b/.jenkins/pytorch/README.md
index ea6c6dd40f68..9fd68ecf7f15 100644
--- a/.jenkins/pytorch/README.md
+++ b/.jenkins/pytorch/README.md
@@ -10,9 +10,9 @@ it is very easy to run these tests yourself:
    ``registry.pytorch.org/pytorch/pytorch-$BUILD_ENVIRONMENT:$DOCKER_VERSION``,
    where ``$BUILD_ENVIRONMENT`` is one of the build environments enumerated in
-   [pytorch-dockerfiles](https://github.com/pietern/pytorch-dockerfiles/blob/master/build.sh)
+   [pytorch-dockerfiles](https://github.com/pytorch/pytorch/blob/master/.circleci/docker/build.sh). The dockerfile used by jenkins can be found under the `.circle` [directory](https://github.com/pytorch/pytorch/blob/master/.circleci/docker)
-2. Run ``docker -it -u jenkins $DOCKER_IMAGE``, clone PyTorch and
+2. Run ``docker run -it -u jenkins $DOCKER_IMAGE``, clone PyTorch and
    run one of the scripts in this directory.
 The Docker images are designed so that any "reasonable" build commands
@@ -38,5 +38,5 @@ mechanisms we use:
   build scripts.
 - We reroute well known paths like `/usr/bin/gcc` to alternate
-  implementations with `update-alternatives, instead of setting
+  implementations with `update-alternatives`, instead of setting
   `CC` and `CXX` in our implementations.
diff --git a/CMakeLists.txt b/CMakeLists.txt index ba862b5a4d5f..e346087c0cdb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -316,7 +316,7 @@ set(OP_DEPENDENCY "" CACHE STRING # symbol lookup error: miniconda3/envs/pytorch-py3.7/lib/libmkl_intel_lp64.so: undefined symbol: mkl_blas_dsyrk # https://software.intel.com/en-us/articles/symbol-lookup-error-when-linking-intel-mkl-with-gcc-on-ubuntu if(LINUX) - set(CMAKE_SHARED_LINKER_FLAGS "-Wl,--no-as-needed") + set(CMAKE_SHARED_LINKER_FLAGS "-Wl,--no-as-needed ${CMAKE_SHARED_LINKER_FLAGS}") endif() if(MSVC) diff --git a/aten/src/ATen/ATen.h b/aten/src/ATen/ATen.h index ae95ef43f21c..8d29a9204420 100644 --- a/aten/src/ATen/ATen.h +++ b/aten/src/ATen/ATen.h @@ -31,3 +31,4 @@ #include #include #include +#include diff --git a/aten/src/ATen/BatchingRegistrations.cpp b/aten/src/ATen/BatchingRegistrations.cpp index 419c454257d8..9bdec2dce77e 100644 --- a/aten/src/ATen/BatchingRegistrations.cpp +++ b/aten/src/ATen/BatchingRegistrations.cpp @@ -287,6 +287,25 @@ Tensor squeeze_dim_batching_rule(const Tensor& self, int64_t dim) { return self_physical.getPhysicalToLogicalMap().apply(result); } +Tensor trace_batching_rule(const Tensor& self) { + auto self_physical = MultiBatchVmapTransform::logicalToPhysical(self); + // Batched Diagonal View + auto self_diag = at::diagonal(self_physical.tensor(), /*offset*/0, /*dim1*/-2, /*dim2*/-1); + auto result = at::sum(self_diag, -1); + return self_physical.getPhysicalToLogicalMap().apply(result); +} + +Tensor trace_backward_batching_rule(const Tensor& grad, IntArrayRef input_sizes) { + auto grad_physical = MultiBatchVmapTransform::logicalToPhysical(grad); + auto grad_input = at::zeros(grad_physical.getPhysicalShape(input_sizes), grad.options()); + // Batched Diagonal View + auto grad_input_diag = at::diagonal(grad_input, /*offset*/0, /*dim1*/-2, /*dim2*/-1); + // Append a dimension of size one to the grad output + auto grad_physical_tensor = grad_physical.tensor().unsqueeze(-1); + grad_input_diag.copy_(grad_physical_tensor); + return grad_physical.getPhysicalToLogicalMap().apply(grad_input); +} + Tensor transpose_int_batching_rule(const Tensor& self, int64_t dim0, int64_t dim1) { // PyTorch has a special case where scalar_tensor.transpose(dim0, dim1) works // for dim0, dim1 in {0, -1} and returns the scalar tensor. If the following happens: @@ -1029,6 +1048,7 @@ TORCH_LIBRARY_IMPL(aten, Batched, m) { m.impl("squeeze", squeeze_batching_rule); m.impl("squeeze.dim", squeeze_dim_batching_rule); m.impl("t", native::t); // composite wrt autograd + m.impl("trace", trace_batching_rule); m.impl("transpose.int", transpose_int_batching_rule); m.impl("unbind.int", unbind_batching_rule); m.impl("unfold", unfold_batching_rule); @@ -1089,6 +1109,7 @@ TORCH_LIBRARY_IMPL(aten, Batched, m) { #undef TO_BATCHING_RULE m.impl("clone", clone_batching_rule); + using TensorTensorScalarType = Tensor (*)(const Tensor&, const Tensor&, Scalar); using TensorTensorType = Tensor (*)(const Tensor&, const Tensor&); using TensorScalarType = Tensor (*)(const Tensor&, Scalar); @@ -1115,6 +1136,12 @@ TORCH_LIBRARY_IMPL(aten, Batched, m) { m.impl("pow.Scalar", pow_scalar_Tensor_batching_rule); m.impl("sigmoid_backward", binary_pointwise_batching_rule); + m.impl( + "threshold_backward", + binary_pointwise_batching_rule< + TensorTensorScalarType, + at::threshold_backward, + Scalar>); // for at::result_type, call the native::result_type implementation. 
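The new trace batching rule above reduces a batched trace to "take the diagonal over the last two dims, then sum". A minimal standalone sketch of that identity, illustrative only and not part of the patch (`batched_trace_reference` is a made-up name):

```cpp
#include <ATen/ATen.h>

// For a [..., N, N] batch of matrices, the per-matrix trace equals the sum of
// the per-matrix diagonal, which is what trace_batching_rule computes on the
// physical (batched) tensor.
at::Tensor batched_trace_reference(const at::Tensor& batched) {
  auto diag = at::diagonal(batched, /*offset=*/0, /*dim1=*/-2, /*dim2=*/-1);  // shape [..., N]
  return at::sum(diag, -1);                                                   // shape [...]
}
```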
// We don't have to do anything special because native::result_type operates @@ -1150,6 +1177,7 @@ TORCH_LIBRARY_IMPL(aten, Batched, m) { // backward operators m.impl("select_backward", select_backward_batching_rule); m.impl("slice_backward", slice_backward_batching_rule); + m.impl("trace_backward", trace_backward_batching_rule); m.impl("diagonal_backward", diagonal_backward_batching_rule); // Tensor.new_* operators diff --git a/aten/src/ATen/Dispatch.h b/aten/src/ATen/Dispatch.h index 41252609953f..2e663b4f48dd 100644 --- a/aten/src/ATen/Dispatch.h +++ b/aten/src/ATen/Dispatch.h @@ -93,26 +93,6 @@ inline constexpr bool should_include_kernel_dtype( return __VA_ARGS__(); \ } -// This macro should be used to skip bfloat16 dispatch on non-ROCm platforms and -// should be removed once the bfloat16 bringup is complete on other platforms. -// This is supposed to be used as a wrapper around the lambda function passed to -// the dispatch macro and will conditionally dispatch ops with bfloat16 type -// only on ROCm. -#if !defined(__HIP_PLATFORM_HCC__) -#define AT_SKIP_BFLOAT16_IF_NOT_ROCM(SCALARTYPE, NAME, ...) \ - if (std::is_same::value) { \ - AT_ERROR( \ - #NAME, \ - " not implemented for '", \ - toString(at::ScalarType::BFloat16), \ - "'"); \ - } else { \ - return __VA_ARGS__(); \ - } -#else -#define AT_SKIP_BFLOAT16_IF_NOT_ROCM(SCALARTYPE, NAME, ...) return __VA_ARGS__() -#endif - namespace detail { inline at::ScalarType scalar_type(at::ScalarType s) { diff --git a/aten/src/ATen/ParallelOpenMP.cpp b/aten/src/ATen/ParallelOpenMP.cpp index 07fc4e279557..261f6cdd46b5 100644 --- a/aten/src/ATen/ParallelOpenMP.cpp +++ b/aten/src/ATen/ParallelOpenMP.cpp @@ -1,4 +1,5 @@ #include +#include #if AT_PARALLEL_OPENMP #include diff --git a/aten/src/ATen/TensorIndexing.h b/aten/src/ATen/TensorIndexing.h index 3890662123a2..f6c3bbbe09cc 100644 --- a/aten/src/ATen/TensorIndexing.h +++ b/aten/src/ATen/TensorIndexing.h @@ -10,6 +10,8 @@ // There is some back story, see https://github.com/pytorch/pytorch/issues/48684 #include +#include + namespace at { namespace indexing { @@ -261,14 +263,15 @@ static inline void recordTensorIndex(const Tensor& tensor, std::vector& (*dim_ptr)++; }; -static inline std::vector typeConvertIndices(const Tensor& self, std::vector&& indices) { - std::vector converted_inds(indices.size()); +static inline c10::List> typeConvertIndices(const Tensor& self, std::vector&& indices) { + c10::List> converted_inds; + converted_inds.reserve(indices.size()); for (size_t i = 0; i < indices.size(); ++i) { const auto &ind = indices[i]; if (ind.defined()) { - converted_inds[i] = ind.to(ind.options().device(self.device())); + converted_inds.push_back(ind.to(ind.options().device(self.device()))); } else { - converted_inds[i] = std::move(indices[i]); + converted_inds.push_back(std::move(indices[i])); } } return converted_inds; diff --git a/aten/src/ATen/autocast_mode.cpp b/aten/src/ATen/autocast_mode.cpp index 8c82f965ef0f..dfb8e3ac0f32 100644 --- a/aten/src/ATen/autocast_mode.cpp +++ b/aten/src/ATen/autocast_mode.cpp @@ -406,7 +406,7 @@ TORCH_LIBRARY_IMPL(aten, Autocast, m) { KERNEL(ADD_NS(cross), "cross", Tensor (const Tensor &, const Tensor &, c10::optional), promote) KERNEL(ADD_NS(dot), "dot", Tensor (const Tensor &, const Tensor &), promote) KERNEL(ADD_NS(equal), "equal", bool (const Tensor &, const Tensor &), promote) - KERNEL_UNBOXED_ONLY(ADD_NS(index_put), "index_put", Tensor (const Tensor &, TensorList, const Tensor &, bool), promote) + KERNEL(ADD_NS(index_put), "index_put", Tensor (const Tensor 
&, const torch::List>&, const Tensor &, bool), promote) KERNEL(ADD_NS(stack), "stack", Tensor (TensorList, int64_t), promote) KERNEL(ADD_NS(tensordot), "tensordot", Tensor (const Tensor &, const Tensor &, IntArrayRef, IntArrayRef), promote) diff --git a/aten/src/ATen/core/List.h b/aten/src/ATen/core/List.h index 40f733784fe5..f911722c51e1 100644 --- a/aten/src/ATen/core/List.h +++ b/aten/src/ATen/core/List.h @@ -243,7 +243,7 @@ class List final { * Example: * List a({2, 3, 4}); */ - explicit List(std::initializer_list initial_values); + List(std::initializer_list initial_values); explicit List(ArrayRef initial_values); /** diff --git a/aten/src/ATen/core/List_inl.h b/aten/src/ATen/core/List_inl.h index 3cbd7a310275..ab3ddae55770 100644 --- a/aten/src/ATen/core/List_inl.h +++ b/aten/src/ATen/core/List_inl.h @@ -1,7 +1,7 @@ #pragma once +#include #include -#include namespace c10 { @@ -50,7 +50,17 @@ List::List(TypePtr elementType) namespace impl { template List toTypedList(impl::GenericList list) { - TORCH_INTERNAL_ASSERT(*getTypePtr() == *list.impl_->elementType, "Tried to cast a List<", toString(list.impl_->elementType), "> to a List<", toString(getTypePtr()), ">. Types mismatch."); + // If there's other instances of the list (i.e. list.use_count() > 1), then we have to be invariant + // because upcasting would allow people to add types into the new list that would break the old list. + // However, if there aren't any other instances of this list (i.e. list.use_count() == 1), then we can + // allow upcasting. This can be a perf improvement since we can cast List to List> + // without having to copy it. This is also used to provide backwards compatibility with some old models + // that serialized the index arguments to aten::index, aten::index_put, aten::index_put_ and aten::index_put_impl_ + // as List before we changed that argument to be List>. When deserializing, we + // have list.use_count() == 1 and can deserialize the List directly as List>. + TORCH_CHECK(*list.impl_->elementType == *getTypePtr() + || (list.use_count() == 1 && list.impl_->elementType->isSubtypeOf(getTypePtr())) + , "Tried to cast a List<", toString(list.impl_->elementType), "> to a List<", toString(getTypePtr()), ">. Types mismatch."); return List(std::move(list.impl_)); } @@ -312,3 +322,5 @@ void List::unsafeSetElementType(TypePtr t) { impl_->elementType = std::move(t); } } + +#include diff --git a/aten/src/ATen/core/Variadic.h b/aten/src/ATen/core/Variadic.h index b49d94bba1c8..d33f3d575177 100644 --- a/aten/src/ATen/core/Variadic.h +++ b/aten/src/ATen/core/Variadic.h @@ -6,6 +6,7 @@ #include #include +#include namespace at { @@ -56,6 +57,15 @@ struct IterArgs { } } + template + void operator()(const torch::List& args) { + for (const auto& arg : args) { + self()(arg); + if (self().short_circuit()) + return; + } + } + // NB: we need to specify std::vector manually as C++ won't // do an implicit conversion to make a template deduction go through. 
template diff --git a/aten/src/ATen/core/builtin_function.h b/aten/src/ATen/core/builtin_function.h index 8bfb4f7e9d16..adeaa1039638 100644 --- a/aten/src/ATen/core/builtin_function.h +++ b/aten/src/ATen/core/builtin_function.h @@ -101,8 +101,17 @@ struct BuiltinOpFunction : public Function { } std::string pretty_print_schema() const override { + #ifdef __NVCC__ + // Disable the "statement is unreachable" warning + #pragma diag_suppress code_is_unreachable + #endif + TORCH_INTERNAL_ASSERT(false); return ""; + + #ifdef __NVCC__ + #pragma diag_default code_is_unreachable + #endif } Function& setSchema(c10::FunctionSchema schema) override { diff --git a/aten/src/ATen/core/interned_strings.h b/aten/src/ATen/core/interned_strings.h index 15a71d99a91f..a65a48d601dc 100644 --- a/aten/src/ATen/core/interned_strings.h +++ b/aten/src/ATen/core/interned_strings.h @@ -17,6 +17,7 @@ namespace c10 { #define FORALL_NS_SYMBOLS(_) \ _(namespaces, prim) \ _(namespaces, aten) \ + _(namespaces, cuda) \ _(namespaces, onnx) \ _(namespaces, attr) \ _(namespaces, scope) \ @@ -284,6 +285,9 @@ namespace c10 { _(aten, zero_) \ _(aten, fill_) \ _(aten, masked_fill_) \ + _(cuda, _set_device) \ + _(cuda, set_stream) \ + _(cuda, _current_device) \ _(aten, swapaxes) \ _(aten, swapaxes_) \ _(aten, swapdims) \ @@ -384,6 +388,7 @@ namespace c10 { #define FORALL_NS_SYMBOLS(_) \ _(namespaces, prim) \ _(namespaces, aten) \ + _(namespaces, cuda) \ _(namespaces, onnx) \ _(namespaces, attr) \ _(namespaces, scope) \ @@ -454,6 +459,7 @@ struct TORCH_API Symbol { // (and if it's not, you should add it to the built-ins list above.) static Symbol attr(const std::string & s); static Symbol aten(const std::string & s); + static Symbol cuda(const std::string & s); static Symbol onnx(const std::string & s); static Symbol prim(const std::string & s); static Symbol user(const std::string & s); @@ -464,6 +470,7 @@ struct TORCH_API Symbol { bool is_attr() const; bool is_aten() const; + bool is_cuda() const; bool is_prim() const; bool is_onnx() const; bool is_user() const; @@ -524,6 +531,7 @@ FORALL_NS_SYMBOLS(DEFINE_SYMBOL) inline Symbol Symbol::attr(const std::string & s) { return Symbol::fromQualString("attr::" + s); } inline Symbol Symbol::aten(const std::string & s) { return Symbol::fromQualString("aten::" + s); } +inline Symbol Symbol::cuda(const std::string & s) { return Symbol::fromQualString("cuda::" + s); } inline Symbol Symbol::onnx(const std::string & s) { return Symbol::fromQualString("onnx::" + s); } inline Symbol Symbol::prim(const std::string & s) { return Symbol::fromQualString("prim::" + s); } inline Symbol Symbol::scope(const std::string & s) { return Symbol::fromQualString("scope::" + s); } @@ -532,6 +540,7 @@ inline Symbol Symbol::caffe2(const std::string & s) { return Symbol::fromQualStr inline Symbol Symbol::dimname(const std::string & s) { return Symbol::fromQualString("dimname::" + s); } inline bool Symbol::is_attr() const { return ns() == namespaces::attr; } inline bool Symbol::is_aten() const { return ns() == namespaces::aten; } +inline bool Symbol::is_cuda() const { return ns() == namespaces::cuda; } inline bool Symbol::is_prim() const { return ns() == namespaces::prim; } inline bool Symbol::is_onnx() const { return ns() == namespaces::onnx; } inline bool Symbol::is_user() const { return ns() == namespaces::user; } diff --git a/aten/src/ATen/core/jit_type.h b/aten/src/ATen/core/jit_type.h index f6902cd4beb6..a3ae813616e0 100644 --- a/aten/src/ATen/core/jit_type.h +++ b/aten/src/ATen/core/jit_type.h @@ -1,10 +1,11 @@ 
#pragma once +#include #include #include #include -#include #include +#include #include #include @@ -17,197 +18,17 @@ struct ClassType; namespace torch { namespace jit { struct CompilationUnit; +struct Function; } // namespace jit } // namespace torch namespace c10 { +struct IValue; struct FunctionSchema; struct NamedType; using OptNameList = c10::optional>; -#define C10_FORALL_TYPES(_) \ - _(AnyType) \ - _(EnumType) \ - _(AnyEnumType) \ - _(TensorType) \ - _(StorageType) \ - _(TupleType) \ - _(ListType) \ - _(DictType) \ - _(NumberType) \ - _(FloatType) \ - _(FutureType) \ - _(RRefType) \ - _(IntType) \ - _(NoneType) \ - _(StringType) \ - _(GeneratorType) \ - _(QuantizerType) \ - _(BoolType) \ - _(OptionalType) \ - _(VarType) \ - _(DeviceObjType) \ - _(StreamObjType) \ - _(FunctionType) \ - _(ClassType) \ - _(PyObjectType) \ - _(CapsuleType) \ - _(InterfaceType) \ - _(QSchemeType) \ - _(LayoutType) \ - _(ScalarTypeType) \ - _(AnyListType) \ - _(AnyTupleType) \ - _(AnyClassType) - -enum class TypeKind { -#define DEFINE_TYPE(T) T, - C10_FORALL_TYPES(DEFINE_TYPE) -#undef DEFINE_TYPE -}; - -TORCH_API const char* typeKindToString(TypeKind kind); - -struct Type; -using TypePtr = std::shared_ptr; -using ConstTypePtr = std::shared_ptr; - -// Use this to customize how a Type is printed using `annotation_str()`. If -// c10::nullopt is returned, `annotation_str()` falls through to its default -// implementation. -using TypePrinter = - std::function(const ConstTypePtr&)>; - -struct TORCH_API Type : std::enable_shared_from_this { - private: - TypeKind kind_; - - protected: - Type(TypeKind kind) : kind_(kind) {} - - virtual std::string annotation_str_impl(TypePrinter printer) const { - return str(); - } - - public: - virtual bool operator==(const Type& rhs) const = 0; - - // subtyping relation. By default, we return true for the case - // when the type is exactly equal or if this <: T where rhs = Optional[T] - - // if this returns false and the why_not stream is non-null, it contains - // additional details that describe why this is not a subtype of 'rhs'. - // This additional information should only contain details that are not obvious - // from the annotation_str() that describes the type. For instance it is clear that `int <: str` is false - // but not clear why `Foo <: InterfaceBar` might be false. - virtual bool isSubtypeOfExt(const TypePtr& rhs, std::ostream* why_not) const; - virtual bool is_module() const; - bool isSubtypeOf(const TypePtr& rhs) const { - return isSubtypeOfExt(rhs, nullptr); - } - - // How this type will appear in FunctionSchema declarations - virtual std::string str() const = 0; - - // How this type will appear as if it were a type annotation in Python - // which is sometimes different than how it appears in declarations (e.g. - // int[] vs List[int]) - // - // Takes a custom printer that users can pass in to customize the output of - // this method. - std::string annotation_str(TypePrinter printer) const { - if (printer) { - // the printer can return nullopt to fall through to the default impl - if (auto renamed = printer(shared_from_this())) { - return *renamed; - } - } - return annotation_str_impl(printer); - } - std::string annotation_str() const { - // Overload instead of define a default value for `printer` to help - // debuggers out. - return annotation_str(nullptr); - } - - // Returns a human readable string that includes additional information like - // "type is inferred rather than explictly defined" to help construct more - // user-friendly messages. 
- virtual std::string repr_str() const { - return annotation_str(); - } - - TypeKind kind() const { - return kind_; - } - - virtual bool requires_grad() const { - for (const auto& ct : containedTypes()) { - if (ct->requires_grad()) { - return true; - } - } - return false; - } - - // Dynamically cast this object to the subclass indicated by the - // template variable, returning nullptr if the cast is invalid. - template - std::shared_ptr cast() { - if (T::Kind == kind()) { - return std::static_pointer_cast(shared_from_this()); - } - return nullptr; - } - template - std::shared_ptr cast() const { - if (T::Kind == kind()) { - return std::static_pointer_cast(shared_from_this()); - } - return nullptr; - } - template - std::shared_ptr expect() { - auto r = cast(); - AT_ASSERT(r); - return r; - } - template - std::shared_ptr expect() const { - auto r = cast(); - AT_ASSERT(r); - return r; - } - virtual ~Type() = default; - virtual bool hasFreeVariables() const { - return false; - } - // list of types this type contains, e.g. for a List then element type of a - // list for a tuple, the types of the tuple elements - virtual at::ArrayRef containedTypes() const { - return {}; - } - // create a new version of this type, replacing its contained types with - // contained_types - TypePtr withContained(std::vector contained_types) { - auto current_contained = containedTypes(); - AT_ASSERT(current_contained.size() == contained_types.size()); - if (current_contained.equals(contained_types)) { - return shared_from_this(); - } - return createWithContained(std::move(contained_types)); - } - // per-type constructor, you only need to override this if the - // containedTypes() is not empty - virtual TypePtr createWithContained( - std::vector contained_types) const { - AT_ERROR( - "type with contained types did not overload createWithContained: ", - str()); - } -}; - struct AnyType; using AnyTypePtr = std::shared_ptr; // Any is the top of the type hierarchy, all other types are subtypes diff --git a/aten/src/ATen/core/jit_type_base.h b/aten/src/ATen/core/jit_type_base.h new file mode 100644 index 000000000000..37da9ad7ef8d --- /dev/null +++ b/aten/src/ATen/core/jit_type_base.h @@ -0,0 +1,195 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace c10 { + +#define C10_FORALL_TYPES(_) \ + _(AnyType) \ + _(EnumType) \ + _(AnyEnumType) \ + _(TensorType) \ + _(StorageType) \ + _(TupleType) \ + _(ListType) \ + _(DictType) \ + _(NumberType) \ + _(FloatType) \ + _(FutureType) \ + _(RRefType) \ + _(IntType) \ + _(NoneType) \ + _(StringType) \ + _(GeneratorType) \ + _(QuantizerType) \ + _(BoolType) \ + _(OptionalType) \ + _(VarType) \ + _(DeviceObjType) \ + _(StreamObjType) \ + _(FunctionType) \ + _(ClassType) \ + _(PyObjectType) \ + _(CapsuleType) \ + _(InterfaceType) \ + _(QSchemeType) \ + _(LayoutType) \ + _(ScalarTypeType) \ + _(AnyListType) \ + _(AnyTupleType) \ + _(AnyClassType) + +enum class TypeKind { +#define DEFINE_TYPE(T) T, + C10_FORALL_TYPES(DEFINE_TYPE) +#undef DEFINE_TYPE +}; + +TORCH_API const char* typeKindToString(TypeKind kind); + +struct Type; +using TypePtr = std::shared_ptr; +using ConstTypePtr = std::shared_ptr; + +// Use this to customize how a Type is printed using `annotation_str()`. If +// c10::nullopt is returned, `annotation_str()` falls through to its default +// implementation. 
+using TypePrinter = + std::function(const ConstTypePtr&)>; + +struct TORCH_API Type : std::enable_shared_from_this { + private: + TypeKind kind_; + + protected: + Type(TypeKind kind) : kind_(kind) {} + + virtual std::string annotation_str_impl(TypePrinter printer) const { + return str(); + } + + public: + virtual bool operator==(const Type& rhs) const = 0; + + // subtyping relation. By default, we return true for the case + // when the type is exactly equal or if this <: T where rhs = Optional[T] + + // if this returns false and the why_not stream is non-null, it contains + // additional details that describe why this is not a subtype of 'rhs'. + // This additional information should only contain details that are not obvious + // from the annotation_str() that describes the type. For instance it is clear that `int <: str` is false + // but not clear why `Foo <: InterfaceBar` might be false. + virtual bool isSubtypeOfExt(const TypePtr& rhs, std::ostream* why_not) const; + virtual bool is_module() const; + bool isSubtypeOf(const TypePtr& rhs) const { + return isSubtypeOfExt(rhs, nullptr); + } + + // How this type will appear in FunctionSchema declarations + virtual std::string str() const = 0; + + // How this type will appear as if it were a type annotation in Python + // which is sometimes different than how it appears in declarations (e.g. + // int[] vs List[int]) + // + // Takes a custom printer that users can pass in to customize the output of + // this method. + std::string annotation_str(TypePrinter printer) const { + if (printer) { + // the printer can return nullopt to fall through to the default impl + if (auto renamed = printer(shared_from_this())) { + return *renamed; + } + } + return annotation_str_impl(printer); + } + std::string annotation_str() const { + // Overload instead of define a default value for `printer` to help + // debuggers out. + return annotation_str(nullptr); + } + + // Returns a human readable string that includes additional information like + // "type is inferred rather than explictly defined" to help construct more + // user-friendly messages. + virtual std::string repr_str() const { + return annotation_str(); + } + + TypeKind kind() const { + return kind_; + } + + virtual bool requires_grad() const { + for (const auto& ct : containedTypes()) { + if (ct->requires_grad()) { + return true; + } + } + return false; + } + + // Dynamically cast this object to the subclass indicated by the + // template variable, returning nullptr if the cast is invalid. + template + std::shared_ptr cast() { + if (T::Kind == kind()) { + return std::static_pointer_cast(shared_from_this()); + } + return nullptr; + } + template + std::shared_ptr cast() const { + if (T::Kind == kind()) { + return std::static_pointer_cast(shared_from_this()); + } + return nullptr; + } + template + std::shared_ptr expect() { + auto r = cast(); + AT_ASSERT(r); + return r; + } + template + std::shared_ptr expect() const { + auto r = cast(); + AT_ASSERT(r); + return r; + } + virtual ~Type() = default; + virtual bool hasFreeVariables() const { + return false; + } + // list of types this type contains, e.g. 
for a List then element type of a + // list for a tuple, the types of the tuple elements + virtual at::ArrayRef containedTypes() const { + return {}; + } + // create a new version of this type, replacing its contained types with + // contained_types + TypePtr withContained(std::vector contained_types) { + auto current_contained = containedTypes(); + AT_ASSERT(current_contained.size() == contained_types.size()); + if (current_contained.equals(contained_types)) { + return shared_from_this(); + } + return createWithContained(std::move(contained_types)); + } + // per-type constructor, you only need to override this if the + // containedTypes() is not empty + virtual TypePtr createWithContained( + std::vector contained_types) const { + AT_ERROR( + "type with contained types did not overload createWithContained: ", + str()); + } +}; + +} diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.cpp b/aten/src/ATen/cuda/detail/CUDAHooks.cpp index 00424ab83ba0..f38860e8ef13 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.cpp +++ b/aten/src/ATen/cuda/detail/CUDAHooks.cpp @@ -21,7 +21,7 @@ #endif #ifdef USE_MAGMA -#include +#include #endif #ifdef __HIP_PLATFORM_HCC__ diff --git a/aten/src/ATen/native/BinaryOps.cpp b/aten/src/ATen/native/BinaryOps.cpp index 9103eafb1f12..d4b514f6797b 100644 --- a/aten/src/ATen/native/BinaryOps.cpp +++ b/aten/src/ATen/native/BinaryOps.cpp @@ -71,7 +71,7 @@ static Tensor wrapped_scalar_tensor(Scalar scalar) { } TORCH_IMPL_FUNC(add_out) ( - Tensor& result, const Tensor& self, const Tensor& other, Scalar alpha + const Tensor& self, const Tensor& other, Scalar alpha, Tensor& result ) { add_stub(device_type(), *this, alpha); TORCH_INTERNAL_ASSERT(result.scalar_type() == output().dtype()); @@ -1109,11 +1109,11 @@ Tensor& xlogy_out(Tensor& result, const Tensor& self, const Tensor& other) { } Tensor& xlogy_out(Tensor& result, Scalar self, const Tensor& other) { - return at::xlogy_out(result, c10::scalar_to_tensor(self, other.device()), other); + return at::xlogy_out(result, wrapped_scalar_tensor(self), other); } Tensor& xlogy_out(Tensor& result, const Tensor& self, Scalar other) { - return at::xlogy_out(result, self, c10::scalar_to_tensor(other, self.device())); + return at::xlogy_out(result, self, wrapped_scalar_tensor(other)); } Tensor xlogy(const Tensor& x, const Tensor& y) { @@ -1124,11 +1124,11 @@ Tensor xlogy(const Tensor& x, const Tensor& y) { } Tensor xlogy(Scalar x, const Tensor& y) { - return at::xlogy(c10::scalar_to_tensor(x, y.device()), y); + return at::xlogy(wrapped_scalar_tensor(x), y); } Tensor xlogy(const Tensor& x, Scalar y) { - return at::xlogy(x, c10::scalar_to_tensor(y, x.device())); + return at::xlogy(x, wrapped_scalar_tensor(y)); } Tensor& xlogy_(Tensor& x, const Tensor& y) { @@ -1136,7 +1136,7 @@ Tensor& xlogy_(Tensor& x, const Tensor& y) { } Tensor& xlogy_(Tensor& x, Scalar y) { - return at::xlogy_out(x, x, c10::scalar_to_tensor(y, x.device())); + return at::xlogy_out(x, x, wrapped_scalar_tensor(y)); } } // namespace native diff --git a/aten/src/ATen/native/Embedding.cpp b/aten/src/ATen/native/Embedding.cpp index bf74e8b356c7..a4854e1ced4d 100644 --- a/aten/src/ATen/native/Embedding.cpp +++ b/aten/src/ATen/native/Embedding.cpp @@ -68,7 +68,7 @@ Tensor embedding_sparse_backward( Tensor indices = indices_; Tensor grad = grad_; if (padding_idx != -1) { - auto c = indices != padding_idx; + torch::List> c({indices != padding_idx}); indices = indices.index(c); grad = grad.index(c); } diff --git a/aten/src/ATen/native/IndexingUtils.h 
b/aten/src/ATen/native/IndexingUtils.h index 94d61b02dd0b..92f6957f25ad 100644 --- a/aten/src/ATen/native/IndexingUtils.h +++ b/aten/src/ATen/native/IndexingUtils.h @@ -1,6 +1,7 @@ #pragma once #include #include +#include #include @@ -15,40 +16,45 @@ static void invalid_mask(const Tensor & self, int64_t idx, const Tensor & mask, } -static std::vector expandTensors(const Tensor & self, TensorList indices) { +static std::vector expandTensors(const Tensor & self, const torch::List>& indices) { // If indices come in as ByteTensor or BoolTensor (masks), expand them into the equivalent indexing by LongTensors std::vector result; - for (const auto & index : indices) { - if (index.scalar_type() == kByte || index.scalar_type() == kBool) { - if (index.scalar_type() == kByte) { - TORCH_WARN("indexing with dtype torch.uint8 is now deprecated," \ - " please use a dtype torch.bool instead."); - } - // The sizes of the ByteTensor mask or bool tensor must match the sizes of the - // corresponding dimensions in self - for (int64_t j = 0; j < index.dim(); j++) { - int64_t srcIdx = result.size() + j; - if (index.size(j) != self.size(srcIdx)) { - invalid_mask(self, srcIdx, index, j); + for (c10::optional index_opt : indices) { + if (!index_opt.has_value()) { + result.emplace_back(); + } else { + Tensor index = std::move(*index_opt); + if (index.scalar_type() == kByte || index.scalar_type() == kBool) { + if (index.scalar_type() == kByte) { + TORCH_WARN("indexing with dtype torch.uint8 is now deprecated," \ + " please use a dtype torch.bool instead."); } + // The sizes of the ByteTensor mask or bool tensor must match the sizes of the + // corresponding dimensions in self + for (int64_t j = 0; j < index.dim(); j++) { + int64_t srcIdx = result.size() + j; + if (index.size(j) != self.size(srcIdx)) { + invalid_mask(self, srcIdx, index, j); + } + } + // Replace with nonzeros + auto nonzero = index.nonzero(); + for (int64_t j = 0; j < index.dim(); j++) { + result.emplace_back(nonzero.select(1, j)); + } + } else { + result.emplace_back(std::move(index)); } - // Replace with nonzeros - auto nonzero = index.nonzero(); - for (int64_t j = 0; j < index.dim(); j++) { - result.emplace_back(nonzero.select(1, j)); - } - } else { - result.emplace_back(index); } } return result; } -static void checkIndexTensorTypes(TensorList indices) { - for (auto& tensor : indices) { - if (tensor.defined()) { - auto scalarType = tensor.scalar_type(); +static void checkIndexTensorTypes(const torch::List>& indices) { + for (c10::optional tensor : indices) { + if (tensor.has_value() && tensor->defined()) { + auto scalarType = tensor->scalar_type(); if (scalarType != kLong && scalarType != kByte && scalarType != kBool) { TORCH_CHECK_INDEX(false, "tensors used as indices must be long, byte or bool tensors"); } @@ -56,6 +62,15 @@ static void checkIndexTensorTypes(TensorList indices) { } } +inline torch::List> toListOfOptionalTensors(ArrayRef list) { + torch::List> result; + result.reserve(list.size()); + for (const Tensor& a : list) { + result.push_back(a); + } + return result; +} + static bool hasContiguousSubspace(TensorList tl) { // true if all the non-null tensors are adjacent auto isDefined = [](const Tensor & tensor){ return tensor.defined(); }; diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index da8d2bd6db47..a37d1046bac2 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include 
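The reworked `expandTensors` above turns a byte/bool mask into equivalent long-tensor indices by taking the columns of `nonzero()`. A small sketch of that equivalence, assuming the post-patch `at::index` overload that takes a list of optional tensors (`mask_index_reference` is a made-up helper):

```cpp
#include <ATen/ATen.h>
#include <ATen/core/List.h>

// Indexing with a bool mask selects the same elements as indexing with the
// columns of mask.nonzero(), one long index tensor per mask dimension.
at::Tensor mask_index_reference(const at::Tensor& self, const at::Tensor& mask) {
  auto nonzero = mask.nonzero();  // [K, mask.dim()], one row per true element
  c10::List<c10::optional<at::Tensor>> indices;
  for (int64_t j = 0; j < mask.dim(); ++j) {
    indices.push_back(nonzero.select(1, j));
  }
  return at::index(self, indices);  // same elements as self.index({mask})
}
```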
@@ -73,7 +74,8 @@ Tensor logdet(const Tensor& self) { // U is singular when U(i, i) = 0 for some i in [1, self.size(-1)]. Tensor logdet_vals = diag_U.abs_().log_().sum(-1); if (self.dim() > 2) { - logdet_vals.index_put_((det_sign < 0).nonzero_numpy(), at::full({}, NAN, self.options())); + auto indices = toListOfOptionalTensors((det_sign < 0).nonzero_numpy()); + logdet_vals.index_put_(std::move(indices), at::full({}, NAN, self.options())); } else if (det_sign.item() < 0) { logdet_vals.fill_(NAN); } diff --git a/aten/src/ATen/native/Pow.cpp b/aten/src/ATen/native/Pow.cpp index bfc5f910e093..4d1601d3e6a0 100644 --- a/aten/src/ATen/native/Pow.cpp +++ b/aten/src/ATen/native/Pow.cpp @@ -31,11 +31,9 @@ Tensor& pow_out(Tensor& result, const Tensor& base, Scalar exp) { "result type ", common_dtype, "can't be cast to the desired output type ", result.scalar_type()); - auto exponent = (exp.isComplex()) ? exp.toComplexDouble() : exp.toDouble(); - - if (exponent == 0.0) { + if (exp.equal(0.0)) { result.resize_as_(base).fill_(1); - } else if (exponent == 1.0) { + } else if (exp.equal(1.0)) { result.resize_as_(base).copy_(base); } else { auto iter = TensorIterator::unary_op(result, base.to(common_dtype)); diff --git a/aten/src/ATen/native/README.md b/aten/src/ATen/native/README.md index 6f17ac860cf8..6e7664c1e1a5 100644 --- a/aten/src/ATen/native/README.md +++ b/aten/src/ATen/native/README.md @@ -335,22 +335,31 @@ set of reviewers. ### `use_c10_dispatcher` ``` -use_c10_dispatcher: 'with_codegenerated_unboxing_wrapper' -use_c10_dispatcher: 'hacky_wrapper_for_legacy_signatures' use_c10_dispatcher: 'full' +use_c10_dispatcher: 'hacky_wrapper_for_legacy_signatures' ``` This will indicate the level of integration with the c10 dispatcher. -If setting this to 'full' works for your operator, please do. -This will enabled the full templated boxing and unboxing for your operator. -Some ops use features that aren't supported by those templates yet, -and enabling `use_c10_dispatcher: full` for those will result in a compiler error. -For those, use `use_c10_dispatcher: 'with_codegenerated_unboxing_wrapper'` instead, -or just omit the argument because 'with_codegenerated_unboxing_wrapper' is the default. -`use_c10_dispatcher: hacky_wrapper_for_legacy_signatures` is similar to `full` -but adds a wrapper around the kernel before registering it with the dispatcher -to support some legacy function signatures for kernels that we didn't migrate to -the new signatures yet. +For any new ops, please set this to 'full'. This is also the default, +so you can just omit it. +This requires the operator function signature to be aligned with the +function schema in native_functions.yaml, i.e. +- out arguments have to be in the end of the argument list instead of in the beginning +- TensorOptions are taken as separate arguments +``` + const c10::optional& dtype, + const c10::optional& layout, + const c10::optional& device, + const c10::optional& pin_memory +``` + instead of one `TensorOptions` argument +- optional tensors are taken as `const c10::optional&` instead of `Tensor` +Some of our kernels are still written in a legacy way, not doing those things, +and need an adapter to work with the dispatcher calling convention. For those, we use +`use_c10_dispatcher: hacky_wrapper_for_legacy_signatures` to codegenerate a corresponding +adapter around them in the operator registration call. Over time, we will migrate all +those kernels to the new calling convention and hacky_wrapper will die. +Please don't use it for new operators. 
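As a hypothetical illustration of the `full` calling convention described above (operator and kernel names are made up, not real operators):

```cpp
// Schema: my_op(Tensor self, Tensor? weight, *, ScalarType? dtype=None,
//               Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
at::Tensor my_op(
    const at::Tensor& self,
    const c10::optional<at::Tensor>& weight,  // optional tensor: const c10::optional<Tensor>&
    c10::optional<at::ScalarType> dtype,      // TensorOptions passed as four separate
    c10::optional<at::Layout> layout,         // optional arguments instead of a single
    c10::optional<at::Device> device,         // TensorOptions parameter
    c10::optional<bool> pin_memory);

// Schema: my_op.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
// The out argument moves to the end of the C++ argument list.
at::Tensor& my_op_out(const at::Tensor& self, at::Tensor& out);
```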
### `manual_kernel_registration` diff --git a/aten/src/ATen/native/SpectralOps.cpp b/aten/src/ATen/native/SpectralOps.cpp index c8eb3cc99a01..289d1128d2f9 100644 --- a/aten/src/ATen/native/SpectralOps.cpp +++ b/aten/src/ATen/native/SpectralOps.cpp @@ -102,9 +102,12 @@ Tensor resize_fft_input(Tensor x, IntArrayRef dims, IntArrayRef sizes) { } // Complex to real FFT -Tensor fft_c2r(Tensor input, c10::optional n_opt, +Tensor fft_c2r(c10::string_view function_name, + Tensor out, Tensor input, c10::optional n_opt, int64_t unwrapped_dim, c10::optional norm_str, bool forward) { + TORCH_CHECK(!out.defined() || out.is_floating_point(), function_name, + " expects a floating point output tensor, but got ", out.scalar_type()); input = promote_tensor_fft(input, /*require_complex=*/true); const auto input_dim = input.dim(); const auto dim = maybe_wrap_dim(unwrapped_dim, input_dim); @@ -118,14 +121,22 @@ Tensor fft_c2r(Tensor input, c10::optional n_opt, // FIXME: _fft does not support complex_output=false with inverse=false input = at::conj(input); } - return at::_fft_c2r(input, dim, static_cast(norm), n); + if (out.defined()) { + return at::_fft_c2r_out(out, input, dim, static_cast(norm), n); + } else { + return at::_fft_c2r(input, dim, static_cast(norm), n); + } } // Real to complex FFT -Tensor fft_r2c(Tensor input, c10::optional n_opt, +Tensor fft_r2c(c10::string_view function_name, + Tensor out, Tensor input, c10::optional n_opt, int64_t unwrapped_dim, c10::optional norm_str, bool forward, bool onesided) { - TORCH_CHECK(!input.is_complex(), "Expected a real input tensor to FFT"); + TORCH_CHECK(!input.is_complex(), function_name, + " expects a real input tensor, but got ", input.scalar_type()); + TORCH_CHECK(!out.defined() || out.is_complex(), function_name, + " expects a complex output tensor, but got ", out.scalar_type()); input = promote_tensor_fft(input); const auto input_dim = input.dim(); const auto dim = maybe_wrap_dim(unwrapped_dim, input_dim); @@ -136,19 +147,29 @@ Tensor fft_r2c(Tensor input, c10::optional n_opt, } const auto norm = norm_from_string(norm_str, forward); - auto out = at::_fft_r2c(input, dim, static_cast(norm), onesided); + + Tensor ret; + if (out.defined() && forward) { + ret = at::_fft_r2c_out(out, input, dim, static_cast(norm), onesided); + } else { + ret = at::_fft_r2c(input, dim, static_cast(norm), onesided); + } + if (!forward) { // FIXME: _fft_r2c doesn't support native r2c IFFT - out = at::conj(out); + return out.defined() ? 
at::conj_out(out, ret) : at::conj(ret); + } else { + return ret; } - return out; } // Complex to complex FFT -Tensor fft_c2c(Tensor input, c10::optional n_opt, +Tensor fft_c2c(c10::string_view function_name, + Tensor out, Tensor input, c10::optional n_opt, int64_t unwrapped_dim, c10::optional norm_str, bool forward) { - TORCH_CHECK(input.is_complex(), "Expected a complex input tensor to FFT"); + TORCH_CHECK(input.is_complex(), function_name, + " expects a complex input tensor, but got ", input.scalar_type()); const auto input_dim = input.dim(); const auto dim = maybe_wrap_dim(unwrapped_dim, input_dim); const auto n = n_opt.value_or(input.sizes()[dim]); @@ -157,7 +178,13 @@ Tensor fft_c2c(Tensor input, c10::optional n_opt, input = resize_fft_input(input, dim, n); } const auto norm = norm_from_string(norm_str, forward); - return at::_fft_c2c(input, dim, static_cast(norm), forward); + if (out.defined()) { + TORCH_CHECK(out.is_complex(), function_name, + " expects a complex output tensor, but got ", out.scalar_type()); + return at::_fft_c2c_out(out, input, dim, static_cast(norm), forward); + } else { + return at::_fft_c2c(input, dim, static_cast(norm), forward); + } } // Dimensions to transform, and the signal shape in those dimensions @@ -230,12 +257,18 @@ ShapeAndDims canonicalize_fft_shape_and_dim_args( // Complex to complex n-dimensional fft Tensor fftn_c2c( - const Tensor& input, IntArrayRef shape, IntArrayRef dim, - c10::optional norm_str, bool forward) { - TORCH_CHECK(input.is_complex(), "Expected a complex input tensor to FFT"); + c10::string_view function_name, + Tensor out, const Tensor& input, IntArrayRef shape, + IntArrayRef dim, c10::optional norm_str, bool forward) { + TORCH_CHECK(input.is_complex(), function_name, " expects a complex input tensor, but got", input.scalar_type()); Tensor x = resize_fft_input(input, dim, shape); const auto norm = norm_from_string(norm_str, forward); - return at::_fft_c2c(x, dim, static_cast(norm), forward); + if (out.defined()) { + TORCH_CHECK(out.is_complex(), function_name, " expects a complex output tensor, but got ", out.scalar_type()); + return at::_fft_c2c_out(out, x, dim, static_cast(norm), forward); + } else { + return at::_fft_c2c(x, dim, static_cast(norm), forward); + } } } // namespace (anonymous) @@ -244,35 +277,79 @@ Tensor fftn_c2c( Tensor fft_fft(const Tensor& self, c10::optional n, int64_t dim, c10::optional norm) { return self.is_complex() ? - fft_c2c(self, n, dim, norm, /*forward=*/true) : - fft_r2c(self, n, dim, norm, /*forward=*/true, /*onesided=*/false); + fft_c2c("fft", {}, self, n, dim, norm, /*forward=*/true) : + fft_r2c("fft", {}, self, n, dim, norm, /*forward=*/true, /*onesided=*/false); +} + +Tensor& fft_fft_out(Tensor& out, const Tensor& self, c10::optional n, + int64_t dim, c10::optional norm) { + if (self.is_complex()) { + fft_c2c("fft", out, self, n, dim, norm, /*forward=*/true); + } else { + fft_r2c("fft", out, self, n, dim, norm, /*forward=*/true, /*onesided=*/false); + } + return out; } Tensor fft_ifft(const Tensor& self, c10::optional n, int64_t dim, c10::optional norm) { return self.is_complex() ? 
- fft_c2c(self, n, dim, norm, /*forward=*/false) : - fft_r2c(self, n, dim, norm, /*forward=*/false, /*onesided=*/false); + fft_c2c("ifft", {}, self, n, dim, norm, /*forward=*/false) : + fft_r2c("ifft", {}, self, n, dim, norm, /*forward=*/false, /*onesided=*/false); +} + +Tensor& fft_ifft_out(Tensor& out, const Tensor& self, c10::optional n, + int64_t dim, c10::optional norm) { + if (self.is_complex()) { + fft_c2c("ifft", out, self, n, dim, norm, /*forward=*/false); + } else { + fft_r2c("ifft", out, self, n, dim, norm, /*forward=*/false, /*onesided=*/false); + } + return out; } Tensor fft_rfft(const Tensor& self, c10::optional n, int64_t dim, c10::optional norm) { - return fft_r2c(self, n, dim, norm, /*forward=*/true, /*onesided=*/true); + return fft_r2c("rfft", {}, self, n, dim, norm, /*forward=*/true, /*onesided=*/true); +} + +Tensor& fft_rfft_out(Tensor& out, const Tensor& self, c10::optional n, + int64_t dim, c10::optional norm) { + fft_r2c("rfft", out, self, n, dim, norm, /*forward=*/true, /*onesided=*/true); + return out; } Tensor fft_irfft(const Tensor& self, c10::optional n, int64_t dim, c10::optional norm) { - return fft_c2r(self, n, dim, norm, /*forward=*/false); + return fft_c2r("irfft", {}, self, n, dim, norm, /*forward=*/false); +} + +Tensor& fft_irfft_out(Tensor& out, const Tensor& self, c10::optional n, + int64_t dim, c10::optional norm) { + fft_c2r("irfft", out, self, n, dim, norm, /*forward=*/false); + return out; } Tensor fft_hfft(const Tensor& self, c10::optional n, int64_t dim, c10::optional norm) { - return fft_c2r(self, n, dim, norm, /*forward=*/true); + return fft_c2r("hfft", {}, self, n, dim, norm, /*forward=*/true); +} + +Tensor& fft_hfft_out(Tensor& out, const Tensor& self, c10::optional n, + int64_t dim, c10::optional norm) { + fft_c2r("hfft", out, self, n, dim, norm, /*forward=*/true); + return out; } Tensor fft_ihfft(const Tensor& self, c10::optional n, int64_t dim, c10::optional norm) { - return fft_r2c(self, n, dim, norm, /*forward=*/false, /*onesided=*/true); + return fft_r2c("ihfft", {}, self, n, dim, norm, /*forward=*/false, /*onesided=*/true); +} + +Tensor& fft_ihfft_out(Tensor& out, const Tensor& self, c10::optional n, + int64_t dim, c10::optional norm) { + fft_r2c("ihfft", out, self, n, dim, norm, /*forward=*/false, /*onesided=*/true); + return out; } Tensor fft_fftn(const Tensor& self, c10::optional s, @@ -281,7 +358,18 @@ Tensor fft_fftn(const Tensor& self, c10::optional s, auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim); // TODO: For real input, perform rfftn then mirror with conjugate symmetry Tensor input = promote_tensor_fft(self, /*require_complex=*/true); - return fftn_c2c(input, desc.shape, desc.dim, norm, /*forward=*/true); + return fftn_c2c("fftn", {}, input, desc.shape, desc.dim, norm, /*forward=*/true); +} + +Tensor& fft_fftn_out(Tensor& out, const Tensor& self, + c10::optional s, + c10::optional dim, + c10::optional norm) { + auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim); + // TODO: For real input, perform rfftn then mirror with conjugate symmetry + Tensor input = promote_tensor_fft(self, /*require_complex=*/true); + fftn_c2c("fftn", out, input, desc.shape, desc.dim, norm, /*forward=*/true); + return out; } Tensor fft_ifftn(const Tensor& self, c10::optional s, @@ -289,24 +377,55 @@ Tensor fft_ifftn(const Tensor& self, c10::optional s, c10::optional norm) { auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim); Tensor input = promote_tensor_fft(self, /*require_complex=*/true); - return fftn_c2c(input, 
desc.shape, desc.dim, norm, /*forward=*/false); + return fftn_c2c("ifftn", {}, input, desc.shape, desc.dim, norm, /*forward=*/false); } -Tensor fft_rfftn(const Tensor& self, c10::optional s, - c10::optional dim, - c10::optional norm_str) { +Tensor& fft_ifftn_out(Tensor& out, const Tensor& self, + c10::optional s, + c10::optional dim, + c10::optional norm) { + auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim); + Tensor input = promote_tensor_fft(self, /*require_complex=*/true); + fftn_c2c("ifftn", out, input, desc.shape, desc.dim, norm, /*forward=*/false); + return out; +} + +static Tensor fft_rfftn_impl(Tensor out, const Tensor& self, + c10::optional s, + c10::optional dim, + const c10::optional& norm_str) { TORCH_CHECK(!self.is_complex(), "rfftn expects a real-valued input tensor, but got ", self.scalar_type()); auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim); TORCH_CHECK(desc.shape.size() > 0, "rfftn must transform at least one axis"); Tensor input = promote_tensor_fft(self, /*require_complex=*/false); Tensor x = resize_fft_input(input, desc.dim, desc.shape); const auto norm = norm_from_string(norm_str, /*forward=*/true); - return at::_fft_r2c(x, desc.dim, static_cast(norm), /*onesided=*/true); + if (out.defined()) { + TORCH_CHECK(out.is_complex(), "rfftn expects a complex-valued output tensor, but got ", out.scalar_type()); + return at::_fft_r2c_out(out, x, desc.dim, static_cast(norm), /*onesided=*/true); + } else { + return at::_fft_r2c(x, desc.dim, static_cast(norm), /*onesided=*/true); + } } -Tensor fft_irfftn(const Tensor& self, c10::optional s, +Tensor fft_rfftn(const Tensor& self, c10::optional s, c10::optional dim, c10::optional norm_str) { + return fft_rfftn_impl({}, self, s, dim, norm_str); +} + +Tensor& fft_rfftn_out(Tensor& out, const Tensor& self, + c10::optional s, + c10::optional dim, + c10::optional norm_str) { + fft_rfftn_impl(out, self, s, dim, norm_str); + return out; +} + +static Tensor fft_irfftn_impl(Tensor out, const Tensor& self, + c10::optional s, + c10::optional dim, + const c10::optional& norm_str) { auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim); TORCH_CHECK(desc.shape.size() > 0, "irfftn must transform at least one axis"); @@ -323,7 +442,27 @@ Tensor fft_irfftn(const Tensor& self, c10::optional s, Tensor input = promote_tensor_fft(self, /*require_complex=*/true); Tensor x = resize_fft_input(input, desc.dim, desc.shape); const auto norm = norm_from_string(norm_str, /*forward=*/false); - return at::_fft_c2r(x, desc.dim, static_cast(norm), last_dim_size); + if (out.defined()) { + TORCH_CHECK(out.is_floating_point(), "irfftn expects a floating point output tensor, but got ", out.scalar_type()); + return at::_fft_c2r_out(out, x, desc.dim, static_cast(norm), last_dim_size); + } else { + return at::_fft_c2r(x, desc.dim, static_cast(norm), last_dim_size); + } +} + +Tensor fft_irfftn(const Tensor& self, + c10::optional s, + c10::optional dim, + c10::optional norm_str) { + return fft_irfftn_impl({}, self, s, dim, norm_str); +} + +Tensor& fft_irfftn_out(Tensor& out, const Tensor& self, + c10::optional s, + c10::optional dim, + c10::optional norm_str) { + fft_irfftn_impl(out, self, s, dim, norm_str); + return out; } Tensor fft_fft2(const Tensor& self, c10::optional s, @@ -331,41 +470,69 @@ Tensor fft_fft2(const Tensor& self, c10::optional s, return native::fft_fftn(self, s, dim, std::move(norm)); } +Tensor& fft_fft2_out(Tensor& out, const Tensor& self, c10::optional s, + IntArrayRef dim, c10::optional norm) { + return 
native::fft_fftn_out(out, self, s, dim, std::move(norm)); +} + Tensor fft_ifft2(const Tensor& self, c10::optional s, IntArrayRef dim, c10::optional norm) { return native::fft_ifftn(self, s, dim, std::move(norm)); } +Tensor& fft_ifft2_out(Tensor& out, const Tensor& self, c10::optional s, + IntArrayRef dim, c10::optional norm) { + return native::fft_ifftn_out(out, self, s, dim, std::move(norm)); +} + Tensor fft_rfft2(const Tensor& self, c10::optional s, IntArrayRef dim, c10::optional norm) { return native::fft_rfftn(self, s, dim, std::move(norm)); } +Tensor& fft_rfft2_out(Tensor& out, const Tensor& self, c10::optional s, + IntArrayRef dim, c10::optional norm) { + return native::fft_rfftn_out(out, self, s, dim, std::move(norm)); +} + Tensor fft_irfft2(const Tensor& self, c10::optional s, IntArrayRef dim, c10::optional norm) { return native::fft_irfftn(self, s, dim, std::move(norm)); } -Tensor fft_fftfreq(int64_t n, double d, const TensorOptions& options) { - ScalarType dtype = typeMetaToScalarType(options.dtype()); +Tensor& fft_irfft2_out(Tensor& out, const Tensor& self, c10::optional s, + IntArrayRef dim, c10::optional norm) { + return native::fft_irfftn_out(out, self, s, dim, std::move(norm)); +} + +Tensor& fft_fftfreq_out(Tensor& out, int64_t n, double d) { + ScalarType dtype = out.scalar_type(); TORCH_CHECK(at::isFloatingType(dtype) || at::isComplexType(dtype), "fftfreq requires a floating point or complex dtype"); // TODO: arange doesn't have complex support - Tensor result = native::arange(n, options); - auto right_slice = result.slice(0, (n + 1) / 2, 0); + at::arange_out(out, n); + auto right_slice = out.slice(0, (n + 1) / 2, 0); at::arange_out(right_slice, -(n/2), 0, 1); - result.mul_(1.0 / (n * d)); // Slightly faster than div_(n*d) - return result; + return out.mul_(1.0 / (n * d)); // Slightly faster than div_(n*d) } -Tensor fft_rfftfreq(int64_t n, double d, const TensorOptions& options) { - ScalarType dtype = typeMetaToScalarType(options.dtype()); +Tensor fft_fftfreq(int64_t n, double d, const TensorOptions& options) { + auto out = at::empty({n}, options); + return native::fft_fftfreq_out(out, n, d); +} + +Tensor& fft_rfftfreq_out(Tensor& out, int64_t n, double d) { + ScalarType dtype = out.scalar_type(); TORCH_CHECK(at::isFloatingType(dtype) || at::isComplexType(dtype), "rfftfreq requires a floating point or complex dtype"); // TODO: arange doesn't have complex support - Tensor result = native::arange(n/2 + 1, options); - result.mul_(1.0 / (n * d)); // Slightly faster than div_(n*d) - return result; + native::arange_out(out, n/2 + 1); + return out.mul_(1.0 / (n * d)); // Slightly faster than div_(n*d) +} + +Tensor fft_rfftfreq(int64_t n, double d, const TensorOptions& options) { + auto out = at::empty({n/2 + 1}, options); + return native::fft_rfftfreq_out(out, n, d); } // If an array dim is specified, wraps them according to self.dim(). @@ -469,18 +636,20 @@ Tensor stft(const Tensor& self, const int64_t n_fft, const optional hop const bool return_complex = return_complexOpt.value_or( self.is_complex() || (window.defined() && window.is_complex())); if (!return_complex) { - TORCH_CHECK(return_complexOpt.has_value(), - "stft requires the return_complex parameter be given for real inputs." - "You should pass return_complex=True to opt-in to complex dtype returns " - "(which will be required in a future pytorch release). 
" + if (!return_complexOpt.has_value()) { + TORCH_WARN_ONCE( + "stft will soon require the return_complex parameter be given for real inputs, " + "and will further require that return_complex=True in a future PyTorch release." ); + } - TORCH_WARN_ONCE( - "stft with return_complex=False is deprecated. In a future pytorch " - "release, stft will return complex tensors for all inputs, and " - "return_complex=False will raise an error.\n" - "Note: you can still call torch.view_as_real on the complex output to " - "recover the old return format."); + + // TORCH_WARN_ONCE( + // "stft with return_complex=False is deprecated. In a future pytorch " + // "release, stft will return complex tensors for all inputs, and " + // "return_complex=False will raise an error.\n" + // "Note: you can still call torch.view_as_real on the complex output to " + // "recover the old return format."); } if (!at::isFloatingType(self.scalar_type()) && !at::isComplexType(self.scalar_type())) { diff --git a/aten/src/ATen/native/TensorAdvancedIndexing.cpp b/aten/src/ATen/native/TensorAdvancedIndexing.cpp index 1d9f9d9d2a12..3ced0cf5eb52 100644 --- a/aten/src/ATen/native/TensorAdvancedIndexing.cpp +++ b/aten/src/ATen/native/TensorAdvancedIndexing.cpp @@ -206,7 +206,7 @@ AdvancedIndex::AdvancedIndex(const Tensor& src, TensorList indices_list) } } -static AdvancedIndex make_info(Tensor self, TensorList orig) { +static AdvancedIndex make_info(Tensor self, const torch::List>& orig) { checkIndexTensorTypes(orig); // first expand BoolTensor (masks) or ByteTensor (masks) into 1 or more LongTensors auto indices = expandTensors(self, orig); @@ -281,7 +281,7 @@ static TensorIterator make_index_out_iterator(const AdvancedIndex& info, Tensor& return config.build(); } -Tensor index(const Tensor & self, TensorList indices) { +Tensor index(const Tensor & self, const torch::List>& indices) { TORCH_CHECK_INDEX(indices.size() <= (size_t)self.dim(), "too many indices for tensor of dimension ", self.dim(), " (got ", indices.size(), ")"); auto info = make_info(self, indices); @@ -290,7 +290,7 @@ Tensor index(const Tensor & self, TensorList indices) { return iter.output(); } -Tensor quantized_index(const Tensor & self, TensorList indices) { +Tensor quantized_index(const Tensor & self, const torch::List>& indices) { TORCH_INTERNAL_ASSERT( self.qscheme() == c10::kPerTensorAffine || self.qscheme() == c10::kPerTensorSymmetric, @@ -311,12 +311,14 @@ Tensor quantized_index(const Tensor & self, TensorList indices) { res, self.q_scale(), self.q_zero_point(), self.scalar_type()); } -Tensor& index_out(Tensor& result, const Tensor & self, TensorList indices) { +Tensor& index_out(Tensor& result, const Tensor & self, const torch::List>& indices) { TORCH_CHECK_INDEX(indices.size() <= (size_t)self.dim(), "too many indices for tensor of dimension ", self.dim(), " (got ", indices.size(), ")"); at::assert_no_internal_overlap(result); at::assert_no_overlap(result, self); - for (auto& index: indices) { - at::assert_no_overlap(result, index); + for (const c10::optional& index: indices) { + if (index.has_value()) { + at::assert_no_overlap(result, *index); + } } auto info = make_info(self, indices); @@ -325,11 +327,11 @@ Tensor& index_out(Tensor& result, const Tensor & self, TensorList indices) { return result; } -Tensor index_put(const Tensor & self, TensorList indices, const Tensor & value, bool accumulate) { +Tensor index_put(const Tensor & self, const torch::List>& indices, const Tensor & value, bool accumulate) { return 
self.clone(at::MemoryFormat::Preserve).index_put_(indices, value, accumulate); } -Tensor & _index_put_impl_(Tensor & self, TensorList indices, const Tensor & value, const bool accumulate, const bool unsafe) { +Tensor & _index_put_impl_(Tensor & self, const torch::List>& indices, const Tensor & value, const bool accumulate, const bool unsafe) { TORCH_CHECK_INDEX(indices.size() <= (size_t)self.dim(), "too many indices for tensor of dimension ", self.dim(), " (got ", indices.size(), ")"); if (at::has_internal_overlap(self) == MemOverlap::YES) { TORCH_WARN( @@ -338,8 +340,10 @@ Tensor & _index_put_impl_(Tensor & self, TensorList indices, const Tensor & valu "This also applies to advanced indexing e.g. tensor[indices] = tensor"); } at::assert_no_overlap(self, value); - for (auto& index: indices) { - at::assert_no_overlap(self, index); + for (const c10::optional& index: indices) { + if (index.has_value()) { + at::assert_no_overlap(self, *index); + } } if (accumulate && self.device().type() == kCUDA) { @@ -356,7 +360,7 @@ Tensor & _index_put_impl_(Tensor & self, TensorList indices, const Tensor & valu } -Tensor & index_put_(Tensor & self, TensorList indices, const Tensor & value, const bool accumulate) { +Tensor & index_put_(Tensor & self, const torch::List>& indices, const Tensor & value, const bool accumulate) { return at::_index_put_impl_(self, indices, value, accumulate, /*unsafe=*/false); } @@ -467,7 +471,7 @@ Tensor& index_add_cpu_(Tensor & self, int64_t dim, const Tensor & index, const T // explicitly capture all required variables to work around windows build // TODO: fix this when windows can correctly capture variables in nested lambda - AT_DISPATCH_ALL_TYPES(self.scalar_type(), "index_add_", [&self, &source, &dim, &index_contig, &numel] { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX(self.scalar_type(), "index_add_", [&self, &source, &dim, &index_contig, &numel] { auto self_stride = self.dim() == 0 ? 1 : self.stride(dim); auto source_stride = source.dim() == 0 ? 1 : source.stride(dim); // TODO: Maybe TensorAccessor can beused here? @@ -678,7 +682,7 @@ Tensor & index_select_out_cpu_(Tensor & result, const Tensor & self, int64_t dim TORCH_CHECK(result.dim() <= 1, "result.dim() (", result.dim(), ") must one or zero for given self.dim() (", self.dim(), ")"); // explicitly capture all required variables to work around windows build // TODO: fix this when windows can correctly capture variables in nested lambda - AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Bool, self.scalar_type(), "index_select", + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(at::ScalarType::Bool, self.scalar_type(), "index_select", [&index_contig, &self, &result, &dim, &numel] { auto self_stride = self.dim() == 0 ? 1 : self.stride(dim); auto result_stride = result.dim() == 0 ? 
1 : result.stride(dim); diff --git a/aten/src/ATen/native/TensorAdvancedIndexing.h b/aten/src/ATen/native/TensorAdvancedIndexing.h index 560b46162546..0e0958606de1 100644 --- a/aten/src/ATen/native/TensorAdvancedIndexing.h +++ b/aten/src/ATen/native/TensorAdvancedIndexing.h @@ -15,7 +15,7 @@ enum class SCATTER_GATHER_OP: uint8_t {REDUCE_ADD, REDUCE_MULTIPLY}; using index_fn = void(*)(TensorIterator &, IntArrayRef indexed_sizes, IntArrayRef indexed_strides); using index_put_fn = void(*)(TensorIterator &, IntArrayRef indexed_sizes, IntArrayRef indexed_strides, bool accumulate); -using index_put_accum_fn = void(*)(Tensor &, TensorList , const Tensor &, bool unsafe); +using index_put_accum_fn = void(*)(Tensor &, const c10::List> &, const Tensor &, bool unsafe); using masked_fill_fn = void(*)(TensorIterator &, Scalar scalar); using masked_select_fn = void(*)(TensorIterator &, int64_t orig_stride); @@ -42,6 +42,6 @@ DECLARE_DISPATCH(scatter_add_fn, scatter_add_stub); DECLARE_DISPATCH(scatter_reduce_fn, scatter_reduce_stub); DECLARE_DISPATCH(scatter_scalar_reduce_fn, scatter_scalar_reduce_stub); -TORCH_API Tensor& index_out(Tensor& result, const Tensor & self, TensorList indices); +TORCH_API Tensor& index_out(Tensor& result, const Tensor & self, const c10::List>& indices); }} // namespace at::native diff --git a/aten/src/ATen/native/TensorCompare.cpp b/aten/src/ATen/native/TensorCompare.cpp index b27a995962b4..5435f5042ce0 100644 --- a/aten/src/ATen/native/TensorCompare.cpp +++ b/aten/src/ATen/native/TensorCompare.cpp @@ -38,6 +38,8 @@ Tensor isclose(const Tensor& self, const Tensor& other, double rtol, double atol TORCH_CHECK(self.scalar_type() == other.scalar_type(), self.scalar_type(), " did not match ", other.scalar_type()); TORCH_CHECK(!(self.is_complex() && equal_nan), "isclose with equal_nan=True is not supported for complex inputs."); + TORCH_CHECK(!(self.is_quantized() || other.is_quantized()), + "isclose is not supported for quantized inputs."); // Checks that rtol and atol are non-negative // Note: consistent with Python's isclose but divergent from NumPy's, which diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp index f8ba5527e5a9..09d50356abd9 100644 --- a/aten/src/ATen/native/TensorShape.cpp +++ b/aten/src/ATen/native/TensorShape.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -1467,15 +1468,25 @@ inferSqueezeGeometry(const Tensor& tensor, int64_t dim) { return std::make_tuple(sizes, strides); } -std::tuple, std::vector > +namespace { +// Named type instead of a pair/tuple so that we can be sure to +// construct the vectors in place and get NRVO. +struct InferUnsqueezeGeometryResult { + c10::SmallVector sizes; + c10::SmallVector strides; + InferUnsqueezeGeometryResult(IntArrayRef tensor_sizes, IntArrayRef tensor_strides) + : sizes(tensor_sizes.begin(), tensor_sizes.end()) + , strides(tensor_strides.begin(), tensor_strides.end()) {} +}; +} +InferUnsqueezeGeometryResult inferUnsqueezeGeometry(const Tensor& tensor, int64_t dim) { - auto sizes = tensor.sizes().vec(); - auto strides = tensor.strides().vec(); - int64_t new_stride = dim >= tensor.dim() ? 1 : sizes[dim] * strides[dim]; - sizes.insert(sizes.begin() + dim, 1); - strides.insert(strides.begin() + dim, new_stride); + InferUnsqueezeGeometryResult result(tensor.sizes(), tensor.strides()); + int64_t new_stride = dim >= tensor.dim() ? 
1 : result.sizes[dim] * result.strides[dim]; + result.sizes.insert(result.sizes.begin() + dim, 1); + result.strides.insert(result.strides.begin() + dim, new_stride); - return std::make_tuple(sizes, strides); + return result; } Tensor squeeze_qtensor(const Tensor& self) { @@ -1624,7 +1635,7 @@ Tensor unsqueeze_qtensor(const Tensor& self, int64_t dim) { axis, quantizer->scalar_type()); } - return make_qtensor(self, std::get<0>(g), std::get<1>(g), quantizer); + return make_qtensor(self, g.sizes, g.strides, quantizer); } Tensor unsqueeze(const Tensor& self, int64_t dim) { @@ -1636,7 +1647,7 @@ Tensor unsqueeze(const Tensor& self, int64_t dim) { return unsqueeze_qtensor(self, dim); } else { auto g = inferUnsqueezeGeometry(self, dim); - return self.as_strided(std::get<0>(g), std::get<1>(g)); + return self.as_strided(g.sizes, g.strides); } } @@ -1644,7 +1655,7 @@ Tensor & unsqueeze_(Tensor& self, int64_t dim) { dim = maybe_wrap_dim(dim, self.dim() + 1); auto g = inferUnsqueezeGeometry(self, dim); - return self.as_strided_(std::get<0>(g), std::get<1>(g)); + return self.as_strided_(g.sizes, g.strides); } Tensor flatten(const Tensor& self, int64_t start_dim, int64_t end_dim) { diff --git a/aten/src/ATen/native/TensorTransformations.cpp b/aten/src/ATen/native/TensorTransformations.cpp index fdee519c4bd0..5c6ab40b0ad4 100644 --- a/aten/src/ATen/native/TensorTransformations.cpp +++ b/aten/src/ATen/native/TensorTransformations.cpp @@ -73,7 +73,7 @@ Tensor flip_cpu(const Tensor& self, IntArrayRef dims) { ); }); } else { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(at::ScalarType::Bool, + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kHalf, kBFloat16, in_tensor.scalar_type(), "flip_cpu", [&] { flip_cpu_kernel( diff --git a/aten/src/ATen/native/TestOps.cpp b/aten/src/ATen/native/TestOps.cpp index 0ebdce6795aa..7a6f7c6e8e05 100644 --- a/aten/src/ATen/native/TestOps.cpp +++ b/aten/src/ATen/native/TestOps.cpp @@ -2,6 +2,7 @@ #include #include +#include namespace at { namespace native { @@ -50,5 +51,22 @@ Tensor _test_string_default(const Tensor& dummy, std::string a, std::string b) { return dummy; } +// Test that overloads with ambiguity created by defaulted parameters work. 
+// The operator declared first should have priority always + +// Overload a +Tensor _test_ambiguous_defaults(const Tensor& dummy, int64_t a, int64_t b) { + TORCH_CHECK(a == 1); + TORCH_CHECK(b == 1); + return c10::scalar_to_tensor(1); +} + +// Overload b +Tensor _test_ambiguous_defaults(const Tensor& dummy, int64_t a, std::string b) { + TORCH_CHECK(a == 2); + TORCH_CHECK(b == "2"); + return c10::scalar_to_tensor(2); +} + } // namespace native } // namespace at diff --git a/aten/src/ATen/native/UnaryOps.cpp b/aten/src/ATen/native/UnaryOps.cpp index e6dd1bc4afde..0f6da7e4292a 100644 --- a/aten/src/ATen/native/UnaryOps.cpp +++ b/aten/src/ATen/native/UnaryOps.cpp @@ -326,8 +326,12 @@ Tensor& reciprocal_out(Tensor& result, const Tensor& self) { return unary_op_imp Tensor reciprocal(const Tensor& self) { return unary_op_impl_float(self, reciprocal_stub); } Tensor& reciprocal_(Tensor& self) { return unary_op_impl_(self, at::reciprocal_out); } -Tensor& rsqrt_out(Tensor& result, const Tensor& self) { return unary_op_impl_out(result, self, rsqrt_stub); } -Tensor rsqrt(const Tensor& self) { return unary_op_impl(self, at::rsqrt_out); } +Tensor& rsqrt_out(Tensor& result, const Tensor& self) { + return unary_op_impl_float_out(result, self, rsqrt_stub); +} +Tensor rsqrt(const Tensor& self) { + return unary_op_impl_float(self, rsqrt_stub); +} Tensor& rsqrt_(Tensor& self) { return unary_op_impl_(self, at::rsqrt_out); } Tensor& sign_out(Tensor& result, const Tensor& self) { diff --git a/aten/src/ATen/native/UpSampleNearest1d.cpp b/aten/src/ATen/native/UpSampleNearest1d.cpp index b9dd52dffa5d..6478bbb58eaf 100644 --- a/aten/src/ATen/native/UpSampleNearest1d.cpp +++ b/aten/src/ATen/native/UpSampleNearest1d.cpp @@ -66,19 +66,21 @@ TORCH_META_FUNC(upsample_nearest1d_backward) ( namespace native { TORCH_IMPL_FUNC(upsample_nearest1d_out_cpu) ( - Tensor& output, const Tensor& input, IntArrayRef output_size, - c10::optional scales) { + c10::optional scales, + Tensor& output +) { upsample_nearest1d_kernel(kCPU, output, input, scales); } TORCH_IMPL_FUNC(upsample_nearest1d_backward_out_cpu) ( - Tensor& grad_input, const Tensor& grad_output, IntArrayRef output_size, IntArrayRef input_size, - c10::optional scales) { + c10::optional scales, + Tensor& grad_input +) { grad_input.zero_(); upsample_nearest1d_backward_kernel(kCPU, grad_input, grad_output, scales); } diff --git a/aten/src/ATen/native/cpu/PowKernel.cpp b/aten/src/ATen/native/cpu/PowKernel.cpp index b7ec099a80da..6f0d153e978a 100644 --- a/aten/src/ATen/native/cpu/PowKernel.cpp +++ b/aten/src/ATen/native/cpu/PowKernel.cpp @@ -63,7 +63,7 @@ void pow_tensor_scalar_kernel(TensorIterator& iter, Scalar exp_scalar) { ); } else if (exp == -0.5) { cpu_kernel_vec(iter, - [](scalar_t base) -> scalar_t { + [](scalar_t base) __ubsan_ignore_float_divide_by_zero__ -> scalar_t { return 1.0 / std::sqrt(base); }, [](Vec base) -> Vec { return base.rsqrt(); } diff --git a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp index 5f96e01ab319..32033abcd4e2 100644 --- a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp @@ -225,7 +225,7 @@ static void norm_kernel_tensor_iterator_impl( binary_kernel_reduce( iter, AbsMaxOps(), - std::numeric_limits::min() + acc_t(0) ); }); } else if (val == -INFINITY) { diff --git a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp index 049b3eff6b5b..32ebaf7752f7 100644 --- a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp +++ 
b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp @@ -587,10 +587,10 @@ static void random_full_64_bits_range_kernel(TensorIterator& iter, c10::optional } static void rsqrt_kernel(TensorIterator& iter) { - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(iter.dtype(), "rsqrt_cpu", [&] { + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(iter.common_dtype(), "rsqrt_cpu", [&] { cpu_kernel_vec( iter, - [=](scalar_t a) -> scalar_t { + [=](scalar_t a) __ubsan_ignore_float_divide_by_zero__ -> scalar_t { return (static_cast(1)) / std::sqrt(a); }, [=](Vec256 a) { return a.rsqrt(); }); diff --git a/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu b/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu index 7e6384c44b24..3fbd693d17b1 100644 --- a/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu +++ b/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu @@ -16,8 +16,8 @@ #include // for USE_MAGMA #ifdef USE_MAGMA -#include #include +#include const bool use_magma_ = true; #else @@ -95,10 +95,18 @@ void magmaCholeskyBatched( magma_uplo_t uplo, magma_int_t n, scalar_t** dA_array, magma_int_t ldda, magma_int_t* info_array, magma_int_t batchsize, const MAGMAQueue& magma_queue); -template +template void magmaTriangularSolve( - magma_uplo_t uplo, magma_trans_t trans, magma_diag_t diag, magma_int_t m, magma_int_t n, - scalar_t* dA, magma_int_t ldda, scalar_t* dB, magma_int_t lddb); + magma_uplo_t uplo, + magma_trans_t trans, + magma_diag_t diag, + magma_int_t m, + magma_int_t n, + scalar_t* dA, + magma_int_t ldda, + scalar_t* dB, + magma_int_t lddb, + const MAGMAQueue& magma_queue); template void magmaTriangularSolveBatched( @@ -662,45 +670,117 @@ void magmaCholeskyBatched>( AT_CUDA_CHECK(cudaGetLastError()); } -template<> +template <> void magmaTriangularSolve( - magma_uplo_t uplo, magma_trans_t trans, magma_diag_t diag, magma_int_t m, magma_int_t n, - double* dA, magma_int_t ldda, double* dB, magma_int_t lddb) { - MagmaStreamSyncGuard guard; - magma_dtrsm(MagmaLeft, uplo, trans, diag, m, n, 1, dA, ldda, dB, lddb); + magma_uplo_t uplo, + magma_trans_t trans, + magma_diag_t diag, + magma_int_t m, + magma_int_t n, + double* dA, + magma_int_t ldda, + double* dB, + magma_int_t lddb, + const MAGMAQueue& magma_queue) { + magma_dtrsm( + MagmaLeft, + uplo, + trans, + diag, + m, + n, + 1, + dA, + ldda, + dB, + lddb, + magma_queue.get_queue()); AT_CUDA_CHECK(cudaGetLastError()); } -template<> +template <> void magmaTriangularSolve( - magma_uplo_t uplo, magma_trans_t trans, magma_diag_t diag, magma_int_t m, magma_int_t n, - float* dA, magma_int_t ldda, float* dB, magma_int_t lddb) { - MagmaStreamSyncGuard guard; - magma_strsm(MagmaLeft, uplo, trans, diag, m, n, 1, dA, ldda, dB, lddb); + magma_uplo_t uplo, + magma_trans_t trans, + magma_diag_t diag, + magma_int_t m, + magma_int_t n, + float* dA, + magma_int_t ldda, + float* dB, + magma_int_t lddb, + const MAGMAQueue& magma_queue) { + magma_strsm( + MagmaLeft, + uplo, + trans, + diag, + m, + n, + 1, + dA, + ldda, + dB, + lddb, + magma_queue.get_queue()); AT_CUDA_CHECK(cudaGetLastError()); } -template<> +template <> void magmaTriangularSolve>( - magma_uplo_t uplo, magma_trans_t trans, magma_diag_t diag, magma_int_t m, magma_int_t n, - c10::complex* dA, magma_int_t ldda, c10::complex* dB, magma_int_t lddb) { - MagmaStreamSyncGuard guard; + magma_uplo_t uplo, + magma_trans_t trans, + magma_diag_t diag, + magma_int_t m, + magma_int_t n, + c10::complex* dA, + magma_int_t ldda, + c10::complex* dB, + magma_int_t lddb, + const MAGMAQueue& magma_queue) { magmaDoubleComplex alpha({1, 0}); - magma_ztrsm(MagmaLeft, 
uplo, trans, diag, m, n, alpha, - reinterpret_cast(dA), ldda, - reinterpret_cast(dB), lddb); + magma_ztrsm( + MagmaLeft, + uplo, + trans, + diag, + m, + n, + alpha, + reinterpret_cast(dA), + ldda, + reinterpret_cast(dB), + lddb, + magma_queue.get_queue()); AT_CUDA_CHECK(cudaGetLastError()); } -template<> +template <> void magmaTriangularSolve>( - magma_uplo_t uplo, magma_trans_t trans, magma_diag_t diag, magma_int_t m, magma_int_t n, - c10::complex* dA, magma_int_t ldda, c10::complex* dB, magma_int_t lddb) { - MagmaStreamSyncGuard guard; + magma_uplo_t uplo, + magma_trans_t trans, + magma_diag_t diag, + magma_int_t m, + magma_int_t n, + c10::complex* dA, + magma_int_t ldda, + c10::complex* dB, + magma_int_t lddb, + const MAGMAQueue& magma_queue) { magmaFloatComplex alpha({1, 0}); - magma_ctrsm(MagmaLeft, uplo, trans, diag, m, n, alpha, - reinterpret_cast(dA), ldda, - reinterpret_cast(dB), lddb); + magma_ctrsm( + MagmaLeft, + uplo, + trans, + diag, + m, + n, + alpha, + reinterpret_cast(dA), + ldda, + reinterpret_cast(dB), + lddb, + magma_queue.get_queue()); AT_CUDA_CHECK(cudaGetLastError()); } @@ -1636,11 +1716,14 @@ AT_ERROR("triangular_solve: MAGMA library not found in " magma_int_t nrhs = magma_int_cast(b.size(-1), "b.size(-1)"); magma_int_t batch_size = magma_int_cast(batchCount(A), "batchCount"); + MAGMAQueue magma_queue(b.get_device()); + // batch_size == 1 implies that: // 1. the RHS and LHS tensors have 2 dimensions, or // 2. the RHS and LHS tensors have more than 2 dimensions but all batch dimensions are 1 if (batch_size == 1) { - magmaTriangularSolve(uplo, trans, diag, n, nrhs, A_data, n, b_data, n); + magmaTriangularSolve( + uplo, trans, diag, n, nrhs, A_data, n, b_data, n, magma_queue); } else { auto A_mat_stride = matrixStride(A); auto b_mat_stride = matrixStride(b); diff --git a/aten/src/ATen/native/cuda/BinaryMiscOpsKernels.cu b/aten/src/ATen/native/cuda/BinaryMiscOpsKernels.cu index 2379877e91ba..bc1884d8d642 100644 --- a/aten/src/ATen/native/cuda/BinaryMiscOpsKernels.cu +++ b/aten/src/ATen/native/cuda/BinaryMiscOpsKernels.cu @@ -32,7 +32,7 @@ void mse_kernel_cuda(TensorIterator& iter) { void xlogy_kernel_cuda(TensorIterator& iter) { AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.common_dtype(), "xlogy_cuda", [&]() { - gpu_kernel(iter, []GPU_LAMBDA(scalar_t x, scalar_t y) -> scalar_t { + gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t x, scalar_t y) -> scalar_t { if (at::_isnan(y)){ return NAN; } diff --git a/aten/src/ATen/native/cuda/IndexKernel.cu b/aten/src/ATen/native/cuda/IndexKernel.cu index cb4aa644fee2..d88f202487af 100644 --- a/aten/src/ATen/native/cuda/IndexKernel.cu +++ b/aten/src/ATen/native/cuda/IndexKernel.cu @@ -190,7 +190,7 @@ static Tensor & masked_select_out_cuda_impl(Tensor & result, const Tensor & self Tensor _mask = (mask.dim() == 0) ? mask.unsqueeze(0) : mask; Tensor _self = (self.dim() == 0) ? 
self.unsqueeze(0) : self; std::tie(_mask, _self) = expand_outplace(_mask, _self); - at::native::index_out(result, _self, _mask); + at::native::index_out(result, _self, c10::List>({_mask})); return result; } diff --git a/aten/src/ATen/native/cuda/Indexing.cu b/aten/src/ATen/native/cuda/Indexing.cu index d630d727019f..2dc04ed4ddef 100644 --- a/aten/src/ATen/native/cuda/Indexing.cu +++ b/aten/src/ATen/native/cuda/Indexing.cu @@ -160,7 +160,7 @@ computeLinearIndex(const Tensor & src, TensorList indices, bool check_range) { } -static std::tuple> makeLinearIndex(Tensor self, TensorList orig, bool check_range) { +static std::tuple> makeLinearIndex(Tensor self, const c10::List>& orig, bool check_range) { checkIndexTensorTypes(orig); // first expand BoolTensor (masks) or ByteTensor (masks) into 1 or more LongTensors auto indices = expandTensors(self, orig); @@ -184,7 +184,7 @@ static std::tuple>& indices, const Tensor & value, bool unsafe) { if (indices.size() > (size_t)self.dim()) { TORCH_CHECK_INDEX(false, "too many indices for tensor of dimension ", self.dim(), " (got ", indices.size(), ")"); } @@ -505,7 +505,7 @@ Tensor& index_add_cuda_(Tensor & self, int64_t dim, const Tensor & index, const if (cuda::detail::canUse32BitIndexMath(self) && cuda::detail::canUse32BitIndexMath(source) && cuda::detail::canUse32BitIndexMath(index)) { - AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "index_add", [&] { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "index_add", [&] { cuda::detail::TensorInfo selfInfo = cuda::detail::getTensorInfo(self_); int selfAddDim = selfInfo.collapseDims(dim); diff --git a/aten/src/ATen/native/cuda/MiscUtils.h b/aten/src/ATen/native/cuda/MiscUtils.h index 31e6d69aa0a1..8f78e8d78003 100644 --- a/aten/src/ATen/native/cuda/MiscUtils.h +++ b/aten/src/ATen/native/cuda/MiscUtils.h @@ -6,8 +6,8 @@ #include // for USE_MAGMA #ifdef USE_MAGMA -#include #include +#include #endif namespace at { diff --git a/aten/src/ATen/native/cuda/ReduceNormKernel.cu b/aten/src/ATen/native/cuda/ReduceNormKernel.cu index 3953f16b69c9..3a24f00f6ebf 100644 --- a/aten/src/ATen/native/cuda/ReduceNormKernel.cu +++ b/aten/src/ATen/native/cuda/ReduceNormKernel.cu @@ -28,7 +28,7 @@ void norm_kernel_cuda_impl(TensorIterator& iter, Scalar val) { } else if (p == static_cast(2)) { gpu_reduce_kernel(iter, NormTwoOps(), 0); } else if (p == static_cast(INFINITY)) { - gpu_reduce_kernel(iter, AbsMaxOps(), std::numeric_limits::min()); + gpu_reduce_kernel(iter, AbsMaxOps(), 0); } else if (p == static_cast(-INFINITY)) { gpu_reduce_kernel(iter, AbsMinOps(), std::numeric_limits::max()); } else { diff --git a/aten/src/ATen/native/cuda/SpectralOps.cu b/aten/src/ATen/native/cuda/SpectralOps.cu index db3e853a9321..e5e91cea4ccc 100644 --- a/aten/src/ATen/native/cuda/SpectralOps.cu +++ b/aten/src/ATen/native/cuda/SpectralOps.cu @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -439,10 +440,10 @@ static Tensor& _exec_fft(Tensor& out, const Tensor& self, IntArrayRef out_sizes, // Calculates the normalization constant and applies it in-place to self // sizes is the sizes of a twosided tensor and dims are all transformed dims -void _fft_apply_normalization(const Tensor& self, int64_t normalization, IntArrayRef sizes, IntArrayRef dims) { +double _fft_normalization_scale(int64_t normalization, IntArrayRef sizes, IntArrayRef dims) { auto norm = static_cast(normalization); if (norm == 
fft_norm_mode::none) { - return; + return 1.0; } int64_t signal_numel = 1; @@ -451,7 +452,17 @@ void _fft_apply_normalization(const Tensor& self, int64_t normalization, IntArra } const double scale_denom = (norm == fft_norm_mode::by_root_n) ? std::sqrt(signal_numel) : static_cast(signal_numel); - self.div_(scale_denom); + return 1.0 / scale_denom; +} + +const Tensor& _fft_apply_normalization(const Tensor& self, int64_t normalization, IntArrayRef sizes, IntArrayRef dims) { + auto scale = _fft_normalization_scale(normalization, sizes, dims); + return (scale == 1.0) ? self : self.mul_(scale); +} + +Tensor& _fft_apply_normalization_out(Tensor& out, const Tensor& self, int64_t normalization, IntArrayRef sizes, IntArrayRef dims) { + auto scale = _fft_normalization_scale(normalization, sizes, dims); + return at::mul_out(out, self, c10::scalar_to_tensor(scale)); } } // namespace (anonymous) @@ -522,6 +533,23 @@ Tensor _fft_r2c_cufft(const Tensor& self, IntArrayRef dim, int64_t normalization return output; } +Tensor& _fft_r2c_cufft_out(Tensor& out, const Tensor& self, IntArrayRef dim, + int64_t normalization, bool onesided) { + auto result = _fft_r2c_cufft(self, dim, static_cast(fft_norm_mode::none), /*onesided=*/true); + if (onesided) { + return _fft_apply_normalization_out(out, result, normalization, self.sizes(), dim); + } + + resize_output(out, self.sizes()); + + auto last_dim = dim.back(); + auto last_dim_halfsize = result.sizes()[last_dim]; + auto out_slice = out.slice(last_dim, 0, last_dim_halfsize); + _fft_apply_normalization_out(out_slice, result, normalization, self.sizes(), dim); + at::native::_fft_fill_with_conjugate_symmetry_(out, dim); + return out; +} + // n-dimensional complex to real IFFT Tensor _fft_c2r_cufft(const Tensor& self, IntArrayRef dim, int64_t normalization, int64_t lastdim) { TORCH_CHECK(self.is_complex()); @@ -544,8 +572,13 @@ Tensor _fft_c2r_cufft(const Tensor& self, IntArrayRef dim, int64_t normalization // TODO: could transform up to 2 other dims in the same cuFFT operation auto output = at::empty(out_sizes, self.options().dtype(c10::toValueType(self.scalar_type()))); _exec_fft(output, temp, out_sizes, dim.back(), /*forward=*/false); - _fft_apply_normalization(output, normalization, out_sizes, dim); - return output; + return _fft_apply_normalization(output, normalization, out_sizes, dim); +} + +Tensor& _fft_c2r_cufft_out(Tensor& out, const Tensor& self, IntArrayRef dim, + int64_t normalization, int64_t lastdim) { + auto result = _fft_c2r_cufft(self, dim, static_cast(fft_norm_mode::none), lastdim); + return _fft_apply_normalization_out(out, result, normalization, result.sizes(), dim); } // n-dimensional complex to complex FFT/IFFT @@ -586,8 +619,13 @@ Tensor _fft_c2c_cufft(const Tensor& self, IntArrayRef dim, int64_t normalization } } - _fft_apply_normalization(output, normalization, out_sizes, dim); - return output; + return _fft_apply_normalization(output, normalization, out_sizes, dim); +} + +Tensor& _fft_c2c_cufft_out(Tensor& out, const Tensor& self, IntArrayRef dim, + int64_t normalization, bool forward) { + auto result = _fft_c2c_cufft(self, dim, static_cast(fft_norm_mode::none), forward); + return _fft_apply_normalization_out(out, result, normalization, result.sizes(), dim); } diff --git a/aten/src/ATen/native/cuda/TensorTransformations.cu b/aten/src/ATen/native/cuda/TensorTransformations.cu index a435c7060f45..9dfa4e8759cf 100644 --- a/aten/src/ATen/native/cuda/TensorTransformations.cu +++ b/aten/src/ATen/native/cuda/TensorTransformations.cu @@ -87,7 +87,7 @@ 
Tensor flip_cuda(const Tensor& self, IntArrayRef dims) { // use kernel_pointwise_flip_apply2 only when to-flip dim is the 1st or last dim, where collapseDims can reduce the amount of work if (flip_dims_size == 1 && in_tensor.is_contiguous() && (flip_dims[0] == 0 || flip_dims[0] == total_dims - 1)) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(at::ScalarType::Half, at::ScalarType::Bool, in_tensor.scalar_type(), "flip_cuda", [&] { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kHalf, kBool, kBFloat16, in_tensor.scalar_type(), "flip_cuda", [&] { auto in_tensor_info = cuda::detail::getTensorInfo(in_tensor); auto out_tensor_info = cuda::detail::getTensorInfo(out_tensor); int flip_dim = in_tensor_info.collapseDims(flip_dims[0]); @@ -123,7 +123,7 @@ Tensor flip_cuda(const Tensor& self, IntArrayRef dims) { } } - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(at::ScalarType::Half, in_tensor.scalar_type(), "flip_cuda", [&] { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kHalf, kBool, kBFloat16, in_tensor.scalar_type(), "flip_cuda", [&] { flip_cuda_kernel<<>>( in_tensor.data_ptr(), out_tensor.data_ptr(), N, flip_dims_t.cuda().data_ptr(), diff --git a/aten/src/ATen/native/cuda/UpSampleNearest1d.cu b/aten/src/ATen/native/cuda/UpSampleNearest1d.cu index 99488108ac26..b269bd303e76 100644 --- a/aten/src/ATen/native/cuda/UpSampleNearest1d.cu +++ b/aten/src/ATen/native/cuda/UpSampleNearest1d.cu @@ -197,19 +197,21 @@ static void upsample_nearest1d_backward_out_cuda_template( } // namespace TORCH_IMPL_FUNC(upsample_nearest1d_out_cuda) ( - Tensor& output, const Tensor& input, IntArrayRef output_size, - c10::optional scales) { + c10::optional scales, + Tensor& output +) { upsample_nearest1d_out_cuda_template(output, input, output_size, scales); } TORCH_IMPL_FUNC(upsample_nearest1d_backward_out_cuda) ( - Tensor& grad_input, const Tensor& grad_output, IntArrayRef output_size, IntArrayRef input_size, - c10::optional scales) { + c10::optional scales, + Tensor& grad_input +) { upsample_nearest1d_backward_out_cuda_template( grad_input, grad_output, output_size, input_size, scales); } diff --git a/aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu b/aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu index 834c000fdb05..8ac7abca1824 100644 --- a/aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu +++ b/aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu @@ -168,43 +168,43 @@ __global__ void upsample_trilinear3d_backward_out_frame( true); fastAtomicAdd( idata, - idx_3d(nc, depth1, width1, height1, t1, h1, w1 + w1p), + idx_3d(nc, depth1, height1, width1, t1, h1, w1 + w1p), i_numel, static_cast(t0lambda * h0lambda * w1lambda * d2val), true); fastAtomicAdd( idata, - idx_3d(nc, depth1, width1, height1, t1, h1 + h1p, w1), + idx_3d(nc, depth1, height1, width1, t1, h1 + h1p, w1), i_numel, static_cast(t0lambda * h1lambda * w0lambda * d2val), true); fastAtomicAdd( idata, - idx_3d(nc, depth1, width1, height1, t1, h1 + h1p, w1 + w1p), + idx_3d(nc, depth1, height1, width1, t1, h1 + h1p, w1 + w1p), i_numel, static_cast(t0lambda * h1lambda * w1lambda * d2val), true); fastAtomicAdd( idata, - idx_3d(nc, depth1, width1, height1, t1 + t1p, h1, w1), + idx_3d(nc, depth1, height1, width1, t1 + t1p, h1, w1), i_numel, static_cast(t1lambda * h0lambda * w0lambda * d2val), true); fastAtomicAdd( idata, - idx_3d(nc, depth1, width1, height1, t1 + t1p, h1, w1 + w1p), + idx_3d(nc, depth1, height1, width1, t1 + t1p, h1, w1 + w1p), i_numel, static_cast(t1lambda * h0lambda * w1lambda * d2val), true); fastAtomicAdd( idata, - idx_3d(nc, depth1, width1, height1, t1 + t1p, h1 + h1p, w1), + 
idx_3d(nc, depth1, height1, width1, t1 + t1p, h1 + h1p, w1), i_numel, static_cast(t1lambda * h1lambda * w0lambda * d2val), true); fastAtomicAdd( idata, - idx_3d(nc, depth1, width1, height1, t1 + t1p, h1 + h1p, w1 + w1p), + idx_3d(nc, depth1, height1, width1, t1 + t1p, h1 + h1p, w1 + w1p), i_numel, static_cast(t1lambda * h1lambda * w1lambda * d2val), true); diff --git a/aten/src/ATen/native/mkl/SpectralOps.cpp b/aten/src/ATen/native/mkl/SpectralOps.cpp index 8fca9ad9ecdf..d5a39e45941b 100644 --- a/aten/src/ATen/native/mkl/SpectralOps.cpp +++ b/aten/src/ATen/native/mkl/SpectralOps.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include @@ -21,6 +22,21 @@ Tensor _fft_c2c_mkl(const Tensor& self, IntArrayRef dim, int64_t normalization, AT_ERROR("fft: ATen not compiled with MKL support"); } +Tensor& _fft_r2c_mkl_out(Tensor& out, const Tensor& self, IntArrayRef dim, int64_t normalization, + bool onesided) { + AT_ERROR("fft: ATen not compiled with MKL support"); +} + +Tensor& _fft_c2r_mkl_out(Tensor& out, const Tensor& self, IntArrayRef dim, int64_t normalization, + int64_t last_dim_size) { + AT_ERROR("fft: ATen not compiled with MKL support"); +} + +Tensor& _fft_c2c_mkl_out(Tensor& out, const Tensor& self, IntArrayRef dim, int64_t normalization, + bool forward) { + AT_ERROR("fft: ATen not compiled with MKL support"); +} + }} #else // AT_MKL_ENABLED @@ -381,6 +397,13 @@ Tensor _fft_c2r_mkl(const Tensor& self, IntArrayRef dim, int64_t normalization, return _exec_fft(out, input, out_sizes, dim, normalization, /*forward=*/false); } +Tensor& _fft_c2r_mkl_out(Tensor& out, const Tensor& self, IntArrayRef dim, int64_t normalization, + int64_t last_dim_size) { + auto result = _fft_c2r_mkl(self, dim, normalization, last_dim_size); + resize_output(out, result.sizes()); + return out.copy_(result); +} + // n-dimensional real to complex FFT Tensor _fft_r2c_mkl(const Tensor& self, IntArrayRef dim, int64_t normalization, bool onesided) { TORCH_CHECK(self.is_floating_point()); @@ -402,6 +425,24 @@ Tensor _fft_r2c_mkl(const Tensor& self, IntArrayRef dim, int64_t normalization, return out; } +Tensor& _fft_r2c_mkl_out(Tensor& out, const Tensor& self, IntArrayRef dim, int64_t normalization, + bool onesided) { + auto result = _fft_r2c_mkl(self, dim, normalization, /*onesided=*/true); + if (onesided) { + resize_output(out, result.sizes()); + return out.copy_(result); + } + + resize_output(out, self.sizes()); + + auto last_dim = dim.back(); + auto last_dim_halfsize = result.sizes()[last_dim]; + auto out_slice = out.slice(last_dim, 0, last_dim_halfsize); + out_slice.copy_(result); + at::native::_fft_fill_with_conjugate_symmetry_(out, dim); + return out; +} + // n-dimensional complex to complex FFT/IFFT Tensor _fft_c2c_mkl(const Tensor& self, IntArrayRef dim, int64_t normalization, bool forward) { TORCH_CHECK(self.is_complex()); @@ -410,6 +451,13 @@ Tensor _fft_c2c_mkl(const Tensor& self, IntArrayRef dim, int64_t normalization, return _exec_fft(out, self, self.sizes(), sorted_dims, normalization, forward); } +Tensor& _fft_c2c_mkl_out(Tensor& out, const Tensor& self, IntArrayRef dim, int64_t normalization, + bool forward) { + auto result = _fft_c2c_mkl(self, dim, normalization, forward); + resize_output(out, result.sizes()); + return out.copy_(result); +} + }} // namespace at::native #endif diff --git a/aten/src/ATen/native/mkldnn/BinaryOps.cpp b/aten/src/ATen/native/mkldnn/BinaryOps.cpp index 029b1d225d14..3358079f4df5 100644 --- a/aten/src/ATen/native/mkldnn/BinaryOps.cpp +++ 
b/aten/src/ATen/native/mkldnn/BinaryOps.cpp @@ -8,10 +8,11 @@ namespace at { namespace native { Tensor& mkldnn_add_out( - Tensor& result, const Tensor& self, const Tensor& other, - Scalar alpha) { + Scalar alpha, + Tensor& result + ) { TORCH_CHECK(false, "mkldnn_add_out: ATen not compiled with MKLDNN support"); } @@ -46,10 +47,11 @@ namespace at { namespace native { Tensor& mkldnn_add_out( - Tensor& result, const Tensor& self, const Tensor& other, - Scalar alpha) { + Scalar alpha, + Tensor& result + ) { ideep::tensor& x = itensor_from_mkldnn(self); ideep::tensor& y = itensor_from_mkldnn(other); @@ -73,7 +75,7 @@ Tensor mkldnn_add(const Tensor& self, const Tensor& other, Scalar alpha) { } Tensor& mkldnn_add_(Tensor& self, const Tensor& other, Scalar alpha) { - return native::mkldnn_add_out(self, self, other, alpha); + return native::mkldnn_add_out(self, other, alpha, self); } Tensor& mkldnn_mul_out(Tensor& result, const Tensor& self, const Tensor& other) { diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index a5b945399da8..215ca70bfbae 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -412,7 +412,7 @@ MkldnnCPU: mkldnn_add_ - func: add.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) - use_c10_dispatcher: hacky_wrapper_for_legacy_signatures + use_c10_dispatcher: full structured: True structured_inherits: TensorIteratorBase dispatch: @@ -2197,6 +2197,13 @@ CPU: _fft_r2c_mkl CUDA: _fft_r2c_cufft +- func: _fft_r2c.out(Tensor self, int[] dim, int normalization, bool onesided, *, Tensor(a!) out) -> Tensor(a!) + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures + variants: function + dispatch: + CPU: _fft_r2c_mkl_out + CUDA: _fft_r2c_cufft_out + # Complex to real inverse FFT - func: _fft_c2r(Tensor self, int[] dim, int normalization, int last_dim_size) -> Tensor use_c10_dispatcher: full @@ -2205,6 +2212,13 @@ CPU: _fft_c2r_mkl CUDA: _fft_c2r_cufft +- func: _fft_c2r.out(Tensor self, int[] dim, int normalization, int last_dim_size, *, Tensor(a!) out) -> Tensor(a!) + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures + variants: function + dispatch: + CPU: _fft_c2r_mkl_out + CUDA: _fft_c2r_cufft_out + # Standard complex to complex FFT (forward or backward) - func: _fft_c2c(Tensor self, int[] dim, int normalization, bool forward) -> Tensor use_c10_dispatcher: full @@ -2213,6 +2227,13 @@ CPU: _fft_c2c_mkl CUDA: _fft_c2c_cufft +- func: _fft_c2c.out(Tensor self, int[] dim, int normalization, bool forward, *, Tensor(a!) out) -> Tensor(a!) + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures + variants: function + dispatch: + CPU: _fft_c2c_mkl_out + CUDA: _fft_c2c_cufft_out + - func: _cufft_get_plan_cache_size(int device_index) -> int use_c10_dispatcher: full @@ -2226,6 +2247,7 @@ use_c10_dispatcher: full - func: index.Tensor(Tensor self, Tensor?[] indices) -> Tensor + use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: index @@ -2254,6 +2276,7 @@ variants: function, method - func: index_put_(Tensor(a!) self, Tensor?[] indices, Tensor values, bool accumulate=False) -> Tensor(a!) 
+ use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: index_put_ @@ -2264,9 +2287,11 @@ # - Tensor & Tensor::index_put_(std::initializer_list indices, Scalar v) - func: index_put(Tensor self, Tensor?[] indices, Tensor values, bool accumulate=False) -> Tensor + use_c10_dispatcher: full variants: function, method - func: _index_put_impl_(Tensor(a!) self, Tensor?[] indices, Tensor values, bool accumulate=False, bool unsafe=False) -> Tensor(a!) + use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: _index_put_impl_ @@ -9435,7 +9460,7 @@ CUDA: upsample_trilinear3d_backward_cuda - func: upsample_nearest1d.out(Tensor self, int[1] output_size, float? scales=None, *, Tensor(a!) out) -> Tensor(a!) - use_c10_dispatcher: hacky_wrapper_for_legacy_signatures + use_c10_dispatcher: full python_module: nn structured: True dispatch: @@ -9448,7 +9473,7 @@ structured_delegate: upsample_nearest1d.out - func: upsample_nearest1d_backward.grad_input(Tensor grad_output, int[1] output_size, int[3] input_size, float? scales=None, *, Tensor(a!) grad_input) -> Tensor(a!) - use_c10_dispatcher: hacky_wrapper_for_legacy_signatures + use_c10_dispatcher: full python_module: nn structured: True dispatch: @@ -9885,81 +9910,161 @@ python_module: fft variants: function +- func: fft_fft.out(Tensor self, int? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures + python_module: fft + variants: function + - func: fft_ifft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor use_c10_dispatcher: full python_module: fft variants: function +- func: fft_ifft.out(Tensor self, int? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures + python_module: fft + variants: function + - func: fft_rfft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor use_c10_dispatcher: full python_module: fft variants: function +- func: fft_rfft.out(Tensor self, int? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures + python_module: fft + variants: function + - func: fft_irfft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor use_c10_dispatcher: full python_module: fft variants: function +- func: fft_irfft.out(Tensor self, int? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures + python_module: fft + variants: function + - func: fft_hfft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor use_c10_dispatcher: full python_module: fft variants: function +- func: fft_hfft.out(Tensor self, int? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures + python_module: fft + variants: function + - func: fft_ihfft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor use_c10_dispatcher: full python_module: fft variants: function +- func: fft_ihfft.out(Tensor self, int? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures + python_module: fft + variants: function + - func: fft_fft2(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor use_c10_dispatcher: full python_module: fft variants: function +- func: fft_fft2.out(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!) 
+ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures + python_module: fft + variants: function + - func: fft_ifft2(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor use_c10_dispatcher: full python_module: fft variants: function +- func: fft_ifft2.out(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures + python_module: fft + variants: function + - func: fft_rfft2(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor use_c10_dispatcher: full python_module: fft variants: function +- func: fft_rfft2.out(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures + python_module: fft + variants: function + - func: fft_irfft2(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor use_c10_dispatcher: full python_module: fft variants: function +- func: fft_irfft2.out(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures + python_module: fft + variants: function + - func: fft_fftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor use_c10_dispatcher: full python_module: fft variants: function +- func: fft_fftn.out(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures + python_module: fft + variants: function + - func: fft_ifftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor use_c10_dispatcher: full python_module: fft variants: function +- func: fft_ifftn.out(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures + python_module: fft + variants: function + - func: fft_rfftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor use_c10_dispatcher: full python_module: fft variants: function +- func: fft_rfftn.out(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures + python_module: fft + variants: function + - func: fft_irfftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor use_c10_dispatcher: full python_module: fft variants: function +- func: fft_irfftn.out(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures + python_module: fft + variants: function + - func: fft_fftfreq(int n, float d=1.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor use_c10_dispatcher: hacky_wrapper_for_legacy_signatures python_module: fft variants: function +- func: fft_fftfreq.out(int n, float d=1.0, *, Tensor(a!) out) -> Tensor(a!) + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures + python_module: fft + variants: function + - func: fft_rfftfreq(int n, float d=1.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor use_c10_dispatcher: hacky_wrapper_for_legacy_signatures python_module: fft variants: function +- func: fft_rfftfreq.out(int n, float d=1.0, *, Tensor(a!) out) -> Tensor(a!) 
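fft_fftfreq and fft_rfftfreq also gain out= overloads above. For reference, fftfreq(n, d) returns the DFT sample frequencies in cycles per unit of the sample spacing d; a small sanity check (illustrative values only):

```python
import torch

n, d = 5, 0.1
freqs = torch.fft.fftfreq(n, d)
# For n = 5 the frequency bins are [0, 1, 2, -2, -1] scaled by 1 / (n * d).
expected = torch.tensor([0., 1., 2., -2., -1.]) / (n * d)
print(torch.allclose(freqs, expected))   # True
```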
+ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures + python_module: fft + variants: function + - func: fft_fftshift(Tensor self, int[1]? dim=None) -> Tensor use_c10_dispatcher: full python_module: fft @@ -10225,3 +10330,14 @@ - func: _test_string_default(Tensor dummy, str a="\"'\\", str b='"\'\\') -> Tensor use_c10_dispatcher: full python_module: nn + +# Note: this function is only for testing. +- func: _test_ambiguous_defaults.a(Tensor dummy, int a=1, int b=1) -> Tensor + use_c10_dispatcher: full + python_module: nn + +# Note: this function is only for testing. +- func: _test_ambiguous_defaults.b(Tensor dummy, int a=2, str b="2") -> Tensor + cpp_no_default_args: ['a', 'b'] + use_c10_dispatcher: full + python_module: nn diff --git a/aten/src/ATen/native/quantized/cpu/qconv.cpp b/aten/src/ATen/native/quantized/cpu/qconv.cpp index b7d893ad55fc..05762bfb036f 100644 --- a/aten/src/ATen/native/quantized/cpu/qconv.cpp +++ b/aten/src/ATen/native/quantized/cpu/qconv.cpp @@ -746,7 +746,7 @@ at::Tensor PackedConvWeightsQnnp::apply_impl( run_status == pytorch_qnnp_status_success, "failed to run quantized::conv2d (qnnpack) operator"); - return output.contiguous(act.suggest_memory_format()); + return output; } template at::Tensor PackedConvWeightsQnnp<2>::apply( diff --git a/aten/src/ATen/native/sparse/SparseTensor.cpp b/aten/src/ATen/native/sparse/SparseTensor.cpp index d621efafee41..fb7e16539c15 100644 --- a/aten/src/ATen/native/sparse/SparseTensor.cpp +++ b/aten/src/ATen/native/sparse/SparseTensor.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include @@ -14,7 +15,6 @@ namespace at { namespace native { using namespace at::sparse; - /****************************************************************************** * access methods ******************************************************************************/ @@ -328,7 +328,7 @@ SparseTensor dense_to_sparse(const Tensor& self, int64_t sparse_dim){ Tensor values; if (self.dim() > 0) { - std::vector ix = indices.chunk(indices.size(0), 0); + auto ix = toListOfOptionalTensors(indices.chunk(indices.size(0), 0)); values = self.index(ix).squeeze(0).clone(at::MemoryFormat::Preserve); } else { AT_ASSERT(nz.sizes().equals({0, 1})); diff --git a/aten/src/ATen/native/sparse/SparseTensorMath.cpp b/aten/src/ATen/native/sparse/SparseTensorMath.cpp index 60df74061c7a..9bb679beb3d0 100644 --- a/aten/src/ATen/native/sparse/SparseTensorMath.cpp +++ b/aten/src/ATen/native/sparse/SparseTensorMath.cpp @@ -544,7 +544,7 @@ SparseTensor& add_out_sparse_non_contiguous(SparseTensor& r, const SparseTensor& Tensor& add_out_dense_sparse_cpu(Tensor& r, const Tensor& dense, const SparseTensor& sparse_, Scalar value); -SparseTensor& add_out_sparse_cpu(SparseTensor& r, const SparseTensor& t, const SparseTensor& src, Scalar value) { +SparseTensor& add_out_sparse_cpu(const SparseTensor& t, const SparseTensor& src, Scalar value, SparseTensor& r) { if (!t.is_sparse()) { return add_out_dense_sparse_cpu(r, t, src, value); } diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu index 753ea9fa4937..c8366f71618e 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu @@ -399,7 +399,7 @@ Tensor& add_out_dense_sparse_cuda(Tensor& r_, const Tensor& dense, const SparseT Tensor& add_out_dense_sparse_cuda(Tensor& r, const Tensor& dense, const SparseTensor& sparse_, Scalar value); -SparseTensor& add_out_sparse_cuda(SparseTensor& r_, 
const SparseTensor& t, const SparseTensor& src, Scalar value) { +SparseTensor& add_out_sparse_cuda(const SparseTensor& t, const SparseTensor& src, Scalar value, SparseTensor& r_) { if (!t.is_sparse()) { return add_out_dense_sparse_cuda(r_, t, src, value); } diff --git a/aten/src/ATen/native/vulkan/glsl/KO4C4HW_to_image.glsl b/aten/src/ATen/native/vulkan/glsl/KO4C4HW_to_image.glsl index 58394dca19da..2c02e034603e 100644 --- a/aten/src/ATen/native/vulkan/glsl/KO4C4HW_to_image.glsl +++ b/aten/src/ATen/native/vulkan/glsl/KO4C4HW_to_image.glsl @@ -1,7 +1,6 @@ #version 450 core #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; layout(set = 0, rgba16f, binding = 0) writeonly PRECISION uniform image3D uOutput; layout(set = 0, binding = 1) readonly buffer kernel { vec4 data[]; diff --git a/aten/src/ATen/native/vulkan/glsl/adaptive_avg_pool2d.glsl b/aten/src/ATen/native/vulkan/glsl/adaptive_avg_pool2d.glsl index d5b9af843dbe..75243a69bca3 100644 --- a/aten/src/ATen/native/vulkan/glsl/adaptive_avg_pool2d.glsl +++ b/aten/src/ATen/native/vulkan/glsl/adaptive_avg_pool2d.glsl @@ -2,7 +2,6 @@ #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ diff --git a/aten/src/ATen/native/vulkan/glsl/add.glsl b/aten/src/ATen/native/vulkan/glsl/add.glsl index 8dcff0476edf..361927373a49 100644 --- a/aten/src/ATen/native/vulkan/glsl/add.glsl +++ b/aten/src/ATen/native/vulkan/glsl/add.glsl @@ -2,7 +2,6 @@ #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ diff --git a/aten/src/ATen/native/vulkan/glsl/add_.glsl b/aten/src/ATen/native/vulkan/glsl/add_.glsl index ed82d0cbe87b..d6360a376c58 100644 --- a/aten/src/ATen/native/vulkan/glsl/add_.glsl +++ b/aten/src/ATen/native/vulkan/glsl/add_.glsl @@ -2,7 +2,6 @@ #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ diff --git a/aten/src/ATen/native/vulkan/glsl/add_scalar.glsl b/aten/src/ATen/native/vulkan/glsl/add_scalar.glsl index 8882ba0d8ff2..735086a8150a 100644 --- a/aten/src/ATen/native/vulkan/glsl/add_scalar.glsl +++ b/aten/src/ATen/native/vulkan/glsl/add_scalar.glsl @@ -2,7 +2,6 @@ #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ diff --git a/aten/src/ATen/native/vulkan/glsl/add_scalar_.glsl b/aten/src/ATen/native/vulkan/glsl/add_scalar_.glsl index bffd680669fb..a418a28bb5c3 100644 --- a/aten/src/ATen/native/vulkan/glsl/add_scalar_.glsl +++ b/aten/src/ATen/native/vulkan/glsl/add_scalar_.glsl @@ -2,7 +2,6 @@ #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ diff --git a/aten/src/ATen/native/vulkan/glsl/addmm.glsl b/aten/src/ATen/native/vulkan/glsl/addmm.glsl index 61f76fa8cf5d..a8f09252a167 100644 --- a/aten/src/ATen/native/vulkan/glsl/addmm.glsl +++ b/aten/src/ATen/native/vulkan/glsl/addmm.glsl @@ -2,7 +2,6 @@ #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ diff --git a/aten/src/ATen/native/vulkan/glsl/avg_pool2d.glsl b/aten/src/ATen/native/vulkan/glsl/avg_pool2d.glsl index df2bbcf18014..5de8cf13225f 100644 --- a/aten/src/ATen/native/vulkan/glsl/avg_pool2d.glsl +++ b/aten/src/ATen/native/vulkan/glsl/avg_pool2d.glsl @@ -2,7 +2,6 @@ #define PRECISION $precision 
layout(std430) buffer; -layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ diff --git a/aten/src/ATen/native/vulkan/glsl/clamp.glsl b/aten/src/ATen/native/vulkan/glsl/clamp.glsl index c394dfd26627..52c2d2d96c26 100644 --- a/aten/src/ATen/native/vulkan/glsl/clamp.glsl +++ b/aten/src/ATen/native/vulkan/glsl/clamp.glsl @@ -2,7 +2,6 @@ #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ diff --git a/aten/src/ATen/native/vulkan/glsl/clamp_.glsl b/aten/src/ATen/native/vulkan/glsl/clamp_.glsl index b16258685114..3f138bb93ec6 100644 --- a/aten/src/ATen/native/vulkan/glsl/clamp_.glsl +++ b/aten/src/ATen/native/vulkan/glsl/clamp_.glsl @@ -2,7 +2,6 @@ #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ diff --git a/aten/src/ATen/native/vulkan/glsl/conv2d.glsl b/aten/src/ATen/native/vulkan/glsl/conv2d.glsl index 9646eb8c9f19..bb2508aefe65 100644 --- a/aten/src/ATen/native/vulkan/glsl/conv2d.glsl +++ b/aten/src/ATen/native/vulkan/glsl/conv2d.glsl @@ -2,7 +2,6 @@ #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ diff --git a/aten/src/ATen/native/vulkan/glsl/conv2d_dw.glsl b/aten/src/ATen/native/vulkan/glsl/conv2d_dw.glsl index fe50262f7d46..0f49515718b2 100644 --- a/aten/src/ATen/native/vulkan/glsl/conv2d_dw.glsl +++ b/aten/src/ATen/native/vulkan/glsl/conv2d_dw.glsl @@ -2,7 +2,6 @@ #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ diff --git a/aten/src/ATen/native/vulkan/glsl/conv2d_dw_clamp.glsl b/aten/src/ATen/native/vulkan/glsl/conv2d_dw_clamp.glsl index 37a5898b9f10..5155c07669c1 100644 --- a/aten/src/ATen/native/vulkan/glsl/conv2d_dw_clamp.glsl +++ b/aten/src/ATen/native/vulkan/glsl/conv2d_dw_clamp.glsl @@ -1,7 +1,6 @@ #version 450 core #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; layout(set = 0, rgba16f, binding = 0) writeonly PRECISION uniform image3D uOutput; layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput; layout(set = 0, binding = 2) uniform PRECISION sampler3D uKernel; diff --git a/aten/src/ATen/native/vulkan/glsl/conv2d_nogroup_clamp.glsl b/aten/src/ATen/native/vulkan/glsl/conv2d_nogroup_clamp.glsl index b73c58e0f54d..89411284fed4 100644 --- a/aten/src/ATen/native/vulkan/glsl/conv2d_nogroup_clamp.glsl +++ b/aten/src/ATen/native/vulkan/glsl/conv2d_nogroup_clamp.glsl @@ -1,7 +1,6 @@ #version 450 core #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; layout(set = 0, rgba16f, binding = 0) writeonly PRECISION uniform image3D uOutput; layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput; layout(set = 0, binding = 2) uniform PRECISION sampler3D uKernel; diff --git a/aten/src/ATen/native/vulkan/glsl/conv2d_nogroup_clamp_1x.glsl b/aten/src/ATen/native/vulkan/glsl/conv2d_nogroup_clamp_1x.glsl index 5cef89c2727f..8baae9b5fcd5 100644 --- a/aten/src/ATen/native/vulkan/glsl/conv2d_nogroup_clamp_1x.glsl +++ b/aten/src/ATen/native/vulkan/glsl/conv2d_nogroup_clamp_1x.glsl @@ -1,7 +1,6 @@ #version 450 core #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; layout(set = 0, rgba32f, binding = 0) writeonly PRECISION uniform image3D uOutput; layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput; layout(set = 0, binding = 2) uniform PRECISION 
sampler3D uKernel; diff --git a/aten/src/ATen/native/vulkan/glsl/conv2d_pw.glsl b/aten/src/ATen/native/vulkan/glsl/conv2d_pw.glsl index 48d9f785008b..1355b2c09b05 100644 --- a/aten/src/ATen/native/vulkan/glsl/conv2d_pw.glsl +++ b/aten/src/ATen/native/vulkan/glsl/conv2d_pw.glsl @@ -2,7 +2,6 @@ #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ diff --git a/aten/src/ATen/native/vulkan/glsl/image_to_nchw.glsl b/aten/src/ATen/native/vulkan/glsl/image_to_nchw.glsl index d19c370ec9bd..01d653bf06de 100644 --- a/aten/src/ATen/native/vulkan/glsl/image_to_nchw.glsl +++ b/aten/src/ATen/native/vulkan/glsl/image_to_nchw.glsl @@ -2,7 +2,6 @@ #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ diff --git a/aten/src/ATen/native/vulkan/glsl/max_pool2d.glsl b/aten/src/ATen/native/vulkan/glsl/max_pool2d.glsl index 948b797a5207..88373605d010 100644 --- a/aten/src/ATen/native/vulkan/glsl/max_pool2d.glsl +++ b/aten/src/ATen/native/vulkan/glsl/max_pool2d.glsl @@ -1,7 +1,6 @@ #version 450 core #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; layout(set = 0, rgba16f, binding = 0) writeonly PRECISION uniform image3D uOutput; layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput; layout(set = 0, binding = 2) uniform constBlock { diff --git a/aten/src/ATen/native/vulkan/glsl/mean.glsl b/aten/src/ATen/native/vulkan/glsl/mean.glsl index 130d716ca9e6..551fd747f103 100644 --- a/aten/src/ATen/native/vulkan/glsl/mean.glsl +++ b/aten/src/ATen/native/vulkan/glsl/mean.glsl @@ -2,7 +2,6 @@ #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ diff --git a/aten/src/ATen/native/vulkan/glsl/mean2d.glsl b/aten/src/ATen/native/vulkan/glsl/mean2d.glsl index 266226aa708b..b8d0add329f2 100644 --- a/aten/src/ATen/native/vulkan/glsl/mean2d.glsl +++ b/aten/src/ATen/native/vulkan/glsl/mean2d.glsl @@ -2,7 +2,6 @@ #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ diff --git a/aten/src/ATen/native/vulkan/glsl/mm.glsl b/aten/src/ATen/native/vulkan/glsl/mm.glsl index 00ab5f31e6db..157acfe9c074 100644 --- a/aten/src/ATen/native/vulkan/glsl/mm.glsl +++ b/aten/src/ATen/native/vulkan/glsl/mm.glsl @@ -2,7 +2,6 @@ #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ diff --git a/aten/src/ATen/native/vulkan/glsl/mul_scalar.glsl b/aten/src/ATen/native/vulkan/glsl/mul_scalar.glsl index d3a98ba30bea..c0ae48fe3883 100644 --- a/aten/src/ATen/native/vulkan/glsl/mul_scalar.glsl +++ b/aten/src/ATen/native/vulkan/glsl/mul_scalar.glsl @@ -2,7 +2,6 @@ #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ diff --git a/aten/src/ATen/native/vulkan/glsl/mul_scalar_.glsl b/aten/src/ATen/native/vulkan/glsl/mul_scalar_.glsl index b49252e128cc..f959052879ad 100644 --- a/aten/src/ATen/native/vulkan/glsl/mul_scalar_.glsl +++ b/aten/src/ATen/native/vulkan/glsl/mul_scalar_.glsl @@ -2,7 +2,6 @@ #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ diff --git a/aten/src/ATen/native/vulkan/glsl/nchw_to_image.glsl b/aten/src/ATen/native/vulkan/glsl/nchw_to_image.glsl index 
fb87b5a36918..adbafcbd0438 100644 --- a/aten/src/ATen/native/vulkan/glsl/nchw_to_image.glsl +++ b/aten/src/ATen/native/vulkan/glsl/nchw_to_image.glsl @@ -2,7 +2,6 @@ #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ diff --git a/aten/src/ATen/native/vulkan/glsl/permute.glsl b/aten/src/ATen/native/vulkan/glsl/permute.glsl index af8e33588f78..3d1191ff6eea 100644 --- a/aten/src/ATen/native/vulkan/glsl/permute.glsl +++ b/aten/src/ATen/native/vulkan/glsl/permute.glsl @@ -1,6 +1,5 @@ #version 450 core layout(std430) buffer; -layout(std430) uniform; layout(set = 0, binding = 0) writeonly buffer outputBuffer { float data[]; } diff --git a/aten/src/ATen/native/vulkan/glsl/upsample_nearest2d.glsl b/aten/src/ATen/native/vulkan/glsl/upsample_nearest2d.glsl index efb1c5c7fc9a..b4db9b87dacb 100644 --- a/aten/src/ATen/native/vulkan/glsl/upsample_nearest2d.glsl +++ b/aten/src/ATen/native/vulkan/glsl/upsample_nearest2d.glsl @@ -2,7 +2,6 @@ #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ diff --git a/aten/src/ATen/native/xnnpack/RegisterOpContextClass.cpp b/aten/src/ATen/native/xnnpack/RegisterOpContextClass.cpp index e8442a64d0ad..da13fb9574d5 100644 --- a/aten/src/ATen/native/xnnpack/RegisterOpContextClass.cpp +++ b/aten/src/ATen/native/xnnpack/RegisterOpContextClass.cpp @@ -73,21 +73,21 @@ TORCH_LIBRARY(xnnpack, m) { } TORCH_LIBRARY(prepacked, m) { - m.def("linear_clamp_prepack(Tensor W, Tensor? B=None, Scalar? output_min=None, Scalar? output_max=None) -> __torch__.torch.classes.xnnpack.LinearOpContext"); - m.def("linear_clamp_run(Tensor X, __torch__.torch.classes.xnnpack.LinearOpContext W_prepack) -> Tensor Y"); - m.def("conv2d_clamp_prepack(Tensor W, Tensor? B, int[2] stride, int[2] padding, int[2] dilation, int groups, Scalar? output_min=None, Scalar? output_max=None) -> __torch__.torch.classes.xnnpack.Conv2dOpContext"); - m.def("conv2d_transpose_clamp_prepack(Tensor W, Tensor? B, int[2] stride, int[2] padding, int[2] output_padding, int[2] dilation, int groups, Scalar? output_min=None, Scalar? output_max=None) -> __torch__.torch.classes.xnnpack.TransposeConv2dOpContext"); - m.def("conv2d_clamp_run(Tensor X, __torch__.torch.classes.xnnpack.Conv2dOpContext W_prepack) -> Tensor Y"); - m.def("conv2d_transpose_clamp_run(Tensor X, __torch__.torch.classes.xnnpack.TransposeConv2dOpContext W_prepack) -> Tensor Y"); + m.def(TORCH_SELECTIVE_SCHEMA("prepacked::linear_clamp_prepack(Tensor W, Tensor? B=None, Scalar? output_min=None, Scalar? output_max=None) -> __torch__.torch.classes.xnnpack.LinearOpContext")); + m.def(TORCH_SELECTIVE_SCHEMA("prepacked::linear_clamp_run(Tensor X, __torch__.torch.classes.xnnpack.LinearOpContext W_prepack) -> Tensor Y")); + m.def(TORCH_SELECTIVE_SCHEMA("prepacked::conv2d_clamp_prepack(Tensor W, Tensor? B, int[2] stride, int[2] padding, int[2] dilation, int groups, Scalar? output_min=None, Scalar? output_max=None) -> __torch__.torch.classes.xnnpack.Conv2dOpContext")); + m.def(TORCH_SELECTIVE_SCHEMA("prepacked::conv2d_transpose_clamp_prepack(Tensor W, Tensor? B, int[2] stride, int[2] padding, int[2] output_padding, int[2] dilation, int groups, Scalar? output_min=None, Scalar? 
output_max=None) -> __torch__.torch.classes.xnnpack.TransposeConv2dOpContext")); + m.def(TORCH_SELECTIVE_SCHEMA("prepacked::conv2d_clamp_run(Tensor X, __torch__.torch.classes.xnnpack.Conv2dOpContext W_prepack) -> Tensor Y")); + m.def(TORCH_SELECTIVE_SCHEMA("prepacked::conv2d_transpose_clamp_run(Tensor X, __torch__.torch.classes.xnnpack.TransposeConv2dOpContext W_prepack) -> Tensor Y")); } TORCH_LIBRARY_IMPL(prepacked, CPU, m) { - m.impl("linear_clamp_prepack", TORCH_FN(createLinearClampPrePackOpContext)); - m.impl("linear_clamp_run", TORCH_FN(internal::linear::linear_clamp_run)); - m.impl("conv2d_clamp_prepack", TORCH_FN(createConv2dClampPrePackOpContext)); - m.impl("conv2d_transpose_clamp_prepack", TORCH_FN(createConv2dTransposeClampPrePackOpContext)); - m.impl("conv2d_clamp_run", TORCH_FN(internal::convolution2d::conv2d_clamp_run)); - m.impl("conv2d_transpose_clamp_run", TORCH_FN(internal::convolution2d::conv2d_transpose_clamp_run)); + m.impl(TORCH_SELECTIVE_NAME("prepacked::linear_clamp_prepack"), TORCH_FN(createLinearClampPrePackOpContext)); + m.impl(TORCH_SELECTIVE_NAME("prepacked::linear_clamp_run"), TORCH_FN(internal::linear::linear_clamp_run)); + m.impl(TORCH_SELECTIVE_NAME("prepacked::conv2d_clamp_prepack"), TORCH_FN(createConv2dClampPrePackOpContext)); + m.impl(TORCH_SELECTIVE_NAME("prepacked::conv2d_transpose_clamp_prepack"), TORCH_FN(createConv2dTransposeClampPrePackOpContext)); + m.impl(TORCH_SELECTIVE_NAME("prepacked::conv2d_clamp_run"), TORCH_FN(internal::convolution2d::conv2d_clamp_run)); + m.impl(TORCH_SELECTIVE_NAME("prepacked::conv2d_transpose_clamp_run"), TORCH_FN(internal::convolution2d::conv2d_transpose_clamp_run)); } } // namespace xnnpack diff --git a/aten/src/ATen/templates/TensorBody.h b/aten/src/ATen/templates/TensorBody.h index d42c8c23fe9c..0dfef701c51b 100644 --- a/aten/src/ATen/templates/TensorBody.h +++ b/aten/src/ATen/templates/TensorBody.h @@ -28,6 +28,7 @@ class Tensor; } namespace c10{ struct TensorOptions; +template class List; } namespace at { struct Generator; @@ -207,10 +208,6 @@ class TORCH_API Tensor { Tensor& operator=(const Tensor&) &&; Tensor& operator=(Tensor&&) &&; - #ifdef _MSC_VER - #pragma warning( pop ) - #endif - bool is_same(const Tensor& other) const noexcept { return impl_ == other.impl_; } @@ -760,6 +757,12 @@ class TORCH_API Tensor { c10::intrusive_ptr impl_; }; +// For "multiple ... 
operators specified" warnings, closing brace of class +// declaration must be included between pragma push & pop +#ifdef _MSC_VER +#pragma warning( pop ) +#endif + int64_t get_device(Tensor self); template diff --git a/aten/src/ATen/test/scalar_test.cpp b/aten/src/ATen/test/scalar_test.cpp index 68c0b4f3f71a..3b7bfb47fe62 100644 --- a/aten/src/ATen/test/scalar_test.cpp +++ b/aten/src/ATen/test/scalar_test.cpp @@ -138,3 +138,23 @@ TEST(TestScalar, TestConj) { ASSERT_EQ(float_scalar.conj().toDouble(), 3.0); ASSERT_EQ(complex_scalar.conj().toComplexDouble(), c10::complex(2.3, -3.5)); } + +TEST(TestScalar, TestEqual) { + ASSERT_FALSE(Scalar(1.0).equal(false)); + ASSERT_FALSE(Scalar(1.0).equal(true)); + ASSERT_FALSE(Scalar(true).equal(1.0)); + ASSERT_TRUE(Scalar(true).equal(true)); + + ASSERT_TRUE(Scalar(c10::complex{2.0, 5.0}).equal(c10::complex{2.0, 5.0})); + ASSERT_TRUE(Scalar(c10::complex{2.0, 0}).equal(2.0)); + ASSERT_TRUE(Scalar(c10::complex{2.0, 0}).equal(2)); + + ASSERT_TRUE(Scalar(2.0).equal(c10::complex{2.0, 0.0})); + ASSERT_FALSE(Scalar(2.0).equal(c10::complex{2.0, 4.0})); + ASSERT_FALSE(Scalar(2.0).equal(3.0)); + ASSERT_TRUE(Scalar(2.0).equal(2)); + + ASSERT_TRUE(Scalar(2).equal(c10::complex{2.0, 0})); + ASSERT_TRUE(Scalar(2).equal(2)); + ASSERT_TRUE(Scalar(2).equal(2.0)); +} diff --git a/aten/src/THC/THCTensorMathMagma.cu b/aten/src/THC/THCTensorMathMagma.cu index ce6ca38afd2b..36316a6bf2eb 100644 --- a/aten/src/THC/THCTensorMathMagma.cu +++ b/aten/src/THC/THCTensorMathMagma.cu @@ -8,7 +8,7 @@ #include #ifdef USE_MAGMA -#include +#include #endif #ifndef DIVUP diff --git a/aten/src/THC/THCTensorMathMagma.cuh b/aten/src/THC/THCTensorMathMagma.cuh index 5ceac465c317..1fb5821afce5 100644 --- a/aten/src/THC/THCTensorMathMagma.cuh +++ b/aten/src/THC/THCTensorMathMagma.cuh @@ -2,7 +2,7 @@ #define THC_TENSOR_MATH_MAGMA_CUH #ifdef USE_MAGMA -#include +#include #endif #ifdef USE_MAGMA diff --git a/benchmarks/functional_autograd_benchmark/ppl_models.py b/benchmarks/functional_autograd_benchmark/ppl_models.py index 906ebac5d41b..94ba6698a91d 100644 --- a/benchmarks/functional_autograd_benchmark/ppl_models.py +++ b/benchmarks/functional_autograd_benchmark/ppl_models.py @@ -24,8 +24,9 @@ def forward(beta_value: Tensor) -> Tensor: mu = X.mm(beta_value) # We need to compute the first and second gradient of this score with respect - # to beta_value. - score = dist.Bernoulli(logits=mu).log_prob(Y).sum() + beta_prior.log_prob(beta_value).sum() + # to beta_value. We disable Bernoulli validation because Y is a relaxed value. 
+ score = (dist.Bernoulli(logits=mu, validate_args=False).log_prob(Y).sum() + + beta_prior.log_prob(beta_value).sum()) return score return forward, (beta_value.to(device),) @@ -40,7 +41,7 @@ def get_robust_regression(device: torch.device) -> GetterReturnType: Y = torch.rand(N, 1, device=device) # Predefined nu_alpha and nu_beta, nu_alpha.shape: (1, 1), nu_beta.shape: (1, 1) - nu_alpha = torch.randn(1, 1, device=device) + nu_alpha = torch.rand(1, 1, device=device) nu_beta = torch.rand(1, 1, device=device) nu = dist.Gamma(nu_alpha, nu_beta) diff --git a/c10/CMakeLists.txt b/c10/CMakeLists.txt index 48bceb440954..b175e5bdd6ce 100644 --- a/c10/CMakeLists.txt +++ b/c10/CMakeLists.txt @@ -23,7 +23,7 @@ configure_file( ${CMAKE_BINARY_DIR}/c10/macros/cmake_macros.h) # Note: if you want to add ANY dependency to the c10 library, make sure you -# check with the core PyTorch developers as the dependendency will be +# check with the core PyTorch developers as the dependency will be # transitively passed on to all libraries dependent on PyTorch. file(GLOB C10_SRCS *.cpp diff --git a/c10/core/DispatchKeySet.h b/c10/core/DispatchKeySet.h index 486272ece92e..58d456b950ed 100644 --- a/c10/core/DispatchKeySet.h +++ b/c10/core/DispatchKeySet.h @@ -124,7 +124,7 @@ class DispatchKeySet final { public: // STL iterator for DispatchKeySet. Iterates through all DispatchKeys in the // set. The iterator is only invalidated by the destruction of the underlying - // DispatchKeySet as the iterator stores a pointer to the raw represenation of + // DispatchKeySet as the iterator stores a pointer to the raw representation of // the DispatchKeySet. class iterator { public: @@ -235,7 +235,7 @@ C10_API DispatchKeySet getRuntimeDispatchKeySet(DispatchKey t); C10_API DispatchKeySet getBackendKeySetFromAutograd(DispatchKey t); // This API exists because we have a use case for checking -// getRuntimeDispatchKeySet(alias).has(DispatchKey::Undefind) +// getRuntimeDispatchKeySet(alias).has(DispatchKey::Undefined) // in OperatorEntry.cpp but we disallow it in has() API. C10_API bool isIncludedInAlias(DispatchKey k, DispatchKey alias); diff --git a/c10/core/MemoryFormat.h b/c10/core/MemoryFormat.h index e25814cd0717..6528f6c8f110 100644 --- a/c10/core/MemoryFormat.h +++ b/c10/core/MemoryFormat.h @@ -98,7 +98,7 @@ inline std::vector get_channels_last_strides_3d(IntArrayRef sizes) { // 1. Please do not combine these helper functions, each helper function handles // exactly one case of sizes + memory_format, by doing this, the strides indices // will be a constant array and we can access it using constant index number, -// the complier will fully unroll the loop on strides indices to gain a better +// the compiler will fully unroll the loop on strides indices to gain a better // performance. // 2. No error check in helper function, caller ensures the correctness of the input // 3. All helper functions have similar comments, only 1st helper function is commented here. @@ -205,7 +205,7 @@ inline bool is_channels_last_strides_3d_s5(const IntArrayRef sizes, const IntArr // a. we identify corner cases where the implementation compromises on. // // By the time accumulated permutation is enabled to replace implicit -// memory_foramt through strides, we should be updating our tests and fix the +// memory_format through strides, we should be updating our tests and fix the // issues in our tests. // // We use Channels Last 2d as an example above. 
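The MemoryFormat.h comments above concern helpers that derive channels-last strides from sizes. As a quick illustration of the 2d pattern they encode (a minimal sketch, not part of the patch; the concrete sizes are arbitrary):

```python
import torch

def channels_last_2d_strides(n, c, h, w):
    # Contiguous NHWC storage: element (n, c, h, w) lives at ((n*H + h)*W + w)*C + c,
    # so the per-dimension strides, reported in (N, C, H, W) order, are:
    return (h * w * c, 1, w * c, c)

x = torch.empty(2, 3, 4, 5).contiguous(memory_format=torch.channels_last)
assert x.stride() == channels_last_2d_strides(2, 3, 4, 5)  # (60, 1, 15, 3)
```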
diff --git a/c10/core/Scalar.cpp b/c10/core/Scalar.cpp index 35aa5d60f001..203b544924ec 100644 --- a/c10/core/Scalar.cpp +++ b/c10/core/Scalar.cpp @@ -3,7 +3,7 @@ namespace c10 { Scalar Scalar::operator-() const { - TORCH_CHECK(!isBoolean(), "torch boolean negative, the `-` operator, is not suppported."); + TORCH_CHECK(!isBoolean(), "torch boolean negative, the `-` operator, is not supported."); if (isFloatingPoint()) { return Scalar(-v.d); } else if (isComplex()) { @@ -21,4 +21,14 @@ Scalar Scalar::conj() const { } } +Scalar Scalar::log() const { + if (isComplex()) { + return std::log(v.z); + } else if (isFloatingPoint()) { + return std::log(v.d); + } else { + return std::log(v.i); + } +} + } // namespace c10 diff --git a/c10/core/Scalar.h b/c10/core/Scalar.h index 6151f6d2b150..368228e8202e 100644 --- a/c10/core/Scalar.h +++ b/c10/core/Scalar.h @@ -88,6 +88,45 @@ class C10_API Scalar { Scalar operator-() const; Scalar conj() const; + Scalar log() const; + + template::value, int>::type = 0> + bool equal(T num) const { + if (isComplex()) { + auto val = v.z; + return (val.real() == num) && (val.imag() == T()); + } else if (isFloatingPoint()) { + return v.d == num; + } else if (isIntegral(/*includeBool=*/false)) { + return v.i == num; + } else { + // boolean scalar does not equal to a non boolean value + return false; + } + } + + template::value, int>::type = 0> + bool equal(T num) const { + if (isComplex()) { + return v.z == num; + } else if (isFloatingPoint()) { + return (v.d == num.real()) && (num.imag() == T()); + } else if (isIntegral(/*includeBool=*/false)) { + return (v.i == num.real()) && (num.imag() == T()); + } else { + // boolean scalar does not equal to a non boolean value + return false; + } + } + + bool equal(bool num) const { + if (isBoolean()) { + return static_cast(v.i) == num; + } else { + return false; + } + } + ScalarType type() const { if (isComplex()) { return ScalarType::ComplexDouble; diff --git a/c10/core/Stream.cpp b/c10/core/Stream.cpp index 9a5c838c73fe..1a56c9d68567 100644 --- a/c10/core/Stream.cpp +++ b/c10/core/Stream.cpp @@ -2,7 +2,7 @@ namespace c10 { -// Not very parseable, but I don't know a good compact syntax for streams. +// Not very parsable, but I don't know a good compact syntax for streams. // Feel free to change this into something more compact if needed. std::ostream& operator<<(std::ostream& stream, const Stream& s) { stream << "stream " << s.id() << " on device " << s.device(); diff --git a/c10/core/TensorImpl.h b/c10/core/TensorImpl.h index 3326404e1d07..e7f9c1260263 100644 --- a/c10/core/TensorImpl.h +++ b/c10/core/TensorImpl.h @@ -19,7 +19,7 @@ #include // A global boolean variable to control whether we free memory when a Tensor -// is shrinked to a smaller size. As a result, a Tensor is always going to +// is shrunk to a smaller size. As a result, a Tensor is always going to // keep the memory allocated for its maximum capacity reshaped to so far. // // This parameter is respected "upper-case" methods which call Resize() @@ -625,7 +625,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * The API is as follows: * - "new_grad" is a Tensor containing the new value of the gradient that should * be set - * - "self" should reprensent the Tensor whose forward grad is accessed. It is + * - "self" should represent the Tensor whose forward grad is accessed. It is * required when dealing with view. * - "level" allows to specify the level of forward AD nesting for which the * gradient should be set. 
Note that since levels are not fully supported @@ -1381,7 +1381,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { // error in attempt to invoke TypeMeta::ctor() static_assert( std::is_default_constructible::value, - "Tensor can't hold non-default-constructable types"); + "Tensor can't hold non-default-constructible types"); return static_cast(raw_mutable_data(caffe2::TypeMeta::Make())); } diff --git a/c10/core/impl/DeviceGuardImplInterface.h b/c10/core/impl/DeviceGuardImplInterface.h index 2ef02b57d3be..258f8953f4de 100644 --- a/c10/core/impl/DeviceGuardImplInterface.h +++ b/c10/core/impl/DeviceGuardImplInterface.h @@ -126,7 +126,7 @@ struct C10_API DeviceGuardImplInterface { /** * Increments the event's version and enqueues a job with this version * in the stream's work queue. When the stream process that job - * it nofifies all streams waiting on / blocked by that version of the + * it notifies all streams waiting on / blocked by that version of the * event to continue and marks that version as recorded. * */ virtual void record( diff --git a/c10/cuda/CMakeLists.txt b/c10/cuda/CMakeLists.txt index c8fa53df6f02..256fc54b08a1 100644 --- a/c10/cuda/CMakeLists.txt +++ b/c10/cuda/CMakeLists.txt @@ -13,7 +13,7 @@ configure_file( ${CMAKE_BINARY_DIR}/c10/cuda/impl/cuda_cmake_macros.h) # Note: if you want to add ANY dependency to the c10 library, make sure you -# check with the core PyTorch developers as the dependendency will be +# check with the core PyTorch developers as the dependency will be # transitively passed on to all libraries dependent on PyTorch. # Note: if you add a new source file/header, you will need to update diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp index 0b5d2992538c..493296248e5b 100644 --- a/c10/cuda/CUDACachingAllocator.cpp +++ b/c10/cuda/CUDACachingAllocator.cpp @@ -62,7 +62,7 @@ constexpr size_t kSmallSize = 1048576; // largest "small" allocation is 1 M constexpr size_t kSmallBuffer = 2097152; // "small" allocations are packed in 2 MiB blocks constexpr size_t kLargeBuffer = 20971520; // "large" allocations may be packed in 20 MiB blocks constexpr size_t kMinLargeAlloc = 10485760; // allocations between 1 and 10 MiB may use kLargeBuffer -constexpr size_t kRoundLarge = 2097152; // round up large allocs to 2 MiB +constexpr size_t kRoundLarge = 2097152; // round up large allocations to 2 MiB typedef std::bitset(StatType::NUM_TYPES)> StatTypes; diff --git a/c10/cuda/CUDAStream.cpp b/c10/cuda/CUDAStream.cpp index 457331f4a00d..d1e290c3f02c 100644 --- a/c10/cuda/CUDAStream.cpp +++ b/c10/cuda/CUDAStream.cpp @@ -60,7 +60,7 @@ static LeakyStreamInternals default_streams[C10_COMPILE_TIME_MAX_GPUS]; // in the pool to be returned when a stream is requested (round-robin fashion // , see the note in CUDAStream.h). // -// unique_ptr is used instead of vector because T might be non-moveable +// unique_ptr is used instead of vector because T might be non-movable // and non-copyable. static std::once_flag device_flags[C10_COMPILE_TIME_MAX_GPUS]; static std::atomic low_priority_counters[C10_COMPILE_TIME_MAX_GPUS]; diff --git a/c10/cuda/CUDAStream.h b/c10/cuda/CUDAStream.h index 41802b3bc9ef..05eddf5ce122 100644 --- a/c10/cuda/CUDAStream.h +++ b/c10/cuda/CUDAStream.h @@ -152,7 +152,7 @@ class C10_CUDA_API CUDAStream { static std::tuple priority_range() { // Note: this returns the range of priority **supported by PyTorch**, not // the range of priority **supported by CUDA**. The former is a subset of - // the latter.
Curently PyTorch only supports 0 and -1, which are "low" and + // the latter. Currently PyTorch only supports 0 and -1, which are "low" and // "high" priority. int least_priority, greatest_priority; C10_CUDA_CHECK( diff --git a/c10/macros/Macros.h b/c10/macros/Macros.h index 46ff50621417..5499a7d8b81c 100644 --- a/c10/macros/Macros.h +++ b/c10/macros/Macros.h @@ -316,7 +316,7 @@ __host__ __device__ #define C10_MOBILE 1 #endif // ANDROID / IOS -// Portably determine if a type T is trivially copyable or not. +// Portable determination of whether type T is trivially copyable. // Warning: __has_trivial_copy for GCC may not always detect the non-POD // correctly. For example, T = std::unique_ptr may evaluate to true and be // treated as POD. This can cause unexpected behavior. diff --git a/c10/mobile/CPUCachingAllocator.cpp b/c10/mobile/CPUCachingAllocator.cpp index bde4067d45dc..0114856ca89b 100644 --- a/c10/mobile/CPUCachingAllocator.cpp +++ b/c10/mobile/CPUCachingAllocator.cpp @@ -61,7 +61,7 @@ void CPUCachingAllocator::record_free(void* ptr) { // is being freed outside the scope of this allocator. // At the moment only way to capture this is to have the allocator, // that uses this CachingAllocator as the backing allocator, - // call this function explicity upon freeing memory while + // call this function explicitly upon freeing memory while // outside the scope of caching allocator. // If the memory is freed in some other way, then we will likely // have undefined behavior or page fault. But this can be diff --git a/c10/mobile/CPUCachingAllocator.h b/c10/mobile/CPUCachingAllocator.h index 2f11e6ea8669..c80fee0682eb 100644 --- a/c10/mobile/CPUCachingAllocator.h +++ b/c10/mobile/CPUCachingAllocator.h @@ -26,7 +26,7 @@ * What are the cons? * There are some cons that were observed where use of caching allocator led to * worse performance on some platforms. Reason being that the caching mechanism - * used by this allocator left us worse off compared to the corresonding platform's + * used by this allocator left us worse off compared to the corresponding platform's * tuned memory allocator. In that case it seemed better to not use this allocator. * Note there are some ideas to fix this in the works. * @@ -63,7 +63,7 @@ class C10_API CPUCachingAllocator { // returned the memory to OS via free_cached. // 1.1. Therefore even when the said memory is "freed" via this // allocator (and thus cached), it will continue to stay - // in allocaiton_map_. Furthermore it will also exist in + // in allocation_map_. Furthermore it will also exist in // available_map_. Thus an allocated memory pointer can be in both // allocation_map_ and available_map_ simultaneously. // 2. Memory pointer maybe removed from allocation_map_, when it diff --git a/c10/mobile/CPUProfilingAllocator.cpp b/c10/mobile/CPUProfilingAllocator.cpp index 5f2b28b4b2d0..0118d0a29587 100644 --- a/c10/mobile/CPUProfilingAllocator.cpp +++ b/c10/mobile/CPUProfilingAllocator.cpp @@ -133,7 +133,7 @@ std::vector formulate_greedy_allocation_plan( ska::flat_hash_map::iterator> free_end_offset_to_size_iter; // Upon free end_ptr = offset + size // If end_ptr exists merge freed allocation - // Also find coresponding offset in size_to_offet + // Also find corresponding offset in size_to_offset // Remove that entry and update with new size and offset // If end_ptr does not exist then just insert offset,size // in map and correspondingly size, offset in the other map. 
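The free-block merging that the CPUProfilingAllocator comments describe (on free, coalesce the freed range with the free blocks touching its start and end offsets, then re-insert it) can be summarized with a simplified sketch; it uses plain dicts keyed by start and end offset instead of the size/offset iterator maps the C++ maintains, so the names here are illustrative only:

```python
def coalesce_free_block(freed_offset, freed_size, free_by_start, free_by_end):
    """Merge a freed [offset, offset + size) range with adjacent free blocks.

    free_by_start / free_by_end map a free block's start / end offset to its size.
    """
    end_offset = freed_offset + freed_size
    # 1. A free block starting exactly at our end boundary: absorb it.
    if end_offset in free_by_start:
        right_size = free_by_start.pop(end_offset)
        del free_by_end[end_offset + right_size]
        freed_size += right_size
    # 2. A free block ending exactly at our start boundary: extend leftwards.
    if freed_offset in free_by_end:
        left_size = free_by_end.pop(freed_offset)
        del free_by_start[freed_offset - left_size]
        freed_offset -= left_size
        freed_size += left_size
    # 3. Insert the merged block into both maps.
    free_by_start[freed_offset] = freed_size
    free_by_end[freed_offset + freed_size] = freed_size
```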
@@ -176,7 +176,7 @@ std::vector formulate_greedy_allocation_plan( } allocation_offsets[mem_event.allocation_id] = alloc_offset; } else { - // 1. Check if freed block is adjancent to an existing free block + // 1. Check if freed block is adjacent to an existing free block // at its end boundary. This is done by checking // free_end_offset_to_size_iter. // If we find such a block, remove it and adjust size of @@ -186,7 +186,7 @@ std::vector formulate_greedy_allocation_plan( // free_start_offset_to_size_iter. // If we find such a block, remove it and adjust size of // the block being freed. - // 3. Inser the freed block in map. + // 3. Insert the freed block in map. auto freed_offset = allocation_offsets[mem_event.allocation_id]; auto freed_size = mem_event.size; auto end_offset = freed_offset + freed_size; @@ -223,7 +223,7 @@ std::vector formulate_greedy_allocation_plan( } } TORCH_CHECK(validate_allocation_plan(mem_events, allocation_offsets), - "ProfilingAllocator: Allocation plan invaild."); + "ProfilingAllocator: Allocation plan invalid."); return allocation_offsets; } @@ -394,7 +394,7 @@ CPUProfilingAllocator::~CPUProfilingAllocator() { WithProfileAllocationsGuard::WithProfileAllocationsGuard( AllocationPlan* plan) { - // Nesting of allocation profiling does not seem meanigful. + // Nesting of allocation profiling does not seem meaningful. TORCH_CHECK(allocation_planner == nullptr, "Nesting profiling allocations is not supported."); planner_ = std::make_unique(plan); @@ -409,7 +409,7 @@ WithProfileAllocationsGuard::~WithProfileAllocationsGuard() { WithValidateAllocationPlanGuard::WithValidateAllocationPlanGuard( AllocationPlan* plan, bool* success) { - // Nesting of allocation profiling does not seem meanigful. + // Nesting of allocation profiling does not seem meaningful. TORCH_CHECK(allocation_planner == nullptr, "Nesting profiling allocations is not supported."); planner_ = std::make_unique(plan, true); diff --git a/c10/test/util/bfloat16_test.cpp b/c10/test/util/bfloat16_test.cpp index d08f512053ab..af00bab99c5b 100644 --- a/c10/test/util/bfloat16_test.cpp +++ b/c10/test/util/bfloat16_test.cpp @@ -87,7 +87,7 @@ namespace { } TEST(BFloat16Math, Addition) { - // This test verifies that if only first 7 bits of float's mantisa are + // This test verifies that if only first 7 bits of float's mantissa are // changed after addition, we should have no loss in precision. // input bits @@ -108,8 +108,8 @@ namespace { EXPECT_EQ(res, expected); } - TEST(BFloat16Math, Substraction) { - // This test verifies that if only first 7 bits of float's mantisa are + TEST(BFloat16Math, Subtraction) { + // This test verifies that if only first 7 bits of float's mantissa are // changed after subtraction, we should have no loss in precision. 
// input bits diff --git a/c10/test/util/intrusive_ptr_test.cpp b/c10/test/util/intrusive_ptr_test.cpp index 2ea283d1a4f0..9df5b004a094 100644 --- a/c10/test/util/intrusive_ptr_test.cpp +++ b/c10/test/util/intrusive_ptr_test.cpp @@ -694,21 +694,21 @@ TEST(IntrusivePtrTest, Equality_Nullptr) { EXPECT_FALSE(var1 != var2); } -TEST(IntrusivePtrTest, Nonequality) { +TEST(IntrusivePtrTest, Inequality) { intrusive_ptr var1 = make_intrusive(); intrusive_ptr var2 = make_intrusive(); EXPECT_TRUE(var1 != var2); EXPECT_FALSE(var1 == var2); } -TEST(IntrusivePtrTest, Nonequality_NullptrLeft) { +TEST(IntrusivePtrTest, Inequality_NullptrLeft) { intrusive_ptr var1; intrusive_ptr var2 = make_intrusive(); EXPECT_TRUE(var1 != var2); EXPECT_FALSE(var1 == var2); } -TEST(IntrusivePtrTest, Nonequality_NullptrRight) { +TEST(IntrusivePtrTest, Inequality_NullptrRight) { intrusive_ptr var1 = make_intrusive(); intrusive_ptr var2; EXPECT_TRUE(var1 != var2); @@ -2487,28 +2487,28 @@ TEST(WeakIntrusivePtrTest, Equality_Invalid) { EXPECT_FALSE(var1 != var2); } -TEST(WeakIntrusivePtrTest, Nonequality) { +TEST(WeakIntrusivePtrTest, Inequality) { IntrusiveAndWeak var1 = make_intrusive(); IntrusiveAndWeak var2 = make_intrusive(); EXPECT_TRUE(var1.weak != var2.weak); EXPECT_FALSE(var1.weak == var2.weak); } -TEST(WeakIntrusivePtrTest, Nonequality_InvalidLeft) { +TEST(WeakIntrusivePtrTest, Inequality_InvalidLeft) { weak_intrusive_ptr var1 = make_invalid_weak(); IntrusiveAndWeak var2 = make_intrusive(); EXPECT_TRUE(var1 != var2.weak); EXPECT_FALSE(var1 == var2.weak); } -TEST(WeakIntrusivePtrTest, Nonequality_InvalidRight) { +TEST(WeakIntrusivePtrTest, Inequality_InvalidRight) { IntrusiveAndWeak var1 = make_intrusive(); weak_intrusive_ptr var2 = make_invalid_weak(); EXPECT_TRUE(var1.weak != var2); EXPECT_FALSE(var1.weak == var2); } -TEST(WeakIntrusivePtrTest, Nonequality_WeakOnly) { +TEST(WeakIntrusivePtrTest, Inequality_WeakOnly) { weak_intrusive_ptr var1 = make_weak_only(); weak_intrusive_ptr var2 = make_weak_only(); EXPECT_TRUE(var1 != var2); diff --git a/c10/util/Bitset.h b/c10/util/Bitset.h index e849563e60fe..964146be05e7 100644 --- a/c10/util/Bitset.h +++ b/c10/util/Bitset.h @@ -64,7 +64,7 @@ struct bitset final { bitset cur = *this; size_t index = cur.find_first_set(); while (0 != index) { - // -1 because find_first_set() is not one-indiced. + // -1 because find_first_set() is not one-indexed. index -= 1; func(index); cur.unset(index); @@ -73,7 +73,7 @@ struct bitset final { } private: - // Return the index of the first set bit. The returned index is one-indiced + // Return the index of the first set bit. The returned index is one-indexed // (i.e. if the very first bit is set, this function returns '1'), and a return // of '0' means that there was no bit set. size_t find_first_set() const { diff --git a/c10/util/Flags.h b/c10/util/Flags.h index 6bfe62507fcd..b4352510c997 100644 --- a/c10/util/Flags.h +++ b/c10/util/Flags.h @@ -4,7 +4,7 @@ /* Commandline flags support for C10. * * This is a portable commandline flags tool for c10, so we can optionally - * choose to use gflags or a lightweighted custom implementation if gflags is + * choose to use gflags or a lightweight custom implementation if gflags is * not possible on a certain platform. If you have gflags installed, set the * macro C10_USE_GFLAGS will seamlessly route everything to gflags. 
* diff --git a/c10/util/Logging.h b/c10/util/Logging.h index acab3cfecd23..6fa7e93f26d8 100644 --- a/c10/util/Logging.h +++ b/c10/util/Logging.h @@ -284,7 +284,7 @@ BINARY_COMP_HELPER(LessEquals, <=) * Very lightweight logging for the first time API usage. It's beneficial for * tracking of individual functionality usage in larger applications. * - * In order to ensure light-weightness of logging, we utilize static variable + * In order to ensure light-weightedness of logging, we utilize static variable * trick - LogAPIUsage will be invoked only once and further invocations will * just do an atomic check. * diff --git a/c10/util/SmallVector.h b/c10/util/SmallVector.h index 076a1d401065..9b32d8edfe7f 100644 --- a/c10/util/SmallVector.h +++ b/c10/util/SmallVector.h @@ -832,7 +832,7 @@ SmallVectorImpl& SmallVectorImpl::operator=( // If we have to grow to have enough elements, destroy the current elements. // This allows us to avoid copying them during the grow. - // FIXME: don't do this if they're efficiently moveable. + // FIXME: don't do this if they're efficiently movable. if (this->capacity() < RHSSize) { // Destroy current elements. this->destroy_range(this->begin(), this->end()); diff --git a/c10/util/TypeCast.h b/c10/util/TypeCast.h index df15509d7e0f..85513ecc5e2f 100644 --- a/c10/util/TypeCast.h +++ b/c10/util/TypeCast.h @@ -44,7 +44,7 @@ struct static_cast_with_inter_type { // Note: Converting from negative float values to unsigned integer types is // undefined behavior in C++, and current CPU and GPU compilers exhibit // divergent behavior. Casting from negative float values to signed -// integer types and then to unsigned integer types is not undefiend, +// integer types and then to unsigned integer types is not undefined, // however, so this cast improves the consistency of type conversions // to uint8 across compilers. // Further note: Type conversions across compilers still have other undefined diff --git a/c10/util/complex.h b/c10/util/complex.h index 2578da2957ab..d4d5525170af 100644 --- a/c10/util/complex.h +++ b/c10/util/complex.h @@ -61,7 +61,7 @@ namespace c10 { // Since we only support float and double, on will use `complex& operator=(T x)` // - Copy assignment operator and converting assignment operator // - There is no specialization of converting assignment operators, which type is -// convertible is soly depend on whether the scalar type is convertable +// convertible is solely dependent on whether the scalar type is convertible // // In addition to the standard assignment, we also provide assignment operators with std and thrust // diff --git a/c10/util/intrusive_ptr.h b/c10/util/intrusive_ptr.h index 761dd27d6d46..637db95991f2 100644 --- a/c10/util/intrusive_ptr.h +++ b/c10/util/intrusive_ptr.h @@ -700,7 +700,7 @@ class weak_intrusive_ptr final { /** * Takes an owning (but must be weakly referenced) pointer to TTarget* and * creates a weak_intrusive_ptr that takes over ownership. - * Thas means the weakcount is not increased. + * This means that the weakcount is not increased. * This is the counter-part to weak_intrusive_ptr::release() and the pointer * passed in *must* have been created using weak_intrusive_ptr::release(). 
*/ diff --git a/c10/util/typeid.cpp b/c10/util/typeid.cpp index f3fe048b4cca..79c093cbeb31 100644 --- a/c10/util/typeid.cpp +++ b/c10/util/typeid.cpp @@ -60,7 +60,7 @@ CAFFE_KNOWN_TYPE(bool*) CAFFE_KNOWN_TYPE(char*) CAFFE_KNOWN_TYPE(int*) -// For some of the compilers, long is definied separately from int32_t and +// For some of the compilers, long is defined separately from int32_t and // int64_t. As a result we will need to actually define them separately. // It is recommended that one does NOT use long - use int32_t and int64_t // explicitly. Explicit long type annotation may go away in the future. diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 4fcf86be55e2..191a7ca26835 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -479,6 +479,7 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) # This one needs to be unconditionally added as Functions.cpp is also unconditionally added list(APPEND TORCH_SRCS ${TORCH_SRC_DIR}/csrc/autograd/FunctionsManual.cpp + ${TORCH_SRC_DIR}/csrc/utils/out_types.cpp ) if(NOT INTERN_DISABLE_AUTOGRAD) diff --git a/caffe2/contrib/aten/aten_op.cc b/caffe2/contrib/aten/aten_op.cc index 9e7479141ad4..dba68d21c2dd 100644 --- a/caffe2/contrib/aten/aten_op.cc +++ b/caffe2/contrib/aten/aten_op.cc @@ -6,13 +6,17 @@ namespace caffe2 { namespace internal { at::Tensor index_with_uint8_handling( const at::Tensor& self, - at::TensorList indices) { + const torch::List>& indices) { // Support BC only for the simplest case of mask indexing - if (indices.size() == 1 && indices[0].scalar_type() == at::kByte) { - TORCH_WARN( - "Indexing with uint8 mask tensor in ATenOp is now deprecated," - " please use a bool mask instead."); - return at::index(self, {indices[0].to(at::kBool)}); + if (indices.size() == 1) { + c10::optional first = indices[0]; + if (first.has_value() + && first->scalar_type() == at::kByte) { + TORCH_WARN( + "Indexing with uint8 mask tensor in ATenOp is now deprecated," + " please use a bool mask instead."); + return at::index(self, {first->to(at::kBool)}); + } } return at::index(self, indices); } diff --git a/caffe2/contrib/aten/aten_op_template.h b/caffe2/contrib/aten/aten_op_template.h index f3a42dbd8f59..cd1ce7651b48 100644 --- a/caffe2/contrib/aten/aten_op_template.h +++ b/caffe2/contrib/aten/aten_op_template.h @@ -21,7 +21,7 @@ using at::Half; // for AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, ...) namespace internal { TORCH_API at::Tensor index_with_uint8_handling( const at::Tensor& self, - at::TensorList indices); + const torch::List>& indices); } template @@ -86,6 +86,16 @@ class ATenOp : public Operator { std::vector peekSlice(size_t i, size_t len, size_t N) { std::vector results; + results.reserve(len); + for (size_t ii = i; ii < i + len; ++ii) { + results.push_back(peek(ii, N)); + } + return results; + } + + torch::List> peekSliceOptionals(size_t i, size_t len, size_t N) { + torch::List> results; + results.reserve(len); for (size_t ii = i; ii < i + len; ++ii) { results.push_back(peek(ii, N)); } diff --git a/caffe2/contrib/aten/gen_op.py b/caffe2/contrib/aten/gen_op.py index 2a822058bfdf..769f9d59c856 100755 --- a/caffe2/contrib/aten/gen_op.py +++ b/caffe2/contrib/aten/gen_op.py @@ -68,7 +68,7 @@ def value_has_tensors(v): def value_is_tensor_type(v): - return value_has_tensors(v) and v['dynamic_type'] != 'TensorList' + return value_has_tensors(v) and v['dynamic_type'] not in ['TensorList', 'const c10::List> &'] # for each aten type, how do we handle a return value of that type? 
@@ -208,7 +208,7 @@ def self_as_first_argument(arguments): def get_num_inputs(o): args = 0 for a in o['arguments']: - if a['type'] == 'TensorList': + if a['type'] in ['TensorList', 'const c10::List> &']: return '*' elif value_has_tensors(a): args += 1 @@ -236,11 +236,11 @@ def emit_assignments(o, env): decls = yaml.load(read(os.path.join(args.yaml_dir, 'Declarations.yaml')), Loader=Loader) factory_methods = find_factory_methods(decls) filtered = [expanded for o in decls for expanded in expand(o) if supports(expanded, factory_methods)] - top_env = { + top_env: Dict[str, List] = { 'mappings': [], 'implementations': [], 'cases': [], - } # type: Dict[str, List] + } seen: Set[str] = set() key = 0 for o in filtered: @@ -277,10 +277,10 @@ def emit_assignments(o, env): # e.g. "Float" is at::kFloat assert('Type' in o['method_of']) - static_tensor_inputs = sum(arg['type'] != 'TensorList' and value_is_tensor_type(arg) for arg in o['arguments']) - has_tensorlist = any(arg['type'] == 'TensorList' for arg in o['arguments']) + static_tensor_inputs = sum(arg['type'] not in ['TensorList', 'const c10::List> &'] and value_is_tensor_type(arg) for arg in o['arguments']) + has_tensorlist = any(arg['type'] in ['TensorList', 'const c10::List> &'] for arg in o['arguments']) if has_tensorlist: - tensorlist_idx = [i for i, arg in enumerate(o['arguments']) if arg['type'] == 'TensorList'][0] + tensorlist_idx = [i for i, arg in enumerate(o['arguments']) if arg['type'] in ['TensorList', 'const c10::List> &']][0] real_inputs = 0 for i, arg in enumerate(o['arguments']): @@ -290,10 +290,16 @@ def emit_assignments(o, env): view_length = 'InputSize()' if has_tensorlist and i < tensorlist_idx else static_tensor_inputs if arg['type'] == 'TensorList': # NOTE: do not advance real_inputs here. After this we will - # switch to indexing the "stack" from the end as if we only had + # switch to indexing the "stack" from the end env['statements'].append( 'auto {} = peekSlice({}, InputSize() - {}, InputSize());' .format(arg['name'], real_inputs, static_tensor_inputs)) + elif arg['type'] == 'const c10::List> &': + # NOTE: do not advance real_inputs here. 
After this we will + # switch to indexing the "stack" from the end + env['statements'].append( + 'auto {} = peekSliceOptionals({}, InputSize() - {}, InputSize());' + .format(arg['name'], real_inputs, static_tensor_inputs)) elif value_is_tensor_type(arg): # load tensor inputs from Caffe2 env['statements'].append( diff --git a/caffe2/contrib/fakelowp/test/test_chunking.py b/caffe2/contrib/fakelowp/test/test_chunking.py new file mode 100644 index 000000000000..306b5c3b3f02 --- /dev/null +++ b/caffe2/contrib/fakelowp/test/test_chunking.py @@ -0,0 +1,142 @@ +# Must happen before importing caffe2.python.* +import caffe2.python.fakelowp.init_shared_libs # noqa +import datetime +import numpy as np +from hypothesis import given, settings, example +from hypothesis import strategies as st +from caffe2.python import core, workspace +from caffe2.python.onnx.onnxifi import onnxifi_caffe2_net +from caffe2.python.fakelowp.test_utils import print_test_debug_info +import caffe2.python.serialized_test.serialized_test_util as serial + +# Test that parallel chunks behave the same way as the serial one + +workspace.GlobalInit( + [ + "caffe2", + "--glow_global_fp16=1", + "--glow_global_fused_scale_offset_fp16=1", + "--glow_global_force_sls_fp16_accum=1", + "--glow_nnpi_num_parallel_chunks=2", + "--glow_use_dag_optimizer=false", + "--glow_dump_graph=true", + ] +) + +class Fusions(serial.SerializedTestCase): + def _get_scale_zp(self, tensor): + tensor_max = np.max(tensor) + tensor_min = min(0, np.min(tensor)) + scale = np.float32(np.float16((tensor_max - tensor_min) / 255.0)) + if scale < 1e-6: + scale = 1e-6 + zero_point = 0 - tensor_min / scale + zero_point = int(round(np.clip(zero_point, 0, 255.0))) + return (scale, zero_point) + + @given( + scale=st.floats(1e-4, 1e2), + zp=st.integers(-128, 128), + rand_seed=st.integers(0, 65534), + m=st.integers(32, 64), + k=st.integers(1000, 6000), + n=st.integers(200, 600), + ) + # @example(m=64, k=5423, n=553, scale=1e-3, zp=120, rand_seed=1) + @settings(deadline=datetime.timedelta(seconds=1000), max_examples=1) + def test_ParallelFC(self, m, k, n, scale, zp, rand_seed): + np.random.seed(rand_seed) + workspace.ResetWorkspace() + + # Y = W_T * X + b + X_fp32 = np.random.uniform(-1, 1, size=(m, k)).astype(np.float16) \ + .astype(np.float32) + + W_fp32 = np.random.uniform(-1, 1, size=(n, k)).astype(np.float32) + b_fp32 = np.zeros((n,), dtype=np.float32) + + X_scale, X_zero_point = self._get_scale_zp(X_fp32) + + workspace.FeedBlob("X", X_fp32) + workspace.FeedBlob("W", W_fp32) + workspace.FeedBlob("b", b_fp32) + + workspace.RunOperatorOnce( + core.CreateOperator( + "Int8FCPackWeight", + ["W"], + ["W_int8"], + engine="DNNLOWP", + save_unpacked_weights=True, + in_scale=X_scale, + ) + ) + + ref_net = core.Net("net") + ref_net.Int8QuantizeNNPI( + ["X"], + ["X_int8"], + Y_scale=X_scale, + Y_zero_point=X_zero_point + ) + ref_net.Int8FCFakeAcc32NNPI( + ["X_int8", "W_int8", "b"], + ["Y_int8"], + Y_scale=X_scale, + Y_zero_point=X_zero_point, + ) + ref_net.Int8Relu( + ["Y_int8"], + ["Y_relu"], + Y_zero_point=X_zero_point, + Y_scale=X_scale, + ) + ref_net.Int8DequantizeNNPI( + ["Y_relu"], + ["Y"] + ) + ref_net.Proto().external_output.append("Y") + + # run ref_net + workspace.RunNetOnce(ref_net) + Y_fbgemm = workspace.FetchBlob("Y") + + # run onnxifi net + ref_net.Proto().op[0].type = "Int8Quantize" + ref_net.Proto().op[1].type = "Int8FC" + ref_net.Proto().op[2].type = "Int8Relu" + ref_net.Proto().op[3].type = "Int8Dequantize" + net_onnxified = onnxifi_caffe2_net( + ref_net.Proto(), + {}, 
+ debug=True, + adjust_batch=False, + use_onnx=False, + weight_names=["W_int8", "b"], + ) + num_onnxified_ops = sum( + 1 if o.type == "Onnxifi" else 0 for o in net_onnxified.op + ) + print(net_onnxified) + np.testing.assert_equal(num_onnxified_ops, 1) + workspace.CreateNet(net_onnxified) + workspace.RunNet(net_onnxified.name) + Y_glow = workspace.FetchBlob("Y") + + if not np.allclose(Y_glow, Y_fbgemm): + diff_Y = np.abs(Y_glow - Y_fbgemm) + print_test_debug_info( + "int8_fc", + { + "seed": rand_seed, + "n": n, + "X": X_fp32, + "W": W_fp32, + "b": b_fp32, + "Y_fbgemm": Y_fbgemm, + "Y_glow": Y_glow, + "diff": diff_Y, + "maxdiff": diff_Y.max(axis=1), + }, + ) + assert 0 diff --git a/caffe2/contrib/fakelowp/test/test_fusions.py b/caffe2/contrib/fakelowp/test/test_fusions.py index 335159c8318e..3e22d7c5937b 100644 --- a/caffe2/contrib/fakelowp/test/test_fusions.py +++ b/caffe2/contrib/fakelowp/test/test_fusions.py @@ -27,7 +27,7 @@ class Fusions(serial.SerializedTestCase): rand_seed=st.integers(0, 65534), ) @settings(deadline=datetime.timedelta(seconds=10)) - def Skip_test_tanhquantize(self, scale, zp, size, rand_seed): + def test_tanhquantize(self, scale, zp, size, rand_seed): np.random.seed(rand_seed) workspace.ResetWorkspace() diff --git a/caffe2/contrib/gloo/gloo_test.py b/caffe2/contrib/gloo/gloo_test.py index fbca9b8fe64c..5ae066f5e3ca 100644 --- a/caffe2/contrib/gloo/gloo_test.py +++ b/caffe2/contrib/gloo/gloo_test.py @@ -27,7 +27,6 @@ op_engine = 'GLOO' - class TemporaryDirectory: def __enter__(self): self.tmpdir = tempfile.mkdtemp() diff --git a/caffe2/python/_import_c_extension.py b/caffe2/python/_import_c_extension.py index d6754adc20fd..32b9ec34d1f8 100644 --- a/caffe2/python/_import_c_extension.py +++ b/caffe2/python/_import_c_extension.py @@ -5,16 +5,6 @@ import sys from caffe2.python import extension_loader -# NOTE: we have to import python protobuf here **before** we load cpp extension. -# Otherwise it breaks under certain build conditions if cpp implementation of -# protobuf is used. Presumably there's some registry in protobuf library and -# python side has to initialize the dictionary first, before static -# initialization in python extension does so. Otherwise, duplicated protobuf -# descriptors will be created and it can lead to obscure errors like -# "Parameter to MergeFrom() must be instance of same class: -# expected caffe2.NetDef got caffe2.NetDef." -import caffe2.proto - # We will first try to load the gpu-enabled caffe2. If it fails, we will then # attempt to load the cpu version. The cpu backend is the minimum required, so # if that still fails, we will exit loud. 
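The retained comment in _import_c_extension.py above describes the loading strategy: try the GPU-enabled extension first, fall back to the CPU build, and fail loudly if even that is missing. A rough sketch of that try/except import chain follows; the module names are assumptions for illustration, not code copied from the file:

```python
import logging
import sys

log = logging.getLogger(__name__)

def load_caffe2_extension():
    try:
        # Prefer the GPU-enabled extension when it is present (module name assumed).
        from caffe2.python import caffe2_pybind11_state_gpu as C
        return C
    except ImportError:
        log.warning("GPU-enabled caffe2 extension not found; trying the CPU build.")
    try:
        # The CPU backend is the minimum requirement (module name assumed).
        from caffe2.python import caffe2_pybind11_state as C
        return C
    except ImportError:
        log.critical("No caffe2 C++ extension could be loaded; exiting.")
        sys.exit(1)
```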
diff --git a/caffe2/python/benchmarks/sparse_lengths_sum_nbit_benchmark.py b/caffe2/python/benchmarks/sparse_lengths_sum_nbit_benchmark.py index 1b683be0d51e..b4cb8f2da0b4 100644 --- a/caffe2/python/benchmarks/sparse_lengths_sum_nbit_benchmark.py +++ b/caffe2/python/benchmarks/sparse_lengths_sum_nbit_benchmark.py @@ -5,7 +5,7 @@ import hypothesis.strategies as st import numpy as np -from caffe2.python import core, dyndep, workspace +from caffe2.python import core, workspace def benchmark_sparse_lengths_sum( diff --git a/caffe2/python/convert.py b/caffe2/python/convert.py index 18033661a69e..b4b37811de10 100644 --- a/caffe2/python/convert.py +++ b/caffe2/python/convert.py @@ -5,6 +5,3 @@ -from caffe2.proto import caffe2_pb2, torch_pb2 - -import caffe2.python._import_c_extension as C diff --git a/caffe2/python/convert_test.py b/caffe2/python/convert_test.py index a1dc52aad2d9..d9d82bf5e6c4 100644 --- a/caffe2/python/convert_test.py +++ b/caffe2/python/convert_test.py @@ -3,10 +3,8 @@ -from caffe2.python import convert, workspace -from caffe2.proto import caffe2_pb2, torch_pb2 +from caffe2.python import workspace import unittest -import numpy as np class TestOperator(unittest.TestCase): def setUp(self): diff --git a/caffe2/python/core_gradients_test.py b/caffe2/python/core_gradients_test.py index 3674b7aa4585..293eccca0dd4 100644 --- a/caffe2/python/core_gradients_test.py +++ b/caffe2/python/core_gradients_test.py @@ -3,7 +3,6 @@ -from future.utils import bytes_to_native_str from hypothesis import given, settings import hypothesis.strategies as st import unittest diff --git a/caffe2/python/dataio_test.py b/caffe2/python/dataio_test.py index 0c45fb50aed9..ac1c72284fbf 100644 --- a/caffe2/python/dataio_test.py +++ b/caffe2/python/dataio_test.py @@ -6,7 +6,6 @@ from caffe2.python.dataio import ( CompositeReader, CompositeReaderBuilder, - Reader, ReaderBuilder, ReaderWithDelay, ReaderWithLimit, @@ -29,7 +28,6 @@ import shutil import unittest import tempfile -import time def make_source_dataset(ws, size=100, offset=0, name=None): diff --git a/caffe2/python/ideep/conv_op_test.py b/caffe2/python/ideep/conv_op_test.py index ae4473ea4864..7c5a0026c113 100644 --- a/caffe2/python/ideep/conv_op_test.py +++ b/caffe2/python/ideep/conv_op_test.py @@ -4,7 +4,6 @@ import unittest -import sys import hypothesis.strategies as st from hypothesis import given, settings import numpy as np diff --git a/caffe2/python/ideep/convfusion_op_test.py b/caffe2/python/ideep/convfusion_op_test.py index 18ce574b623b..a0a782ab8a03 100644 --- a/caffe2/python/ideep/convfusion_op_test.py +++ b/caffe2/python/ideep/convfusion_op_test.py @@ -5,8 +5,7 @@ import unittest import hypothesis.strategies as st -from hypothesis import given, settings -import copy +from hypothesis import given import numpy as np import math from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/ideep/dropout_op_test.py b/caffe2/python/ideep/dropout_op_test.py index 33b0a52a7421..5b07333758dd 100644 --- a/caffe2/python/ideep/dropout_op_test.py +++ b/caffe2/python/ideep/dropout_op_test.py @@ -7,8 +7,6 @@ from hypothesis import given import hypothesis.strategies as st import numpy as np - -from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu import caffe2.python.ideep_test_util as mu diff --git a/caffe2/python/ideep/order_switch_op_test.py b/caffe2/python/ideep/order_switch_op_test.py index a259e01bab10..39ede0d214fe 100644 --- a/caffe2/python/ideep/order_switch_op_test.py +++ 
b/caffe2/python/ideep/order_switch_op_test.py @@ -10,7 +10,6 @@ import caffe2.python.ideep_test_util as mu from hypothesis import given, settings -from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace diff --git a/caffe2/python/ideep/shape_op_test.py b/caffe2/python/ideep/shape_op_test.py index 47114832f85d..1beb24bc8803 100644 --- a/caffe2/python/ideep/shape_op_test.py +++ b/caffe2/python/ideep/shape_op_test.py @@ -7,7 +7,6 @@ import hypothesis.strategies as st from hypothesis import given, settings import numpy as np -from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu import caffe2.python.ideep_test_util as mu diff --git a/caffe2/python/ideep/spatial_bn_op_test.py b/caffe2/python/ideep/spatial_bn_op_test.py index 618a0e7fbfc3..97efafa72057 100644 --- a/caffe2/python/ideep/spatial_bn_op_test.py +++ b/caffe2/python/ideep/spatial_bn_op_test.py @@ -7,9 +7,8 @@ import hypothesis.strategies as st import numpy as np import unittest -from caffe2.python import brew, core, workspace +from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu -from caffe2.python.model_helper import ModelHelper import caffe2.python.ideep_test_util as mu diff --git a/caffe2/python/ideep/test_ideep_net.py b/caffe2/python/ideep/test_ideep_net.py index aa1c5bc260fa..42feeed00122 100644 --- a/caffe2/python/ideep/test_ideep_net.py +++ b/caffe2/python/ideep/test_ideep_net.py @@ -9,7 +9,6 @@ import numpy as np import argparse import time -import os.path def GetArgumentParser(): diff --git a/caffe2/python/ideep/transform_ideep_net.py b/caffe2/python/ideep/transform_ideep_net.py index 962d4051718b..2d0f35a7406f 100644 --- a/caffe2/python/ideep/transform_ideep_net.py +++ b/caffe2/python/ideep/transform_ideep_net.py @@ -6,7 +6,6 @@ import argparse import copy import json -import os.path import numpy as np diff --git a/caffe2/python/ideep/transpose_op_test.py b/caffe2/python/ideep/transpose_op_test.py index 8b324ed964ae..f8b784822a07 100644 --- a/caffe2/python/ideep/transpose_op_test.py +++ b/caffe2/python/ideep/transpose_op_test.py @@ -7,7 +7,6 @@ import hypothesis.strategies as st from hypothesis import given, settings import numpy as np -from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu import caffe2.python.ideep_test_util as mu diff --git a/caffe2/python/ideep_test_util.py b/caffe2/python/ideep_test_util.py index 7129ed14ba74..0cc643317c93 100644 --- a/caffe2/python/ideep_test_util.py +++ b/caffe2/python/ideep_test_util.py @@ -14,7 +14,6 @@ import hypothesis.strategies as st from caffe2.proto import caffe2_pb2 -from caffe2.python import workspace from caffe2.python import hypothesis_test_util as hu cpu_do = hu.cpu_do diff --git a/caffe2/python/layer_model_helper.py b/caffe2/python/layer_model_helper.py index 9d825f3827b9..6a5a3c82dd30 100644 --- a/caffe2/python/layer_model_helper.py +++ b/caffe2/python/layer_model_helper.py @@ -17,7 +17,6 @@ from caffe2.python.optimizer import get_param_device, Optimizer from caffe2.python.regularizer import Regularizer, RegularizationBy from caffe2.python.layers import layers -from caffe2.proto import caffe2_pb2 from future.utils import viewitems, viewvalues import logging diff --git a/caffe2/python/mkl/mkl_LRN_op_test.py b/caffe2/python/mkl/mkl_LRN_op_test.py index 2b084bea591b..fddb20e6bb14 100644 --- a/caffe2/python/mkl/mkl_LRN_op_test.py +++ b/caffe2/python/mkl/mkl_LRN_op_test.py @@ 
-5,7 +5,7 @@ import unittest import hypothesis.strategies as st -from hypothesis import given, settings +from hypothesis import given import numpy as np from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/mkl/mkl_LRN_speed_test.py b/caffe2/python/mkl/mkl_LRN_speed_test.py index ae42902d9102..c192137dc28c 100644 --- a/caffe2/python/mkl/mkl_LRN_speed_test.py +++ b/caffe2/python/mkl/mkl_LRN_speed_test.py @@ -6,7 +6,7 @@ import numpy as np from caffe2.proto import caffe2_pb2 -from caffe2.python import cnn, core, workspace, test_util +from caffe2.python import core, workspace, test_util @unittest.skipIf(not workspace.C.has_mkldnn, "Skipping as we do not have mkldnn.") diff --git a/caffe2/python/mkl/mkl_conv_op_test.py b/caffe2/python/mkl/mkl_conv_op_test.py index f1fe7b062318..74c4f2c6cde9 100644 --- a/caffe2/python/mkl/mkl_conv_op_test.py +++ b/caffe2/python/mkl/mkl_conv_op_test.py @@ -5,7 +5,7 @@ import unittest import hypothesis.strategies as st -from hypothesis import given, settings +from hypothesis import given import numpy as np from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/mkl/mkl_fc_op_test.py b/caffe2/python/mkl/mkl_fc_op_test.py index 01786d55c337..180d93f26570 100644 --- a/caffe2/python/mkl/mkl_fc_op_test.py +++ b/caffe2/python/mkl/mkl_fc_op_test.py @@ -5,7 +5,7 @@ import unittest import hypothesis.strategies as st -from hypothesis import given, settings +from hypothesis import given import numpy as np from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/mkl/mkl_fc_speed_test.py b/caffe2/python/mkl/mkl_fc_speed_test.py index 85f5605e9676..243e49c2f8f8 100644 --- a/caffe2/python/mkl/mkl_fc_speed_test.py +++ b/caffe2/python/mkl/mkl_fc_speed_test.py @@ -6,7 +6,7 @@ import numpy as np from caffe2.proto import caffe2_pb2 -from caffe2.python import cnn, core, workspace, test_util +from caffe2.python import core, workspace, test_util @unittest.skipIf(not workspace.C.has_mkldnn, "Skipping as we do not have mkldnn.") diff --git a/caffe2/python/mkl/mkl_fill_op_test.py b/caffe2/python/mkl/mkl_fill_op_test.py index 26a9b7131b0b..f233275786f7 100644 --- a/caffe2/python/mkl/mkl_fill_op_test.py +++ b/caffe2/python/mkl/mkl_fill_op_test.py @@ -5,8 +5,7 @@ import unittest import hypothesis.strategies as st -from hypothesis import given, settings -import numpy as np +from hypothesis import given from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu import caffe2.python.mkl_test_util as mu diff --git a/caffe2/python/mkl/mkl_pool_speed_test.py b/caffe2/python/mkl/mkl_pool_speed_test.py index b25e0f915cc7..aa43aed97a09 100644 --- a/caffe2/python/mkl/mkl_pool_speed_test.py +++ b/caffe2/python/mkl/mkl_pool_speed_test.py @@ -6,7 +6,7 @@ import numpy as np from caffe2.proto import caffe2_pb2 -from caffe2.python import cnn, core, workspace, test_util +from caffe2.python import core, workspace, test_util @unittest.skipIf(not workspace.C.has_mkldnn, "Skipping as we do not have mkldnn.") diff --git a/caffe2/python/mkl/mkl_sbn_op_test.py b/caffe2/python/mkl/mkl_sbn_op_test.py index 2ac9080ce670..86856b130d63 100644 --- a/caffe2/python/mkl/mkl_sbn_op_test.py +++ b/caffe2/python/mkl/mkl_sbn_op_test.py @@ -5,7 +5,7 @@ import unittest import hypothesis.strategies as st -from hypothesis import given, settings +from hypothesis import given import numpy as np from caffe2.python import 
core, workspace import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/mkl/mkl_sbn_speed_test.py b/caffe2/python/mkl/mkl_sbn_speed_test.py index 3b3b71d1c997..05885ceca575 100644 --- a/caffe2/python/mkl/mkl_sbn_speed_test.py +++ b/caffe2/python/mkl/mkl_sbn_speed_test.py @@ -6,7 +6,7 @@ import numpy as np from caffe2.proto import caffe2_pb2 -from caffe2.python import cnn, core, workspace, test_util +from caffe2.python import core, workspace, test_util @unittest.skipIf(not workspace.C.has_mkldnn, "Skipping as we do not have mkldnn.") diff --git a/caffe2/python/mkl/mkl_speed_test.py b/caffe2/python/mkl/mkl_speed_test.py index 9a7310a484d1..ab2e4428519a 100644 --- a/caffe2/python/mkl/mkl_speed_test.py +++ b/caffe2/python/mkl/mkl_speed_test.py @@ -6,7 +6,7 @@ import numpy as np from caffe2.proto import caffe2_pb2 -from caffe2.python import cnn, core, workspace, test_util +from caffe2.python import core, workspace, test_util @unittest.skipIf(not workspace.C.has_mkldnn, "Skipping as we do not have mkldnn.") diff --git a/caffe2/python/mkl/rewrite_graph.py b/caffe2/python/mkl/rewrite_graph.py index 3a88a3deeccc..b52501584064 100644 --- a/caffe2/python/mkl/rewrite_graph.py +++ b/caffe2/python/mkl/rewrite_graph.py @@ -6,7 +6,6 @@ import copy from caffe2.proto import caffe2_pb2 from caffe2.python import core -import caffe2.python._import_c_extension as C def rewrite_init_net_simple(net): diff --git a/caffe2/python/nomnigraph_test.py b/caffe2/python/nomnigraph_test.py index 3d9adc696486..bd9d10fcbae1 100644 --- a/caffe2/python/nomnigraph_test.py +++ b/caffe2/python/nomnigraph_test.py @@ -3,7 +3,7 @@ -from caffe2.python import core, workspace, test_util +from caffe2.python import core, test_util from caffe2.proto import caffe2_pb2 import caffe2.python.nomnigraph as ng diff --git a/caffe2/python/onnx/backend.py b/caffe2/python/onnx/backend.py index 5d445576b32c..2c80fadafaee 100644 --- a/caffe2/python/onnx/backend.py +++ b/caffe2/python/onnx/backend.py @@ -5,14 +5,7 @@ To run this, you will need to have Caffe2 installed as well. """ - - - - - -import os import collections -from subprocess import Popen, PIPE import sys import zipfile import itertools @@ -23,8 +16,6 @@ # importing onnx first, which will cause it to go out and pick up the # system protobuf. 
import onnx.backend - -import caffe2 from caffe2.python import core, workspace, rnn_cell, gru_cell from caffe2.python.compatibility import container_abcs from caffe2.python.model_helper import ModelHelper @@ -32,7 +23,7 @@ import caffe2.python.utils import numpy as np import onnx -from onnx import checker, GraphProto, TensorProto, AttributeProto, ModelProto +from onnx import TensorProto import onnx.numpy_helper import onnx.defs import onnx.optimizer @@ -42,7 +33,6 @@ from caffe2.python.onnx.workspace import Workspace from caffe2.python.onnx.backend_rep import Caffe2Rep -from caffe2.python.onnx.backend_cpp_rep import Caffe2CppRep import caffe2.python._import_c_extension as C diff --git a/caffe2/python/onnx/bin/conversion.py b/caffe2/python/onnx/bin/conversion.py index 126eef8a8470..7e469e514a73 100644 --- a/caffe2/python/onnx/bin/conversion.py +++ b/caffe2/python/onnx/bin/conversion.py @@ -9,8 +9,7 @@ from caffe2.proto import caffe2_pb2 import click -import numpy as np -from onnx import checker, ModelProto +from onnx import ModelProto from caffe2.python.onnx.backend import Caffe2Backend as c2 import caffe2.python.onnx.frontend as c2_onnx diff --git a/caffe2/python/onnx/frontend.py b/caffe2/python/onnx/frontend.py index ee3c30949ff7..bb2778d1a991 100644 --- a/caffe2/python/onnx/frontend.py +++ b/caffe2/python/onnx/frontend.py @@ -17,15 +17,12 @@ from caffe2.python import core as caffe2_core from caffe2.python.compatibility import container_abcs -from caffe2.proto import caffe2_legacy_pb2 -from enum import Enum -from onnx import (defs, checker, helper, numpy_helper, mapping, - ModelProto, GraphProto, NodeProto, AttributeProto, TensorProto, OperatorSetIdProto) -from onnx.helper import make_tensor, make_tensor_value_info, make_attribute, make_model +from onnx import (checker, helper, numpy_helper, mapping, + GraphProto, NodeProto, TensorProto, OperatorSetIdProto) +from onnx.helper import make_tensor_value_info, make_model import numpy as np from caffe2.python.onnx.helper import c2_native_run_net -from caffe2.python.onnx.error import Unsupported import caffe2.python._import_c_extension as C diff --git a/caffe2/python/onnx/helper.py b/caffe2/python/onnx/helper.py index 7f8f1a6d346a..6e73a5d5c95d 100644 --- a/caffe2/python/onnx/helper.py +++ b/caffe2/python/onnx/helper.py @@ -9,9 +9,6 @@ from onnx.backend.base import namedtupledict from caffe2.python.onnx.workspace import Workspace -import caffe2.python._import_c_extension as C - -import io import logging import time diff --git a/caffe2/python/onnx/onnxifi.py b/caffe2/python/onnx/onnxifi.py index a04e7e4554b9..3e67c4948b1f 100644 --- a/caffe2/python/onnx/onnxifi.py +++ b/caffe2/python/onnx/onnxifi.py @@ -11,9 +11,7 @@ from caffe2.proto import caffe2_pb2 -from caffe2.python import core, workspace import caffe2.python._import_c_extension as C -import numpy as np def onnxifi_caffe2_net( diff --git a/caffe2/python/onnx/test_onnxifi.py b/caffe2/python/onnx/test_onnxifi.py index 7eafccaec9e4..4316149d5bf6 100644 --- a/caffe2/python/onnx/test_onnxifi.py +++ b/caffe2/python/onnx/test_onnxifi.py @@ -3,16 +3,14 @@ -import json import numpy as np -import os import time import unittest import onnx import onnx.defs from onnx.backend.base import namedtupledict -from onnx.helper import make_node, make_graph, make_tensor, make_tensor_value_info, make_model +from onnx.helper import make_node, make_graph, make_tensor_value_info, make_model from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace from caffe2.python.models.download import 
ModelDownloader diff --git a/caffe2/python/onnx/tests/c2_ref_test.py b/caffe2/python/onnx/tests/c2_ref_test.py index d2efcc79823e..aab5a04a169c 100644 --- a/caffe2/python/onnx/tests/c2_ref_test.py +++ b/caffe2/python/onnx/tests/c2_ref_test.py @@ -6,7 +6,6 @@ -import json import os import unittest @@ -17,7 +16,7 @@ from onnx.helper import make_node, make_graph, make_tensor, make_tensor_value_info, make_model from caffe2.python.onnx.helper import c2_native_run_net, c2_native_run_op -from onnx import defs, mapping +from onnx import mapping import caffe2.python.onnx.frontend as c2_onnx import caffe2.python.onnx.backend as c2 diff --git a/caffe2/python/onnx/tests/onnx_backend_test.py b/caffe2/python/onnx/tests/onnx_backend_test.py index 5166ec3c5083..e8b718a5a2be 100644 --- a/caffe2/python/onnx/tests/onnx_backend_test.py +++ b/caffe2/python/onnx/tests/onnx_backend_test.py @@ -13,7 +13,7 @@ import caffe2.python.onnx.backend as c2 -from caffe2.python import core, workspace +from caffe2.python import core core.SetEnginePref({}, {}) # This is a pytest magic variable to load extra plugins diff --git a/caffe2/python/onnx/tests/ssa_test.py b/caffe2/python/onnx/tests/ssa_test.py index d34d4a0e5287..96f954037178 100644 --- a/caffe2/python/onnx/tests/ssa_test.py +++ b/caffe2/python/onnx/tests/ssa_test.py @@ -7,11 +7,10 @@ import copy -import onnx import numpy as np from caffe2.proto import caffe2_pb2 from caffe2.python import core -from onnx import helper, TensorProto +from onnx import TensorProto import caffe2.python.onnx.frontend as c2_onnx from caffe2.python.onnx.helper import c2_native_run_net diff --git a/caffe2/python/onnx/tests/test_utils.py b/caffe2/python/onnx/tests/test_utils.py index d224daf05ba3..bebfc1012957 100644 --- a/caffe2/python/onnx/tests/test_utils.py +++ b/caffe2/python/onnx/tests/test_utils.py @@ -6,7 +6,6 @@ -import os import unittest import numpy as np diff --git a/caffe2/python/operator_fp_exceptions_test.py b/caffe2/python/operator_fp_exceptions_test.py index 3a1ebcd4ec67..f039ef09f637 100644 --- a/caffe2/python/operator_fp_exceptions_test.py +++ b/caffe2/python/operator_fp_exceptions_test.py @@ -3,7 +3,6 @@ from caffe2.python import core, workspace -from caffe2.proto import caffe2_pb2 from caffe2.python.test_util import TestCase import numpy as np diff --git a/caffe2/python/operator_test/blobs_queue_db_test.py b/caffe2/python/operator_test/blobs_queue_db_test.py index 6cf8170b34f8..88197d16d70b 100644 --- a/caffe2/python/operator_test/blobs_queue_db_test.py +++ b/caffe2/python/operator_test/blobs_queue_db_test.py @@ -3,7 +3,6 @@ -import unittest import numpy as np import caffe2.proto.caffe2_pb2 as caffe2_pb2 diff --git a/caffe2/python/operator_test/boolean_mask_test.py b/caffe2/python/operator_test/boolean_mask_test.py index 05b8212242e4..38fe43899990 100644 --- a/caffe2/python/operator_test/boolean_mask_test.py +++ b/caffe2/python/operator_test/boolean_mask_test.py @@ -2,7 +2,6 @@ -from caffe2.proto import caffe2_pb2 from caffe2.python import core import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial diff --git a/caffe2/python/operator_test/bucketize_op_test.py b/caffe2/python/operator_test/bucketize_op_test.py index bf9af112a5b0..2eb2acf87902 100644 --- a/caffe2/python/operator_test/bucketize_op_test.py +++ b/caffe2/python/operator_test/bucketize_op_test.py @@ -2,10 +2,9 @@ -from caffe2.python import core, dyndep +from caffe2.python import core from hypothesis import given import caffe2.python.hypothesis_test_util 
as hu -import hypothesis.strategies as st import numpy as np diff --git a/caffe2/python/operator_test/concat_split_op_test.py b/caffe2/python/operator_test/concat_split_op_test.py index 1927b4eac78f..ac83681f08bf 100644 --- a/caffe2/python/operator_test/concat_split_op_test.py +++ b/caffe2/python/operator_test/concat_split_op_test.py @@ -3,8 +3,7 @@ -from caffe2.proto import caffe2_pb2 -from caffe2.python import core, workspace +from caffe2.python import core import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import given, settings diff --git a/caffe2/python/operator_test/conv_test.py b/caffe2/python/operator_test/conv_test.py index ae54cd37a91d..e600aa2c9ee9 100644 --- a/caffe2/python/operator_test/conv_test.py +++ b/caffe2/python/operator_test/conv_test.py @@ -2,7 +2,6 @@ import collections import functools -import os import unittest import caffe2.python._import_c_extension as C diff --git a/caffe2/python/operator_test/cosine_embedding_criterion_op_test.py b/caffe2/python/operator_test/cosine_embedding_criterion_op_test.py index 04bfbbe6f4f6..d979407321a4 100644 --- a/caffe2/python/operator_test/cosine_embedding_criterion_op_test.py +++ b/caffe2/python/operator_test/cosine_embedding_criterion_op_test.py @@ -3,7 +3,6 @@ -from hypothesis import given import hypothesis.strategies as st import numpy as np diff --git a/caffe2/python/operator_test/crf_test.py b/caffe2/python/operator_test/crf_test.py index b75e7b7b1a10..4d7b90c431a6 100644 --- a/caffe2/python/operator_test/crf_test.py +++ b/caffe2/python/operator_test/crf_test.py @@ -9,7 +9,6 @@ import caffe2.python.hypothesis_test_util as hu import hypothesis.strategies as st from hypothesis import given, settings -import unittest class TestCRFOp(hu.HypothesisTestCase): diff --git a/caffe2/python/operator_test/cross_entropy_ops_test.py b/caffe2/python/operator_test/cross_entropy_ops_test.py index d1852e7dd9e8..c88f93503a15 100644 --- a/caffe2/python/operator_test/cross_entropy_ops_test.py +++ b/caffe2/python/operator_test/cross_entropy_ops_test.py @@ -9,7 +9,6 @@ import numpy as np import unittest -import os def sigmoid(x): return 1.0 / (1.0 + np.exp(-x)) diff --git a/caffe2/python/operator_test/ctc_beam_search_decoder_op_test.py b/caffe2/python/operator_test/ctc_beam_search_decoder_op_test.py index 1dda7166e65a..29440c00a4b3 100644 --- a/caffe2/python/operator_test/ctc_beam_search_decoder_op_test.py +++ b/caffe2/python/operator_test/ctc_beam_search_decoder_op_test.py @@ -4,7 +4,6 @@ from caffe2.python import core -from caffe2.python.test_util import caffe2_flaky from collections import defaultdict, Counter from hypothesis import given, settings import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/cudnn_recurrent_test.py b/caffe2/python/operator_test/cudnn_recurrent_test.py index db1b826cfe41..ef4433a41a18 100644 --- a/caffe2/python/operator_test/cudnn_recurrent_test.py +++ b/caffe2/python/operator_test/cudnn_recurrent_test.py @@ -4,7 +4,6 @@ from caffe2.python import model_helper, workspace, core, rnn_cell -from caffe2.proto import caffe2_pb2 from future.utils import viewitems import numpy as np diff --git a/caffe2/python/operator_test/deform_conv_test.py b/caffe2/python/operator_test/deform_conv_test.py index f6ad0e38e73c..67289de5e924 100644 --- a/caffe2/python/operator_test/deform_conv_test.py +++ b/caffe2/python/operator_test/deform_conv_test.py @@ -1,6 +1,5 @@ -import os import unittest import 
caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/depthwise_3x3_conv_test.py b/caffe2/python/operator_test/depthwise_3x3_conv_test.py index 2d6d6429f833..cdfffce288dd 100644 --- a/caffe2/python/operator_test/depthwise_3x3_conv_test.py +++ b/caffe2/python/operator_test/depthwise_3x3_conv_test.py @@ -5,7 +5,7 @@ import numpy as np import caffe2.python.hypothesis_test_util as hu -from caffe2.python import core, dyndep, utils, workspace +from caffe2.python import core, utils from hypothesis import given, settings import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/distance_op_test.py b/caffe2/python/operator_test/distance_op_test.py index e948fdae9673..5b46548e072b 100644 --- a/caffe2/python/operator_test/distance_op_test.py +++ b/caffe2/python/operator_test/distance_op_test.py @@ -6,7 +6,6 @@ from caffe2.python import core import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial -from hypothesis import given import hypothesis.strategies as st import numpy as np diff --git a/caffe2/python/operator_test/elementwise_linear_op_test.py b/caffe2/python/operator_test/elementwise_linear_op_test.py index ac0dc3dd0975..2bd85625a3d9 100644 --- a/caffe2/python/operator_test/elementwise_linear_op_test.py +++ b/caffe2/python/operator_test/elementwise_linear_op_test.py @@ -4,7 +4,6 @@ from caffe2.python import core -from hypothesis import given import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/elementwise_ops_test.py b/caffe2/python/operator_test/elementwise_ops_test.py index 8dbfdc1871e8..31f70086de7b 100644 --- a/caffe2/python/operator_test/elementwise_ops_test.py +++ b/caffe2/python/operator_test/elementwise_ops_test.py @@ -10,7 +10,6 @@ import numpy as np import unittest -import os class TestElementwiseOps(hu.HypothesisTestCase): diff --git a/caffe2/python/operator_test/enforce_finite_op_test.py b/caffe2/python/operator_test/enforce_finite_op_test.py index b843bfdc95b9..8150977945a2 100644 --- a/caffe2/python/operator_test/enforce_finite_op_test.py +++ b/caffe2/python/operator_test/enforce_finite_op_test.py @@ -8,7 +8,6 @@ from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu -import hypothesis.strategies as st class TestEnforceFinite(hu.HypothesisTestCase): diff --git a/caffe2/python/operator_test/expand_op_test.py b/caffe2/python/operator_test/expand_op_test.py index 0d198b1aff14..aba2c1106da3 100644 --- a/caffe2/python/operator_test/expand_op_test.py +++ b/caffe2/python/operator_test/expand_op_test.py @@ -3,7 +3,7 @@ -from caffe2.python import core, workspace +from caffe2.python import core from hypothesis import given, settings import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/feature_maps_ops_test.py b/caffe2/python/operator_test/feature_maps_ops_test.py index 19fa329c9389..5a20b63166be 100644 --- a/caffe2/python/operator_test/feature_maps_ops_test.py +++ b/caffe2/python/operator_test/feature_maps_ops_test.py @@ -2,7 +2,7 @@ -from caffe2.python import core, workspace, dyndep +from caffe2.python import core, workspace from caffe2.python.test_util import TestCase import numpy as np diff --git a/caffe2/python/operator_test/glu_op_test.py b/caffe2/python/operator_test/glu_op_test.py index f38df09ec9fb..7b7a33dcd90a 100644 --- a/caffe2/python/operator_test/glu_op_test.py +++ 
b/caffe2/python/operator_test/glu_op_test.py @@ -6,7 +6,7 @@ from caffe2.python import core import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial -from hypothesis import assume, given, settings, HealthCheck +from hypothesis import given, settings import hypothesis.strategies as st import numpy as np diff --git a/caffe2/python/operator_test/group_conv_test.py b/caffe2/python/operator_test/group_conv_test.py index 62aba236d5ba..8e864bb42152 100644 --- a/caffe2/python/operator_test/group_conv_test.py +++ b/caffe2/python/operator_test/group_conv_test.py @@ -12,7 +12,6 @@ import caffe2.python.hypothesis_test_util as hu import unittest -import os class TestGroupConvolution(hu.HypothesisTestCase): diff --git a/caffe2/python/operator_test/gru_test.py b/caffe2/python/operator_test/gru_test.py index 99444f39ac26..1a7db2634989 100644 --- a/caffe2/python/operator_test/gru_test.py +++ b/caffe2/python/operator_test/gru_test.py @@ -16,7 +16,6 @@ import hypothesis.strategies as st import numpy as np import unittest -import os def gru_unit(*args, **kwargs): diff --git a/caffe2/python/operator_test/hyperbolic_ops_test.py b/caffe2/python/operator_test/hyperbolic_ops_test.py index 90a8197e7ccf..c0a1e8f49f5a 100644 --- a/caffe2/python/operator_test/hyperbolic_ops_test.py +++ b/caffe2/python/operator_test/hyperbolic_ops_test.py @@ -4,7 +4,6 @@ from caffe2.python import core -from hypothesis import given import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/im2col_col2im_test.py b/caffe2/python/operator_test/im2col_col2im_test.py index 760228382bc6..42cb1deaf8ae 100644 --- a/caffe2/python/operator_test/im2col_col2im_test.py +++ b/caffe2/python/operator_test/im2col_col2im_test.py @@ -10,9 +10,6 @@ import hypothesis.strategies as st import numpy as np -import unittest -import os - class TestReduceFrontSum(hu.HypothesisTestCase): @given(batch_size=st.integers(1, 3), diff --git a/caffe2/python/operator_test/instance_norm_test.py b/caffe2/python/operator_test/instance_norm_test.py index fb4f3c935ba8..efce9d7001fe 100644 --- a/caffe2/python/operator_test/instance_norm_test.py +++ b/caffe2/python/operator_test/instance_norm_test.py @@ -11,7 +11,6 @@ import caffe2.python.serialized_test.serialized_test_util as serial import unittest -import os class TestInstanceNorm(serial.SerializedTestCase): diff --git a/caffe2/python/operator_test/jsd_ops_test.py b/caffe2/python/operator_test/jsd_ops_test.py index 6ed2db2e88c2..f205d8e650b2 100644 --- a/caffe2/python/operator_test/jsd_ops_test.py +++ b/caffe2/python/operator_test/jsd_ops_test.py @@ -4,7 +4,6 @@ from caffe2.python import core -from hypothesis import given import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/layer_norm_op_test.py b/caffe2/python/operator_test/layer_norm_op_test.py index 62e94afe9e7d..d402cce4c4f9 100644 --- a/caffe2/python/operator_test/layer_norm_op_test.py +++ b/caffe2/python/operator_test/layer_norm_op_test.py @@ -13,7 +13,6 @@ import hypothesis.strategies as st import numpy as np -import os import torch import unittest diff --git a/caffe2/python/operator_test/lengths_pad_op_test.py b/caffe2/python/operator_test/lengths_pad_op_test.py index 626ec0542b7d..cda2f7da323e 100644 --- 
a/caffe2/python/operator_test/lengths_pad_op_test.py +++ b/caffe2/python/operator_test/lengths_pad_op_test.py @@ -4,7 +4,6 @@ from caffe2.python import core -from hypothesis import given import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/lengths_reducer_fused_nbit_rowwise_ops_test.py b/caffe2/python/operator_test/lengths_reducer_fused_nbit_rowwise_ops_test.py index fc4e89e2545b..49b0ba7ec22c 100644 --- a/caffe2/python/operator_test/lengths_reducer_fused_nbit_rowwise_ops_test.py +++ b/caffe2/python/operator_test/lengths_reducer_fused_nbit_rowwise_ops_test.py @@ -3,7 +3,7 @@ import caffe2.python.hypothesis_test_util as hu import hypothesis.strategies as st import numpy as np -from caffe2.python import core, dyndep, workspace +from caffe2.python import core, workspace from hypothesis import given diff --git a/caffe2/python/operator_test/lengths_tile_op_test.py b/caffe2/python/operator_test/lengths_tile_op_test.py index e0a5f9609588..441fcc747835 100644 --- a/caffe2/python/operator_test/lengths_tile_op_test.py +++ b/caffe2/python/operator_test/lengths_tile_op_test.py @@ -4,7 +4,6 @@ from caffe2.python import core -from hypothesis import given import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/loss_ops_test.py b/caffe2/python/operator_test/loss_ops_test.py index 24cb65ac96f8..f6a07ead3cf9 100644 --- a/caffe2/python/operator_test/loss_ops_test.py +++ b/caffe2/python/operator_test/loss_ops_test.py @@ -4,7 +4,6 @@ from caffe2.python import core -from hypothesis import given import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/matmul_op_test.py b/caffe2/python/operator_test/matmul_op_test.py index b8cef19b24df..8b4001a574ac 100644 --- a/caffe2/python/operator_test/matmul_op_test.py +++ b/caffe2/python/operator_test/matmul_op_test.py @@ -9,8 +9,6 @@ from hypothesis import assume, given, settings import hypothesis.strategies as st - -from caffe2.proto import caffe2_pb2 from caffe2.python import core import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial diff --git a/caffe2/python/operator_test/mean_op_test.py b/caffe2/python/operator_test/mean_op_test.py index 5830089f8e9b..ee2c6fc8fbf7 100644 --- a/caffe2/python/operator_test/mean_op_test.py +++ b/caffe2/python/operator_test/mean_op_test.py @@ -6,8 +6,6 @@ from caffe2.python import core import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial - -from hypothesis import given import hypothesis.strategies as st import numpy as np import unittest diff --git a/caffe2/python/operator_test/moments_op_test.py b/caffe2/python/operator_test/moments_op_test.py index 3b270df254ce..bee44e360e3f 100644 --- a/caffe2/python/operator_test/moments_op_test.py +++ b/caffe2/python/operator_test/moments_op_test.py @@ -4,7 +4,6 @@ from caffe2.python import core -from hypothesis import given import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial diff --git a/caffe2/python/operator_test/numpy_tile_op_test.py b/caffe2/python/operator_test/numpy_tile_op_test.py index 
a202581f808c..c32aa99470db 100644 --- a/caffe2/python/operator_test/numpy_tile_op_test.py +++ b/caffe2/python/operator_test/numpy_tile_op_test.py @@ -9,7 +9,7 @@ import hypothesis.strategies as st import unittest -from caffe2.python import core, workspace +from caffe2.python import core import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial diff --git a/caffe2/python/operator_test/onnx_while_test.py b/caffe2/python/operator_test/onnx_while_test.py index 4cff53b87d6e..5ad9c277239d 100644 --- a/caffe2/python/operator_test/onnx_while_test.py +++ b/caffe2/python/operator_test/onnx_while_test.py @@ -3,7 +3,7 @@ from caffe2.proto import caffe2_pb2 -from caffe2.python import core, workspace +from caffe2.python import core import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import given, settings diff --git a/caffe2/python/operator_test/pack_rnn_sequence_op_test.py b/caffe2/python/operator_test/pack_rnn_sequence_op_test.py index 9a76e6b847a5..eceb1e5ba6a9 100644 --- a/caffe2/python/operator_test/pack_rnn_sequence_op_test.py +++ b/caffe2/python/operator_test/pack_rnn_sequence_op_test.py @@ -4,7 +4,6 @@ from caffe2.python import core -from hypothesis import given import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/pad_test.py b/caffe2/python/operator_test/pad_test.py index 6d4e6bbdcd08..788c4035dd5f 100644 --- a/caffe2/python/operator_test/pad_test.py +++ b/caffe2/python/operator_test/pad_test.py @@ -5,8 +5,6 @@ from caffe2.python import core import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial - -from hypothesis import given import hypothesis.strategies as st import numpy as np import unittest diff --git a/caffe2/python/operator_test/percentile_op_test.py b/caffe2/python/operator_test/percentile_op_test.py index d81b0a963185..40c4192e21e9 100644 --- a/caffe2/python/operator_test/percentile_op_test.py +++ b/caffe2/python/operator_test/percentile_op_test.py @@ -3,7 +3,7 @@ -from caffe2.python import core, workspace, dyndep +from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu import numpy as np diff --git a/caffe2/python/operator_test/rand_quantization_op_test.py b/caffe2/python/operator_test/rand_quantization_op_test.py index e244f77149e1..a702ab41577f 100644 --- a/caffe2/python/operator_test/rand_quantization_op_test.py +++ b/caffe2/python/operator_test/rand_quantization_op_test.py @@ -6,7 +6,6 @@ import numpy as np import struct import unittest -import os from hypothesis import given, example import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/recurrent_network_test.py b/caffe2/python/operator_test/recurrent_network_test.py index 13650e6cad4e..33ada4d6881c 100644 --- a/caffe2/python/operator_test/recurrent_network_test.py +++ b/caffe2/python/operator_test/recurrent_network_test.py @@ -11,9 +11,6 @@ import hypothesis.strategies as st import numpy as np -import os -import unittest - class RecurrentNetworkTest(serial.SerializedTestCase): @given(T=st.integers(1, 4), n=st.integers(1, 5), diff --git a/caffe2/python/operator_test/reduce_ops_test.py b/caffe2/python/operator_test/reduce_ops_test.py index 727631befe89..7b79b3b81aed 100644 --- a/caffe2/python/operator_test/reduce_ops_test.py +++ 
b/caffe2/python/operator_test/reduce_ops_test.py @@ -11,7 +11,6 @@ import hypothesis.strategies as st import numpy as np import itertools as it -import unittest class TestReduceOps(serial.SerializedTestCase): diff --git a/caffe2/python/operator_test/reduction_ops_test.py b/caffe2/python/operator_test/reduction_ops_test.py index 7d4287df6609..6a99f2b27d42 100644 --- a/caffe2/python/operator_test/reduction_ops_test.py +++ b/caffe2/python/operator_test/reduction_ops_test.py @@ -3,7 +3,6 @@ -from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace from hypothesis import assume, given, settings import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/roi_align_rotated_op_test.py b/caffe2/python/operator_test/roi_align_rotated_op_test.py index c74157a039b0..ea835acead61 100644 --- a/caffe2/python/operator_test/roi_align_rotated_op_test.py +++ b/caffe2/python/operator_test/roi_align_rotated_op_test.py @@ -3,7 +3,6 @@ -from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace from hypothesis import given import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/sequence_ops_test.py b/caffe2/python/operator_test/sequence_ops_test.py index 4609473f91f0..65c0669abfb0 100644 --- a/caffe2/python/operator_test/sequence_ops_test.py +++ b/caffe2/python/operator_test/sequence_ops_test.py @@ -11,7 +11,6 @@ import hypothesis.strategies as st import numpy as np import unittest -import os def _gen_test_add_padding(with_pad_data=True, diff --git a/caffe2/python/operator_test/spatial_bn_op_test.py b/caffe2/python/operator_test/spatial_bn_op_test.py index 35f7bd2a5e29..21a530346329 100644 --- a/caffe2/python/operator_test/spatial_bn_op_test.py +++ b/caffe2/python/operator_test/spatial_bn_op_test.py @@ -3,7 +3,6 @@ -from caffe2.proto import caffe2_pb2 from caffe2.python import brew, core, utils, workspace import caffe2.python.hip_test_util as hiputl import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/square_root_divide_op_test.py b/caffe2/python/operator_test/square_root_divide_op_test.py index 5bd6cb1d08f8..51f328c95f5f 100644 --- a/caffe2/python/operator_test/square_root_divide_op_test.py +++ b/caffe2/python/operator_test/square_root_divide_op_test.py @@ -5,7 +5,6 @@ from caffe2.python import core from functools import partial -from hypothesis import given from hypothesis import strategies as st import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/optimizer_test_util.py b/caffe2/python/optimizer_test_util.py index 02276b08c176..beb8a3781832 100644 --- a/caffe2/python/optimizer_test_util.py +++ b/caffe2/python/optimizer_test_util.py @@ -8,7 +8,6 @@ import unittest import numpy as np from caffe2.python import brew, core, workspace, cnn, optimizer -from caffe2.proto import caffe2_pb2 from caffe2.python.modeling.initializers import ( Initializer, PseudoFP16Initializer) diff --git a/caffe2/python/rnn/lstm_comparison.py b/caffe2/python/rnn/lstm_comparison.py index dee96413dbe5..34fddbc1a66e 100644 --- a/caffe2/python/rnn/lstm_comparison.py +++ b/caffe2/python/rnn/lstm_comparison.py @@ -2,7 +2,6 @@ -from caffe2.proto import caffe2_pb2 from caffe2.python import workspace, core, lstm_benchmark, utils from copy import copy diff --git a/caffe2/python/rnn_cell.py b/caffe2/python/rnn_cell.py index 9c85d0efd2a5..f6da5e126119 100644 --- a/caffe2/python/rnn_cell.py +++ b/caffe2/python/rnn_cell.py @@ -7,7 +7,6 @@ import functools import inspect -import itertools import 
logging import numpy as np import random diff --git a/caffe2/python/scope_test.py b/caffe2/python/scope_test.py index 9bd69eb32902..bf3c8e9a0d06 100644 --- a/caffe2/python/scope_test.py +++ b/caffe2/python/scope_test.py @@ -4,7 +4,6 @@ from caffe2.python import scope, core, workspace -from caffe2.proto import caffe2_pb2 import unittest import threading diff --git a/caffe2/python/test/executor_test_util.py b/caffe2/python/test/executor_test_util.py index ba10247eaa2e..abf63626a7fa 100644 --- a/caffe2/python/test/executor_test_util.py +++ b/caffe2/python/test/executor_test_util.py @@ -14,7 +14,6 @@ import time import numpy as np -from hypothesis import settings CI_MAX_EXAMPLES = 2 diff --git a/caffe2/python/test/inference_lstm_op_test.py b/caffe2/python/test/inference_lstm_op_test.py index 20caab9ba78b..768827bd8876 100644 --- a/caffe2/python/test/inference_lstm_op_test.py +++ b/caffe2/python/test/inference_lstm_op_test.py @@ -1,10 +1,9 @@ #!/usr/bin/env python3 -import inspect import hypothesis.strategies as st import numpy as np import torch -from caffe2.python import core, workspace +from caffe2.python import core from caffe2.python.test_util import TestCase from hypothesis import given, settings from torch import nn diff --git a/caffe2/python/test/python_protobuf_test.py b/caffe2/python/test/python_protobuf_test.py index 7790e0f6d8f5..a407f33fe253 100644 --- a/caffe2/python/test/python_protobuf_test.py +++ b/caffe2/python/test/python_protobuf_test.py @@ -5,9 +5,6 @@ # make sure we use cpp implementation of protobuf import os os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "cpp" - -# import cpp extension first -from caffe2.python import core # then import protobuf from caffe2.proto import caffe2_pb2, metanet_pb2 diff --git a/caffe2/python/trt/test_pt_onnx_trt.py b/caffe2/python/trt/test_pt_onnx_trt.py index 96f1ad76f6b7..5e6abb5c4d0b 100644 --- a/caffe2/python/trt/test_pt_onnx_trt.py +++ b/caffe2/python/trt/test_pt_onnx_trt.py @@ -15,17 +15,13 @@ import os import unittest -from typing import List, Any from PIL import Image import numpy as np import torch -from torch.onnx import OperatorExportTypes import torchvision.models as models import pycuda.driver as cuda -# This import causes pycuda to automatically manage CUDA context creation and cleanup. 
-import pycuda.autoinit import tensorrt as trt TRT_LOGGER = trt.Logger(trt.Logger.WARNING) diff --git a/caffe2/python/trt/test_trt.py b/caffe2/python/trt/test_trt.py index 39d37ca9fa0a..2782cca7c13f 100644 --- a/caffe2/python/trt/test_trt.py +++ b/caffe2/python/trt/test_trt.py @@ -7,7 +7,7 @@ from caffe2.python import core, workspace import onnx import onnx.defs -from onnx.helper import make_node, make_graph, make_tensor, make_tensor_value_info, make_model +from onnx.helper import make_node, make_graph, make_tensor_value_info, make_model from onnx.backend.base import namedtupledict from caffe2.python.models.download import ModelDownloader import caffe2.python.onnx.backend as c2 @@ -16,7 +16,6 @@ from caffe2.python.onnx.tests.test_utils import TestCase import numpy as np import os.path -import json import time import unittest import tarfile diff --git a/caffe2/python/trt/transform.py b/caffe2/python/trt/transform.py index 0936941aac03..1b201007daab 100644 --- a/caffe2/python/trt/transform.py +++ b/caffe2/python/trt/transform.py @@ -12,9 +12,7 @@ from caffe2.proto import caffe2_pb2 -from caffe2.python.onnx.helper import c2_native_run_net, c2_native_run_op -from caffe2.python import core, workspace -import caffe2.python.onnx.frontend as c2_front +from caffe2.python import workspace import caffe2.python._import_c_extension as C import numpy as np diff --git a/caffe2/serialize/inline_container.h b/caffe2/serialize/inline_container.h index a34a6db70115..87c3151bbb76 100644 --- a/caffe2/serialize/inline_container.h +++ b/caffe2/serialize/inline_container.h @@ -12,6 +12,7 @@ #include "caffe2/serialize/istream_adapter.h" #include "caffe2/serialize/read_adapter_interface.h" +#include "caffe2/serialize/versions.h" extern "C" { typedef struct mz_zip_archive mz_zip_archive; @@ -90,68 +91,6 @@ typedef struct mz_zip_archive mz_zip_archive; namespace caffe2 { namespace serialize { -constexpr uint64_t kMinSupportedFileFormatVersion = 0x1L; -constexpr uint64_t kMaxSupportedFileFormatVersion = 0x5L; - -// Versions (i.e. why was the version number bumped?) - -// Note [Dynamic Versions and torch.jit.save vs. torch.save] -// -// Our versioning scheme has a "produced file format version" which -// describes how an archive is to be read. The version written in an archive -// is at least this current produced file format version, but may be greater -// if it includes certain symbols. We refer to these conditional versions -// as "dynamic," since they are identified at runtime. -// -// Dynamic versioning is useful when an operator's semantics are updated. -// When using torch.jit.save we want those semantics to be preserved. If -// we bumped the produced file format version on every change, however, -// then older versions of PyTorch couldn't read even simple archives, like -// a single tensor, from newer versions of PyTorch. Instead, we -// assign dynamic versions to these changes that override the -// produced file format version as needed. That is, when the semantics -// of torch.div changed it was assigned dynamic version 4, and when -// torch.jit.saving modules that use torch.div those archives also have -// (at least) version 4. This prevents earlier versions of PyTorch -// from accidentally performing the wrong kind of division. Modules -// that don't use torch.div or other operators with dynamic versions -// can write the produced file format version, and these programs will -// run as expected on earlier versions of PyTorch. 
-// -// While torch.jit.save attempts to preserve operator semantics, -// torch.save does not. torch.save is analogous to pickling Python, so -// a function that uses torch.div will have different behavior if torch.saved -// and torch.loaded across PyTorch versions. From a technical perspective, -// torch.save ignores dynamic versioning. - -// 1. Initial version -// 2. Removed op_version_set version numbers -// 3. Added type tags to pickle serialization of container types -// 4. (Dynamic) Stopped integer division using torch.div -// (a versioned symbol preserves the historic behavior of versions 1--3) -// 5. (Dynamic) Stops torch.full inferring a floating point dtype -// when given bool or integer fill values. -constexpr uint64_t kProducedFileFormatVersion = 0x3L; - -// the version we write when the archive contains bytecode. -// It must be higher or eq to kProducedFileFormatVersion. -// Because torchscript changes is likely introduce bytecode change. -// If kProducedFileFormatVersion is increased, kProducedBytecodeVersion -// should be increased too. The relationship is: -// kMaxSupportedFileFormatVersion >= (most likely ==) kProducedBytecodeVersion -// >= kProducedFileFormatVersion -constexpr uint64_t kProducedBytecodeVersion = 0x4L; - -static_assert(kProducedBytecodeVersion >= kProducedFileFormatVersion, - "kProducedBytecodeVersion must be higher or equal to kProducedFileFormatVersion."); - -// Introduce kMinSupportedBytecodeVersion for limited backward compatibility -// support of bytecode. If -// kMinSupportedBytecodeVersion <= model_version <= kProducedBytecodeVersion (in loader), -// we should support this model_version. For example, we provide a wrapper to -// handle an updated operator. -constexpr uint64_t kMinSupportedBytecodeVersion = 0x3L; - class TORCH_API PyTorchStreamReader final { public: explicit PyTorchStreamReader(const std::string& file_name); diff --git a/caffe2/serialize/versions.h b/caffe2/serialize/versions.h new file mode 100644 index 000000000000..4da4b2c50305 --- /dev/null +++ b/caffe2/serialize/versions.h @@ -0,0 +1,68 @@ +#pragma once + +namespace caffe2 { +namespace serialize { + +constexpr uint64_t kMinSupportedFileFormatVersion = 0x1L; +constexpr uint64_t kMaxSupportedFileFormatVersion = 0x5L; + +// Versions (i.e. why was the version number bumped?) + +// Note [Dynamic Versions and torch.jit.save vs. torch.save] +// +// Our versioning scheme has a "produced file format version" which +// describes how an archive is to be read. The version written in an archive +// is at least this current produced file format version, but may be greater +// if it includes certain symbols. We refer to these conditional versions +// as "dynamic," since they are identified at runtime. +// +// Dynamic versioning is useful when an operator's semantics are updated. +// When using torch.jit.save we want those semantics to be preserved. If +// we bumped the produced file format version on every change, however, +// then older versions of PyTorch couldn't read even simple archives, like +// a single tensor, from newer versions of PyTorch. Instead, we +// assign dynamic versions to these changes that override the +// produced file format version as needed. That is, when the semantics +// of torch.div changed it was assigned dynamic version 4, and when +// torch.jit.saving modules that use torch.div those archives also have +// (at least) version 4. This prevents earlier versions of PyTorch +// from accidentally performing the wrong kind of division. 
Modules +// that don't use torch.div or other operators with dynamic versions +// can write the produced file format version, and these programs will +// run as expected on earlier versions of PyTorch. +// +// While torch.jit.save attempts to preserve operator semantics, +// torch.save does not. torch.save is analogous to pickling Python, so +// a function that uses torch.div will have different behavior if torch.saved +// and torch.loaded across PyTorch versions. From a technical perspective, +// torch.save ignores dynamic versioning. + +// 1. Initial version +// 2. Removed op_version_set version numbers +// 3. Added type tags to pickle serialization of container types +// 4. (Dynamic) Stopped integer division using torch.div +// (a versioned symbol preserves the historic behavior of versions 1--3) +// 5. (Dynamic) Stops torch.full inferring a floating point dtype +// when given bool or integer fill values. +constexpr uint64_t kProducedFileFormatVersion = 0x3L; + +// the version we write when the archive contains bytecode. +// It must be higher or eq to kProducedFileFormatVersion. +// Because torchscript changes is likely introduce bytecode change. +// If kProducedFileFormatVersion is increased, kProducedBytecodeVersion +// should be increased too. The relationship is: +// kMaxSupportedFileFormatVersion >= (most likely ==) kProducedBytecodeVersion +// >= kProducedFileFormatVersion +constexpr uint64_t kProducedBytecodeVersion = 0x4L; + +static_assert(kProducedBytecodeVersion >= kProducedFileFormatVersion, + "kProducedBytecodeVersion must be higher or equal to kProducedFileFormatVersion."); + +// Introduce kMinSupportedBytecodeVersion for limited backward compatibility +// support of bytecode. If +// kMinSupportedBytecodeVersion <= model_version <= kProducedBytecodeVersion (in loader), +// we should support this model_version. For example, we provide a wrapper to +// handle an updated operator. +constexpr uint64_t kMinSupportedBytecodeVersion = 0x3L; +} // namespace serialize +} // namespace caffe2 diff --git a/docs/source/quantization.rst b/docs/source/quantization.rst index a389de60416a..1cac90ffab86 100644 --- a/docs/source/quantization.rst +++ b/docs/source/quantization.rst @@ -530,6 +530,71 @@ Best Practices ``fbgemm`` backend. This argument prevents overflow on some int8 instructions by reducing the range of quantized data type by 1 bit. +Common Errors +--------------------------------------- + +Passing a non-quantized Tensor into a quantized kernel +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +If you see an error similar to:: + + RuntimeError: Could not run 'quantized::some_operator' with arguments from the 'CPU' backend... + +This means that you are trying to pass a non-quantized Tensor to a quantized +kernel. A common workaround is to use ``torch.quantization.QuantStub`` to +quantize the tensor. This needs to be done manually in Eager mode quantization. +An e2e example:: + + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.quant = torch.quantization.QuantStub() + self.conv = torch.nn.Conv2d(1, 1, 1) + + def forward(self, x): + # during the convert step, this will be replaced with a + # `quantize_per_tensor` call + x = self.quant(x) + x = self.conv(x) + return x + +Passing a quantized Tensor into a non-quantized kernel +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +If you see an error similar to:: + + RuntimeError: Could not run 'aten::thnn_conv2d_forward' with arguments from the 'QuantizedCPU' backend. 
+ +This means that you are trying to pass a quantized Tensor to a non-quantized +kernel. A common workaround is to use ``torch.quantization.DeQuantStub`` to +dequantize the tensor. This needs to be done manually in Eager mode quantization. +An e2e example:: + + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.quant = torch.quantization.QuantStub() + self.conv1 = torch.nn.Conv2d(1, 1, 1) + # this module will not be quantized (see `qconfig = None` logic below) + self.conv2 = torch.nn.Conv2d(1, 1, 1) + self.dequant = torch.quantization.DeQuantStub() + + def forward(self, x): + # during the convert step, this will be replaced with a + # `quantize_per_tensor` call + x = self.quant(x) + x = self.conv1(x) + # during the convert step, this will be replaced with a + # `dequantize` call + x = self.dequant(x) + x = self.conv2(x) + return x + + m = M() + m.qconfig = some_qconfig + # turn off quantization for conv2 + m.conv2.qconfig = None + Modules that provide quantization functions and classes ------------------------------------------------------- diff --git a/mypy-strict.ini b/mypy-strict.ini index 42fc73abf1cc..7cc6fff83577 100644 --- a/mypy-strict.ini +++ b/mypy-strict.ini @@ -31,9 +31,11 @@ strict_equality = True files = tools/codegen/gen.py, tools/autograd/gen_annotated_fn_args.py, + tools/autograd/gen_autograd.py, tools/autograd/gen_python_functions.py, tools/autograd/gen_trace_type.py, tools/autograd/gen_variable_factories.py, + tools/autograd/gen_variable_type.py, tools/autograd/load_derivatives.py, torch/utils/benchmark/utils/common.py, torch/utils/benchmark/utils/timer.py, diff --git a/mypy.ini b/mypy.ini index 8c900bcced76..7d6161bddd17 100644 --- a/mypy.ini +++ b/mypy.ini @@ -104,24 +104,6 @@ ignore_errors = True [mypy-torch._utils] ignore_errors = True -[mypy-torch._overrides] -ignore_errors = True - -[mypy-torch.utils.tensorboard._caffe2_graph] -ignore_errors = True - -[mypy-torch.contrib._tensorboard_vis] -ignore_errors = True - -[mypy-torch.nn.utils.prune] -ignore_errors = True - -[mypy-torch.utils.show_pickle] -ignore_errors = True - -[mypy-torch.utils.hipify.hipify_python] -ignore_errors = True - [mypy-torch.utils.benchmark.examples.*] ignore_errors = True diff --git a/scripts/model_zoo/update-models-from-caffe2.py b/scripts/model_zoo/update-models-from-caffe2.py index fb582a047bc6..d3e46e449d8a 100644 --- a/scripts/model_zoo/update-models-from-caffe2.py +++ b/scripts/model_zoo/update-models-from-caffe2.py @@ -6,15 +6,12 @@ import caffe2.python.workspace as c2_workspace import glob import json -import math import numpy as np import onnx import caffe2.python.onnx.frontend import caffe2.python.onnx.backend import os import shutil -import subprocess -import sys import tarfile import tempfile @@ -25,7 +22,6 @@ from caffe2.python.models.download import downloadFromURLToFile, getURLFromName, deleteDirectory from caffe2.proto import caffe2_pb2 from onnx import numpy_helper -from filechunkio import FileChunkIO """A script converting Caffe2 models to ONNX, and updating ONNX model zoos. 
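For context, the "Common Errors" examples added to ``docs/source/quantization.rst`` above stop after qconfig assignment and assume the standard eager-mode prepare/calibrate/convert flow around them. A minimal sketch of that flow (the qconfig choice, input shapes, and module wiring are illustrative assumptions, not part of this patch)::

    import torch

    class M(torch.nn.Module):
        # mirrors the DeQuantStub example above: conv1 is quantized, conv2 stays in fp32
        def __init__(self):
            super().__init__()
            self.quant = torch.quantization.QuantStub()
            self.conv1 = torch.nn.Conv2d(1, 1, 1)
            self.conv2 = torch.nn.Conv2d(1, 1, 1)
            self.dequant = torch.quantization.DeQuantStub()

        def forward(self, x):
            x = self.quant(x)      # fp32 -> quantized
            x = self.conv1(x)      # runs as a quantized conv after convert
            x = self.dequant(x)    # quantized -> fp32
            return self.conv2(x)   # stays a float conv

    m = M().eval()
    m.qconfig = torch.quantization.get_default_qconfig('fbgemm')
    m.conv2.qconfig = None                            # turn off quantization for conv2
    prepared = torch.quantization.prepare(m)          # insert observers
    prepared(torch.randn(4, 1, 8, 8))                 # calibrate on representative data
    quantized = torch.quantization.convert(prepared)  # swap in quantized modules
    quantized(torch.randn(4, 1, 8, 8))                # conv1 quantized, conv2 in fp32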
diff --git a/scripts/release_notes/categorize.py b/scripts/release_notes/categorize.py index b72eb9094b7b..985d11f2e2bd 100644 --- a/scripts/release_notes/categorize.py +++ b/scripts/release_notes/categorize.py @@ -1,8 +1,7 @@ -import json import argparse import os import textwrap -from common import dict_to_features, categories, topics, get_features, CommitDataCache +from common import categories, topics, CommitDataCache from commitlist import CommitList class Categorizer: diff --git a/scripts/release_notes/commitlist.py b/scripts/release_notes/commitlist.py index fda7c913addd..0a76f896f217 100644 --- a/scripts/release_notes/commitlist.py +++ b/scripts/release_notes/commitlist.py @@ -1,6 +1,6 @@ import argparse from common import run, topics -from collections import namedtuple, defaultdict +from collections import defaultdict import os import csv import pprint diff --git a/setup.py b/setup.py index 01f173d6825b..50983a89ad55 100644 --- a/setup.py +++ b/setup.py @@ -186,7 +186,7 @@ python_min_version_str)) sys.exit(-1) -from setuptools import setup, Extension, distutils, find_packages +from setuptools import setup, Extension, find_packages from collections import defaultdict from distutils import core from distutils.core import Distribution @@ -892,6 +892,7 @@ def print_box(msg): 'include/torch/csrc/jit/serialization/*.h', 'include/torch/csrc/jit/python/*.h', 'include/torch/csrc/jit/testing/*.h', + 'include/torch/csrc/jit/tensorexpr/*.h', 'include/torch/csrc/onnx/*.h', 'include/torch/csrc/utils/*.h', 'include/pybind11/*.h', diff --git a/test/cpp/api/autograd.cpp b/test/cpp/api/autograd.cpp index e4bb96ece6fb..3f79c771c2be 100644 --- a/test/cpp/api/autograd.cpp +++ b/test/cpp/api/autograd.cpp @@ -175,7 +175,7 @@ TEST(AutogradAPITests, AnomalyMode) { auto y = x.pow(1.5); auto gr = grad({y}, {x}, {}, /*retain_graph=*/true, /*create_backward=*/true); - ASSERT_THROWS_WITH(grad({gr[0]}, {x});, "returned nan"); + ASSERT_THROWS_WITH(grad({gr[0]}, {x}, {torch::tensor({0.0})});, "returned nan"); auto msgs = warnings.messages(); ASSERT_EQ(msgs.size(), 2); ASSERT_TRUE( diff --git a/test/cpp/api/misc.cpp b/test/cpp/api/misc.cpp index 160075d0d268..a8d6320e9533 100644 --- a/test/cpp/api/misc.cpp +++ b/test/cpp/api/misc.cpp @@ -82,3 +82,11 @@ TEST_F(AutogradTest, CanPassCustomGradientInputs) { z.sum().backward(torch::ones({}) * 2); ASSERT_TRUE(x.grad().allclose(y * 2)); } + +TEST(UtilsTest, AmbiguousOperatorDefaults) { + auto tmp = at::empty({}, at::kCPU); + at::_test_ambiguous_defaults(tmp); + at::_test_ambiguous_defaults(tmp, 1); + at::_test_ambiguous_defaults(tmp, 1, 1); + at::_test_ambiguous_defaults(tmp, 2, "2"); +} diff --git a/test/cpp/api/tensor_indexing.cpp b/test/cpp/api/tensor_indexing.cpp index efb153fbf481..03600c5c882e 100644 --- a/test/cpp/api/tensor_indexing.cpp +++ b/test/cpp/api/tensor_indexing.cpp @@ -83,27 +83,27 @@ TEST(TensorIndexingTest, TestNoIndices) { ASSERT_THROWS_WITH(tensor.index_put_(indices, value), "Passing an empty index list to Tensor::index_put_() is not valid syntax"); } -TEST(TensorIndexingTest, TestAdvancedIndexingWithArrayRefOfTensor) { +TEST(TensorIndexingTest, TestAdvancedIndexingWithListOfTensor) { { torch::Tensor tensor = torch::randn({20, 20}); torch::Tensor index = torch::arange(10, torch::kLong).cpu(); - torch::Tensor result_with_array_ref = tensor.index(at::ArrayRef({index})); + torch::Tensor result = at::index(tensor, {index}); torch::Tensor result_with_init_list = tensor.index({index}); - ASSERT_TRUE(result_with_array_ref.equal(result_with_init_list)); + 
ASSERT_TRUE(result.equal(result_with_init_list)); } { torch::Tensor tensor = torch::randn({20, 20}); torch::Tensor index = torch::arange(10, torch::kLong).cpu(); - torch::Tensor result_with_array_ref = tensor.index_put_(at::ArrayRef({index}), torch::ones({20})); + torch::Tensor result = at::index_put_(tensor, {index}, torch::ones({20})); torch::Tensor result_with_init_list = tensor.index_put_({index}, torch::ones({20})); - ASSERT_TRUE(result_with_array_ref.equal(result_with_init_list)); + ASSERT_TRUE(result.equal(result_with_init_list)); } { torch::Tensor tensor = torch::randn({20, 20}); torch::Tensor index = torch::arange(10, torch::kLong).cpu(); - torch::Tensor result_with_array_ref = tensor.index_put_(at::ArrayRef({index}), torch::ones({1, 20})); + torch::Tensor result = at::index_put_(tensor, {index}, torch::ones({1, 20})); torch::Tensor result_with_init_list = tensor.index_put_({index}, torch::ones({1, 20})); - ASSERT_TRUE(result_with_array_ref.equal(result_with_init_list)); + ASSERT_TRUE(result.equal(result_with_init_list)); } } @@ -173,7 +173,7 @@ TEST(TensorIndexingTest, TestBoolIndices) { TEST(TensorIndexingTest, TestBoolIndicesAccumulate) { auto mask = torch::zeros({10}, torch::kBool); auto y = torch::ones({10, 10}); - y.index_put_({mask}, y.index({mask}), /*accumulate=*/true); + y.index_put_({mask}, {y.index({mask})}, /*accumulate=*/true); assert_tensor_equal(y, torch::ones({10, 10})); } diff --git a/test/cpp/jit/test_save_load.cpp b/test/cpp/jit/test_save_load.cpp index 2e59358b4e00..e102a6ff767c 100644 --- a/test/cpp/jit/test_save_load.cpp +++ b/test/cpp/jit/test_save_load.cpp @@ -120,5 +120,33 @@ TEST(SerializationTest, TypeTags) { } } +TEST(SerializationTest, TestJitStream_CUDA) { + torch::jit::Module model; + std::vector inputs; + // Deserialize the ScriptModule from a file using torch::jit::load(). + // Load the scripted model. 
This should have been generated by tests_setup.py + // Refer: TorchSaveJitStream_CUDA in test/cpp/jit/tests_setup.py + model = torch::jit::load("saved_stream_model.pt"); + + auto output = model.forward(inputs); + auto list_of_elements = output.toTuple()->elements(); + auto is_stream_s = list_of_elements[0].toBool(); + + // a,b: These are the two input tensors + // c: This is output tensor generated by the operation torch.cat(a,b) + auto a = list_of_elements[1].toTensor(); + auto b = list_of_elements[2].toTensor(); + auto c = list_of_elements[3].toTensor(); + // op: this is used to verify if the cat operation produced the same results + // as that on the GPU with torch.cat + auto op = at::cat({a, b}, 0); + + // Check if the stream is set + ASSERT_TRUE(is_stream_s); + // Check if the sizes of the outputs (op and c) is same on the GPU and CPU + ASSERT_EQ(op.sizes(), c.sizes()); + // Check if both the output tensors are equal + ASSERT_TRUE(op.equal(c)); +} } // namespace jit } // namespace torch diff --git a/test/cpp/jit/tests_setup.py b/test/cpp/jit/tests_setup.py index 68871d1c21d2..928a06d9b5a0 100644 --- a/test/cpp/jit/tests_setup.py +++ b/test/cpp/jit/tests_setup.py @@ -63,11 +63,38 @@ def setup(self): torch.save(value, self.path, _use_new_zipfile_serialization=False) +class TorchSaveJitStream_CUDA(FileSetup): + path = 'saved_stream_model.pt' + + def setup(self): + if not torch.cuda.is_available(): + return + + class Model(torch.nn.Module): + def forward(self): + device_index = torch.cuda._current_device() + s = torch.jit.cuda.Stream(device_index, 0) + a = torch.rand(3, 4, device="cuda") + b = torch.rand(3, 4, device="cuda") + + with torch.jit.cuda.stream(s): + is_stream_s = torch.cuda.current_stream(s.device_index()).id() == s.id() + c = torch.cat((a, b), 0).to("cuda") + s.synchronize() + return is_stream_s, a, b, c + + model = Model() + + # Script the model and save + script_model = torch.jit.script(model) + torch.jit.save(script_model, self.path) + tests = [ EvalModeForLoadedModule(), SerializationInterop(), TorchSaveError(), + TorchSaveJitStream_CUDA() ] def setup(): diff --git a/test/cpp/tensorexpr/test_kernel.cpp b/test/cpp/tensorexpr/test_kernel.cpp index cf658ad488f6..902c2a701197 100644 --- a/test/cpp/tensorexpr/test_kernel.cpp +++ b/test/cpp/tensorexpr/test_kernel.cpp @@ -19,6 +19,65 @@ namespace jit { using namespace torch::indexing; using namespace torch::jit::tensorexpr; +TEST(Kernel, InliningIntermediates) { + // here, each mul has only one use, so it should be completely inlined + { + const auto graph_string = R"IR( + graph(%0 : Float(5, 3, strides=[3, 1], device=cpu), + %1 : Float(5, 3, strides=[3, 1], device=cpu)): + %2 : Float(5, 3, strides=[3, 1]) = aten::mul(%0, %1) + %one : int = prim::Constant[value=1]() + %4 : Float(5, 3, strides=[3, 1]) = aten::mul(%0, %2) + %5: Float(5, 3, strides=[3, 1]) = aten::add(%4, %1, %one) + return (%5))IR"; + KernelScope kernel_scope; + auto graph = std::make_shared(); + parseIR(graph_string, &*graph); + TensorExprKernel k(graph); + auto stmt = k.getCodeGenStmt(); + std::ostringstream oss; + oss << *stmt; + torch::jit::testing::FileCheck().check_not("aten_mul")->run(oss.str()); + } + { + const auto graph_template = R"IR( + graph(%0 : Float(5, 3, strides=[3, 1], device=${device}), + %1 : Float(5, 3, strides=[3, 1], device=${device})): + %2 : Float(5, 3, strides=[3, 1]) = aten::mul(%0, %1) + %one : int = prim::Constant[value=1]() + %3 : Float(5, 3, strides=[3, 1]) = aten::sub(%0, %2, %one) + %4 : Float(5, 3, strides=[3, 1]) = aten::add(%3, 
%0, %one) + %5 : Float(5, 3, strides=[3, 1]) = aten::div(%3, %0) + return (%4, %5))IR"; + for (bool use_cuda : {false, true}) { + if (!torch::cuda::is_available() && use_cuda) { + continue; + } + + KernelScope kernel_scope; + TemplateEnv env; + env.s("device", use_cuda ? "cuda:0" : "cpu"); + const auto graph_string = format(graph_template, env); + auto graph = std::make_shared(); + parseIR(graph_string, &*graph); + auto device = use_cuda ? kCUDA : kCPU; + TensorExprKernel k(graph); + auto stmt = k.getCodeGenStmt(); + std::ostringstream oss; + oss << *stmt; + // aten_mul only has one use, inlined completely + torch::jit::testing::FileCheck().check_not("aten_mul")->run(oss.str()); + + // aten_sub should be removed in cuda, exist in cpu + // 5 uses: allocate, initialize, free and two reads + size_t num_out1_uses = use_cuda ? 0 : 5; + torch::jit::testing::FileCheck() + .check_count("aten_sub", num_out1_uses, /*exactly*/ true) + ->run(oss.str()); + } + } +} + TEST(Kernel, _1) { KernelScope kernel_scope; @@ -714,7 +773,10 @@ TEST(Kernel, Softmax2D) { ver_env.d("softmax_dim", softmax_dim); ver_env.d("softmax_dim_size", softmax_dim_size); const auto verification_pattern = format(verification_template, ver_env); - torch::jit::testing::FileCheck().run(verification_pattern, oss.str()); + + // verification string temporarily disabled until + // inlining of exp() is benchmarked and determined + // torch::jit::testing::FileCheck().run(verification_pattern, oss.str()); std::vector stack = fmap(inputs); k.run(stack); @@ -789,7 +851,10 @@ TEST(Kernel, Softmax3D) { ver_env.d("softmax_dim", softmax_dim); ver_env.d("softmax_dim_size", softmax_dim_size); const auto verification_pattern = format(verification_template, ver_env); - torch::jit::testing::FileCheck().run(verification_pattern, oss.str()); + + // verification string temporarily disabled until + // inlining of exp() is benchmarked and determined + // torch::jit::testing::FileCheck().run(verification_pattern, oss.str()); std::vector stack = fmap(inputs); k.run(stack); @@ -870,7 +935,10 @@ TEST(Kernel, Softmax4D) { ver_env.d("softmax_dim", softmax_dim); ver_env.d("softmax_dim_size", softmax_dim_size); const auto verification_pattern = format(verification_template, ver_env); - torch::jit::testing::FileCheck().run(verification_pattern, oss.str()); + + // verification string temporarily disabled until + // inlining of exp() is benchmarked and determined + // torch::jit::testing::FileCheck().run(verification_pattern, oss.str()); std::vector stack = fmap(inputs); k.run(stack); diff --git a/test/cpp/tensorexpr/test_loopnest.cpp b/test/cpp/tensorexpr/test_loopnest.cpp index af95f9971513..7294aa53c803 100644 --- a/test/cpp/tensorexpr/test_loopnest.cpp +++ b/test/cpp/tensorexpr/test_loopnest.cpp @@ -3649,45 +3649,6 @@ TEST(LoopNest, DeadStoreEliminationWithIntermediates) { torch::jit::testing::FileCheck().run(expected_ir2, oss.str()); } -TEST(LoopNest, InlineOutputBuffers) { - KernelScope kernel_scope; - const int M = 4; - const int N = 5; - const int K = 6; - Placeholder a_buf("a", kFloat, {M, N}); - Placeholder b_buf("b", kFloat, {N, K}); - Tensor* c = Compute( - "broadcast_add", - {{M, "m"}, {N, "n"}, {K, "k"}}, - [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { - return a_buf.load(m, n) + b_buf.load(n, k); - }); - Tensor* out1 = Compute( - "out1", - {{M, "m"}, {N, "n"}, {K, "k"}}, - [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { - return c->call(m, n, k) + 1; - }); - - Tensor* out2 = Compute( - "out2", - {{M, "m"}, {N, "n"}, {K, 
"k"}}, - [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { - return out1->call(m, n, k) / c->call(m, n, k) * 4; - }); - for (const bool inline_outputs : {true, false}) { - LoopNest l({out1, out2}); - l.inlineIntermediateBufs(inline_outputs); - Stmt* stmt1 = l.root_stmt(); - std::ostringstream oss; - oss << *stmt1; - size_t num_out1_uses = inline_outputs ? 1 : 2; - torch::jit::testing::FileCheck() - .check_count("out1", num_out1_uses, /*exactly*/ true) - ->run(oss.str()); - } -} - TEST(LoopNest, CompoundTensorSimple) { KernelScope kernel_scope; diff --git a/test/distributed/test_c10d.py b/test/distributed/test_c10d.py index 5ffd4b4fb088..93e26be7ee98 100644 --- a/test/distributed/test_c10d.py +++ b/test/distributed/test_c10d.py @@ -4641,6 +4641,43 @@ def test_nccl_barrier_timeout_new_group_non_member(self): with self.assertRaisesRegex(RuntimeError, "Timed out initializing process group"): c10d.new_group([0], timeout=timedelta(seconds=1)) + @requires_nccl() + @skip_if_not_multigpu + def test_nccl_barrier_device_ids(self): + store = c10d.FileStore(self.file_name, self.world_size) + c10d.init_process_group( + backend="nccl", + rank=self.rank, + world_size=self.world_size, + store=store) + + c10d.barrier(device_ids=[self.rank]) + + @requires_nccl() + @skip_if_not_multigpu + def test_nccl_barrier_device_ids_function_argument(self): + store = c10d.FileStore(self.file_name, self.world_size) + c10d.init_process_group( + backend="nccl", + rank=self.rank, + world_size=self.world_size, + store=store) + + with self.assertRaisesRegex(RuntimeError, "Invalid function argument"): + c10d.barrier(device_ids=self.rank) + + @requires_gloo() + def test_gloo_barrier_device_ids(self): + store = c10d.FileStore(self.file_name, self.world_size) + c10d.init_process_group( + backend="gloo", + rank=self.rank, + world_size=self.world_size, + store=store) + + with self.assertRaisesRegex(RuntimeError, "device_ids not supported"): + c10d.barrier(device_ids=[self.rank]) + if __name__ == "__main__": assert ( not torch.cuda._initialized diff --git a/test/distributions/test_distributions.py b/test/distributions/test_distributions.py index b057d12a285d..8c927f35fd2e 100644 --- a/test/distributions/test_distributions.py +++ b/test/distributions/test_distributions.py @@ -727,7 +727,7 @@ def _gradcheck_log_prob(self, dist_ctor, ctor_params): # performs gradient checks on log_prob distribution = dist_ctor(*ctor_params) s = distribution.sample() - if s.is_floating_point(): + if not distribution.support.is_discrete: s = s.detach().requires_grad_() expected_shape = distribution.batch_shape + distribution.event_shape @@ -1422,7 +1422,7 @@ def test_uniform(self): self.assertEqual(Uniform(0.0, 1.0).sample((1,)).size(), (1,)) # Check log_prob computation when value outside range - uniform = Uniform(low_1d, high_1d) + uniform = Uniform(low_1d, high_1d, validate_args=False) above_high = torch.tensor([4.0]) below_low = torch.tensor([-1.0]) self.assertEqual(uniform.log_prob(above_high).item(), -inf) @@ -1517,7 +1517,7 @@ def test_halfcauchy(self): def test_halfnormal(self): std = torch.randn(5, 5).abs().requires_grad_() - std_1d = torch.randn(1, requires_grad=True) + std_1d = torch.randn(1).abs().requires_grad_() std_delta = torch.tensor([1e-5, 1e-5]) self.assertEqual(HalfNormal(std).sample().size(), (5, 5)) self.assertEqual(HalfNormal(std).sample((7,)).size(), (7, 5, 5)) @@ -1978,6 +1978,8 @@ def gradcheck_func(samples, mu, sigma, prec, scale_tril): sigma = 0.5 * (sigma + sigma.transpose(-1, -2)) # Ensure symmetry of 
covariance if prec is not None: prec = 0.5 * (prec + prec.transpose(-1, -2)) # Ensure symmetry of precision + if scale_tril is not None: + scale_tril = scale_tril.tril() return MultivariateNormal(mu, sigma, prec, scale_tril).log_prob(samples) gradcheck(gradcheck_func, (mvn_samples, mean, covariance, precision, scale_tril), raise_exception=True) @@ -2643,7 +2645,7 @@ def test_cdf_log_prob(self): for i, param in enumerate(params): dist = Dist(**param) samples = dist.sample() - if samples.dtype.is_floating_point: + if not dist.support.is_discrete: samples.requires_grad_() try: cdfs = dist.cdf(samples) @@ -3050,11 +3052,9 @@ def setUp(self): self.scalar_sample = 1 self.tensor_sample_1 = torch.ones(3, 2) self.tensor_sample_2 = torch.ones(3, 2, 3) - Distribution.set_default_validate_args(True) def tearDown(self): super(TestDistributionShapes, self).tearDown() - Distribution.set_default_validate_args(False) def test_entropy_shape(self): for Dist, params in EXAMPLES: @@ -3186,23 +3186,23 @@ def test_one_hot_categorical_shape(self): self.assertEqual(dist.sample().size(), torch.Size((3,))) self.assertEqual(dist.sample((3, 2)).size(), torch.Size((3, 2, 3))) self.assertRaises(ValueError, dist.log_prob, self.tensor_sample_1) - simplex_sample = self.tensor_sample_2 / self.tensor_sample_2.sum(-1, keepdim=True) - self.assertEqual(dist.log_prob(simplex_sample).size(), torch.Size((3, 2,))) + sample = torch.tensor([0., 1., 0.]).expand(3, 2, 3) + self.assertEqual(dist.log_prob(sample).size(), torch.Size((3, 2,))) self.assertEqual(dist.log_prob(dist.enumerate_support()).size(), torch.Size((3,))) - simplex_sample = torch.ones(3, 3) / 3 - self.assertEqual(dist.log_prob(simplex_sample).size(), torch.Size((3,))) + sample = torch.eye(3) + self.assertEqual(dist.log_prob(sample).size(), torch.Size((3,))) # batched dist = OneHotCategorical(torch.tensor([[0.6, 0.3], [0.6, 0.3], [0.6, 0.3]])) self.assertEqual(dist._batch_shape, torch.Size((3,))) self.assertEqual(dist._event_shape, torch.Size((2,))) self.assertEqual(dist.sample().size(), torch.Size((3, 2))) self.assertEqual(dist.sample((3, 2)).size(), torch.Size((3, 2, 3, 2))) - simplex_sample = self.tensor_sample_1 / self.tensor_sample_1.sum(-1, keepdim=True) - self.assertEqual(dist.log_prob(simplex_sample).size(), torch.Size((3,))) + sample = torch.tensor([0., 1.]) + self.assertEqual(dist.log_prob(sample).size(), torch.Size((3,))) self.assertRaises(ValueError, dist.log_prob, self.tensor_sample_2) self.assertEqual(dist.log_prob(dist.enumerate_support()).size(), torch.Size((2, 3))) - simplex_sample = torch.ones(3, 1, 2) / 2 - self.assertEqual(dist.log_prob(simplex_sample).size(), torch.Size((3, 3))) + sample = torch.tensor([0., 1.]).expand(3, 1, 2) + self.assertEqual(dist.log_prob(sample).size(), torch.Size((3, 3))) def test_cauchy_shape_scalar_params(self): cauchy = Cauchy(0, 1) @@ -3531,12 +3531,15 @@ def __init__(self, probs): [0.2, 0.7, 0.1], [0.33, 0.33, 0.34], [0.2, 0.2, 0.6]]) - pareto = pairwise(Pareto, [2.5, 4.0, 2.5, 4.0], [2.25, 3.75, 2.25, 3.75]) + pareto = (Pareto(torch.tensor([2.5, 4.0, 2.5, 4.0]).expand(4, 4), + torch.tensor([2.25, 3.75, 2.25, 3.75]).expand(4, 4)), + Pareto(torch.tensor([2.25, 3.75, 2.25, 3.8]).expand(4, 4), + torch.tensor([2.25, 3.75, 2.25, 3.75]).expand(4, 4))) poisson = pairwise(Poisson, [0.3, 1.0, 5.0, 10.0]) - uniform_within_unit = pairwise(Uniform, [0.15, 0.95, 0.2, 0.8], [0.1, 0.9, 0.25, 0.75]) + uniform_within_unit = pairwise(Uniform, [0.1, 0.9, 0.2, 0.75], [0.15, 0.95, 0.25, 0.8]) uniform_positive = pairwise(Uniform, [1, 1.5, 2, 
4], [1.2, 2.0, 3, 7]) uniform_real = pairwise(Uniform, [-2., -1, 0, 2], [-1., 1, 1, 4]) - uniform_pareto = pairwise(Uniform, [6.5, 8.5, 6.5, 8.5], [7.5, 7.5, 9.5, 9.5]) + uniform_pareto = pairwise(Uniform, [6.5, 7.5, 6.5, 8.5], [7.5, 8.5, 9.5, 9.5]) continuous_bernoulli = pairwise(ContinuousBernoulli, [0.1, 0.2, 0.5, 0.9]) # These tests should pass with precision = 0.01, but that makes tests very expensive. @@ -4148,8 +4151,8 @@ def test_lazy_logits_initialization(self): probs = param.pop('probs') param['logits'] = probs_to_logits(probs) dist = Dist(**param) - shape = (1,) if not dist.event_shape else dist.event_shape - dist.log_prob(torch.ones(shape)) + # Create new instance to generate a valid sample + dist.log_prob(Dist(**param).sample()) message = 'Failed for {} example 0/{}'.format(Dist.__name__, len(params)) self.assertFalse('probs' in vars(dist), msg=message) try: @@ -4455,7 +4458,6 @@ def test_stack_transform(self): class TestValidation(TestCase): def setUp(self): super(TestCase, self).setUp() - Distribution.set_default_validate_args(True) def test_valid(self): for Dist, params in EXAMPLES: @@ -4475,7 +4477,6 @@ def test_invalid(self): def tearDown(self): super(TestValidation, self).tearDown() - Distribution.set_default_validate_args(False) class TestJit(TestCase): diff --git a/test/elias.py b/test/elias.py deleted file mode 100644 index 74dbc3cbaa09..000000000000 --- a/test/elias.py +++ /dev/null @@ -1,12 +0,0 @@ -import torch - -@torch.jit.script -def foo(x): - return x + x + x - -torch._C._jit_override_can_fuse_on_cpu(True) - -foo(torch.rand([2], requires_grad=False)) -foo(torch.rand([2], requires_grad=False)) -foo(torch.rand([2], requires_grad=False)) -print(torch.jit.last_executed_optimized_graph()) diff --git a/test/jit/test_cuda.py b/test/jit/test_cuda.py new file mode 100644 index 000000000000..f7af8e3a2efc --- /dev/null +++ b/test/jit/test_cuda.py @@ -0,0 +1,476 @@ +import os +import sys +import gc +import unittest + +import torch +from typing import NamedTuple +from torch.testing._internal.jit_utils import JitTestCase +from torch.testing._internal.common_utils import skipIfRocm, skipCUDANonDefaultStreamIf + +# Make the helper files in test/ importable +pytorch_test_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) +sys.path.append(pytorch_test_dir) + +# Check if GPU is available +TEST_CUDA = torch.cuda.is_available() +# Check if multiple GPU's are available +TEST_MULTIGPU = TEST_CUDA and torch.cuda.device_count() >= 2 + +# If GPU is not available, then do not run the tests +if not TEST_CUDA: + print('CUDA not available, skipping tests', file=sys.stderr) + JitTestCase = object # noqa: F811 + +TEST_LARGE_TENSOR = TEST_CUDA + +# If GPU is available, then initialize the cuda context and check +# if there is memory available to allocate for LARGE Tensors. +if TEST_CUDA: + torch.ones(1).cuda() # initialize cuda context + TEST_LARGE_TENSOR = torch.cuda.get_device_properties(0).total_memory >= 5e9 + +if __name__ == "__main__": + raise RuntimeError( + "This test file is not meant to be run directly, use:\n\n" + "\tpython test/test_jit.py TESTNAME\n\n" + "instead." + ) + +class TestCUDA(JitTestCase): + """ + A suite of tests for the CUDA API in TorchScript. 
+ """ + def setUp(self): + super(TestCUDA, self).setUp() + + def tearDown(self): + gc.collect() + torch.cuda.empty_cache() + super(TestCUDA, self).tearDown() + + @skipIfRocm + @unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU") + def test_current_stream(self): + # Test current stream on the device and check if the stream device index + # matches with the device ID + @torch.jit.script + def fn(): + device_index = torch.cuda._current_device() + s0 = torch.cuda.current_stream(device_index) + s1 = torch.cuda.current_stream(1) + s2 = torch.cuda.current_stream(0) + + return s0.device_index(), s1.device_index(), s2.device_index() + + d0, d1, d2 = fn() + + # By default, the current device ID is 0. + self.assertEqual(0, d0) + self.assertEqual(1, d1) + self.assertEqual(0, d2) + self.assertEqual(d0, d2) + + @skipIfRocm + @unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU") + @unittest.skipIf(not TEST_LARGE_TENSOR, "not enough memory") + @skipCUDANonDefaultStreamIf(True) + def test_streams_and_events(self): + # This test checks for the default stream ID is set to 0 on the device + @torch.jit.script + def test_default_streams(): + s0 = torch.cuda.default_stream(0) + s1 = torch.cuda.default_stream(1) + + d = torch.device('cuda:1') + + # Check the current stream id and default id are same + # on the current device. The current device id by default is 0 + s2 = torch.cuda.current_stream(0) + check_s2 = s2.id() == s0.id() + check_d0 = torch.cuda._current_device() == s2.device_index() + + # Set the current device to d1 and check if the stream + # has been set to the default stream on d1 + with torch.jit.cuda.device(d): + s3 = torch.cuda.current_stream(1) + check_s3 = s3.id() == s1.id() + check_d1 = torch.cuda._current_device() == s3.device_index() + + # Check if the current device was reset to 0 + is_device_d0 = torch.cuda._current_device() == s2.device_index() + + return s0.device_index(), s1.device_index(), check_s2, check_s3, check_d0, check_d1, is_device_d0 + + d0, d1, check_s2, check_s3, check_d0, check_d1, is_device_d0 = test_default_streams() + + self.assertEqual(d0, 0) + self.assertEqual(d1, 1) + self.assertTrue(check_s2) + self.assertTrue(check_s3) + self.assertTrue(check_d0) + self.assertTrue(check_d1) + self.assertTrue(is_device_d0) + + # This test checks if the Stream Context manager is a no op + # when the stream is none for `with torch.jit.cuda.stream` + @torch.jit.script + def test_set_none_stream(): + device_index = torch.cuda._current_device() + current_stream = torch.cuda.current_stream(device_index) + default_stream = torch.cuda.default_stream(device_index) + + # When stream is none, check if this operation is a no-op + with torch.jit.cuda.stream(None): + cur_device_index = torch.cuda._current_device() + is_device_index_same = cur_device_index == device_index + is_current_stream_same = torch.cuda.current_stream(cur_device_index).id() == current_stream.id() + is_default_stream_same = torch.cuda.default_stream(device_index).id() == default_stream.id() + + # Check if the device index, current stream and default streams have not changed + are_streams_same = is_device_index_same and is_current_stream_same and is_default_stream_same + return are_streams_same + self.assertTrue(test_set_none_stream()) + + # This test checks if the Device Context manager is a no op + # when the device is none for `with torch.jit.cuda.device` + @torch.jit.script + def test_set_device_none(): + device_index = torch.cuda._current_device() + # When device is none, check if this operation is a no-op + 
with torch.jit.cuda.device(None): + # Check if the current device is the same + is_device_same = torch.cuda._current_device() == device_index + return is_device_same + self.assertTrue(test_set_device_none()) + + # Check if a CUDA JIT stream is created + # on the _current_device + @torch.jit.script + def test_simple_stream(): + device_index = torch.cuda._current_device() + s = torch.jit.cuda.Stream(device_index, 0) + return device_index == s.device_index() + + self.assertTrue(test_simple_stream(), "Could not create Stream!") + + # Class used to store results for the test: test_get_stream. + class Result(NamedTuple): + t1 : torch.Tensor + t2 : torch.Tensor + is_current_and_default_stream_same : bool + is_default_and_user_stream_not_same : bool + is_stream_set : bool + is_stream_reset : bool + default_stream_query : bool + default_stream_id : int + user_stream_id : int + + # The test aims at checking different stream proporties. + @torch.jit.script + def test_get_stream(): + device_index = torch.cuda._current_device() + current_stream = torch.cuda.current_stream(device_index) + default_stream = torch.cuda.default_stream(device_index) + user_stream = torch.jit.cuda.Stream(device_index, 0) + + # Check if the current and default streams are the same on the device + is_current_and_default_stream_same = current_stream.id() == default_stream.id() + # Check if user stream and default stream are not the same on the device + is_default_and_user_stream_not_same = default_stream.id() != user_stream.id() + + with torch.jit.cuda.stream(user_stream): + is_stream_set = torch.cuda.current_stream(device_index).id() == user_stream.id() + + # Check if the stream was reset to current_stream + is_stream_reset = torch.cuda.current_stream(device_index).id() == current_stream.id() + + tensor1 = torch.rand(10000, 10000, device="cuda") + tensor2 = torch.mm(tensor1, tensor1).to("cuda") + default_stream.synchronize() + default_stream_query = default_stream.query() + + # Capture all the results in the class Result + res = Result( + tensor1, tensor2, is_current_and_default_stream_same, + is_default_and_user_stream_not_same, is_stream_set, + is_stream_reset, default_stream_query, default_stream.id(), user_stream.id()) + return res + + result = test_get_stream() + + self.assertEqual(torch.matmul(result.t1, result.t1), result.t2) + self.assertTrue(result.is_current_and_default_stream_same) + self.assertTrue(result.is_default_and_user_stream_not_same) + self.assertTrue(result.is_stream_set) + self.assertTrue(result.is_stream_reset) + self.assertTrue(result.default_stream_query) + self.assertEqual(result.default_stream_id, 0) # Check if the default stream ID is always 0 + self.assertNotEqual(result.user_stream_id, 0) # Check if the user stream is always non zero + + # Test the stream context manager. This test checks if the stream is switched + # to the user stream on using the stream context manager. 
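For readers unfamiliar with the stream context manager being tested here, the eager-mode CUDA API follows the same pattern outside TorchScript. A minimal sketch for comparison, assuming a CUDA-capable machine (tensor sizes and variable names are illustrative, not part of the patch):

```python
import torch

if torch.cuda.is_available():
    s = torch.cuda.Stream()                   # user-created stream on the current device
    a = torch.rand(1000, 1000, device="cuda")
    with torch.cuda.stream(s):                # eager-mode analogue of torch.jit.cuda.stream
        b = torch.mm(a, a)                    # queued on s instead of the default stream
    s.synchronize()                           # block until the matmul queued on s has finished
    # On exiting the context, the previously current stream becomes current again.
```

The TorchScript test below checks the same restore behaviour by comparing stream ids before and after the `with` block.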
+ @torch.jit.script + def test_stream_context(): + device_index = torch.cuda._current_device() + current_stream = torch.cuda.current_stream(device_index) + user_stream = torch.jit.cuda.Stream(device_index, 0) + A = torch.rand(1000, 1000, device="cuda") + + with torch.jit.cuda.stream(user_stream): + check = torch.cuda.current_stream(device_index).id() == user_stream.id() + B = torch.mm(A, A).to("cuda") + # Wait for B to be computed + user_stream.synchronize() + # Check if the stream has been reset on the current device + is_stream_reset = torch.cuda.current_stream(device_index).id() == current_stream.id() + + return A, B, check, is_stream_reset + + A, B, is_stream_set, is_stream_reset = test_stream_context() + self.assertEqual(torch.matmul(A, A), B) + self.assertTrue(is_stream_set, "Error: Current stream was not set to user stream!") + self.assertTrue(is_stream_reset, "Error: The stream was not restored to previous stream!") + + # Test multiple nested streams. Check if the operations are computed as expected on the streams + # This test has been adapted from the eager mode tests available at test/test_cuda.py + @torch.jit.script + def test_multiple_stream(): + prev_device_index = torch.cuda._current_device() + prev_current_stream = torch.cuda.current_stream(prev_device_index) + s1 = torch.jit.cuda.Stream(0, 0) + s2 = torch.jit.cuda.Stream(1, 0) + + A = torch.rand(1000, 1000, device="cuda") + B = torch.rand(1000, 1000, device="cuda") + with torch.jit.cuda.stream(s1): + C = torch.mm(A, A).to("cuda") + # Check if the stream and device have been set to s1 + is_stream_s1 = torch.cuda.current_stream(s1.device_index()).id() == s1.id() + is_device_s1 = torch.cuda._current_device() == s1.device_index() + with torch.jit.cuda.stream(s2): + # Check if the stream and device have been set to s2 + is_stream_s2 = torch.cuda.current_stream(s2.device_index()).id() == s2.id() + is_device_s2 = torch.cuda._current_device() == s2.device_index() + D = torch.mm(B, B).to("cuda") + # Check if the stream and device have been set to s1 + is_stream_s1_after = torch.cuda.current_stream(s1.device_index()).id() == s1.id() + is_device_s1_after = torch.cuda._current_device() == s1.device_index() + # Wait for D to be computed + s2.synchronize() + # Wait for C to be computed on S1 + s1.synchronize() + + # Check if the stream and device has been restored to previous stream and device + is_device_current = torch.cuda._current_device() == prev_device_index + is_stream_current = torch.cuda.current_stream(prev_device_index).id() == prev_current_stream.id() + + check_stream = is_stream_s1 and is_stream_s2 and is_stream_s1_after and is_stream_current + check_device = is_device_s1 and is_device_s2 and is_device_s1_after and is_device_current + return A, B, C, D, check_stream, check_device + A, B, C, D, check_stream, check_device = test_multiple_stream() + + self.assertEqual(torch.matmul(A, A), C) + self.assertEqual(torch.matmul(B, B), D) + self.assertTrue(check_stream) + self.assertTrue(check_device) + + # Test multiple streams waiting on each other for the operations to be completed. 
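The data-dependency test that follows uses the usual record/wait pattern to order work across two streams. A rough eager-mode sketch of that pattern, again assuming CUDA is available (names are illustrative):

```python
import torch

if torch.cuda.is_available():
    s1, s2 = torch.cuda.Stream(), torch.cuda.Stream()
    done = torch.cuda.Event()
    x = torch.rand(1000, 1000, device="cuda")
    with torch.cuda.stream(s1):
        y = torch.mm(x, x)
        s1.record_event(done)     # mark the point on s1 where y is ready
    s2.wait_event(done)           # work queued on s2 after this waits for `done`
    with torch.cuda.stream(s2):
        z = torch.mm(y, y)        # safe: ordered after the first matmul
    s2.synchronize()
```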
+ @torch.jit.script + def test_data_dependency_between_streams(): + device_index = torch.cuda._current_device() + prev_current_stream = torch.cuda.current_stream(device_index) + s1 = torch.jit.cuda.Stream(0, 0) + s2 = torch.jit.cuda.Stream(0, 0) + event = torch.jit.cuda.Event(False, False, False) + + A = torch.rand(1000, 1000, device="cuda") + with torch.jit.cuda.stream(s1): + is_stream_s1 = torch.cuda.current_stream(device_index).id() == s1.id() + B = torch.mm(A, A).to("cuda") + s1.record_event(event) + # Check if the current_stream is reset + is_current_stream_1 = torch.cuda.current_stream(device_index).id() == prev_current_stream.id() + # Wait for ops on s1 to be computed + s2.wait_event(event) + with torch.jit.cuda.stream(s2): + is_stream_s2 = torch.cuda.current_stream(device_index).id() == s2.id() + C = torch.mm(B, B).to("cuda") + # Wait for C to be computed + s2.synchronize() + # Check if the current_stream is reset + is_current_stream_2 = torch.cuda.current_stream(device_index).id() == prev_current_stream.id() + + check_stream = is_current_stream_1 and is_current_stream_2 and is_stream_s1 and is_stream_s2 + return A, B, C, check_stream + + A, B, C, check_stream = test_data_dependency_between_streams() + self.assertEqual(torch.matmul(A, A), B) + self.assertEqual(torch.matmul(B, B), C) + self.assertTrue(check_stream) + + # Test a simple CUDA event. Test if the CUDA event was created successfully + @torch.jit.script + def test_simple_event(): + e = torch.jit.cuda.Event(True, False, False) + return e is not None + self.assertTrue(test_simple_event(), "Could not create CUDA Event!") + + # Record the CUDA event for operation torch.mm on the current stream + # and then test if the elapsed time is greater than 0. This test is also + # an adaption from eager mdoe CUDA tests available at test/test_cuda.py + @torch.jit.script + def test_event(): + device_index = torch.cuda._current_device() + stream = torch.cuda.current_stream(device_index) + event = torch.jit.cuda.Event(True, False, False) + is_true_event_query = event.query() + start_event = torch.jit.cuda.Event(True, False, False) + stream.record_event(start_event) + tensor1 = torch.rand(1000000000, 1000000000, device="cuda") + tensor2 = torch.mm(tensor1, tensor1).to("cuda") + stream.record_event(event) + event.synchronize() + is_again_true_event_query = event.query() + + if not (is_true_event_query and is_again_true_event_query): + return -1.0 + return start_event.elapsed_time(event) + + self.assertGreater(test_event(), 0) + + # Check for stream synchronization , when a large tensor multiplication is + # computed on the stream. The stream.query should be true once the synchroniztion is done + @torch.jit.script + def test_stream_synchronize() -> float: + device_index = torch.cuda._current_device() + s = torch.jit.cuda.Stream(device_index, 0) + e_tik = torch.jit.cuda.Event(True, False, False) + e_tok = torch.jit.cuda.Event(True, False, False) + + e_tik.record(s) + tensor1 = torch.rand(1000000000, 1000000000, device="cuda") + with torch.jit.cuda.stream(s): + tensor2 = torch.mm(tensor1, tensor1).to("cuda") + s.synchronize() + e_tok.record(s) + e_tok.synchronize() + + if not s.query(): + return -1.0 + + # not necessary to check e_tik and e_tok, as elapsed_time would throw + # exception if otherwise. + return e_tik.elapsed_time(e_tok) + self.assertGreater(test_stream_synchronize(), 0) + + # Test event synchronization for the event that records a stream doing + # a large tensor multiplication. 
Check if the elapsed time is greater than 0 + # and the stream.query evaluates to true. + @torch.jit.script + def test_event_synchronize() -> float: + device_index = torch.cuda._current_device() + s = torch.jit.cuda.Stream(device_index, 0) + e_tik = torch.jit.cuda.Event(True, False, False) + e_tok = torch.jit.cuda.Event(True, False, False) + + e_tik.record(s) + tensor1 = torch.rand(1000000000, 1000000000, device="cuda") + with torch.jit.cuda.stream(s): + tensor = torch.mm(tensor1, tensor1).to("cuda") + s.record_event(e_tok) + e_tok.synchronize() + s.synchronize() + + if not s.query(): + return -1.0 + + # not necessary to check e_tik and e_tok, as elapsed_time would throw + # exception if otherwise. + return e_tik.elapsed_time(e_tok) + + self.assertGreater(test_event_synchronize(), 0) + + # Test for event wait. Check if event waits for the all the operations on + # the stream to be done. Check for synchronizations and query on the streams + # and events. This test is adapted from eager mode tests for CUDA. Please refer + # test/test_cuda.py + @torch.jit.script + def test_event_wait() -> float: + device_index = torch.cuda._current_device() + s0 = torch.cuda.current_stream(device_index) + s1 = torch.jit.cuda.Stream(device_index, 0) + e_tik = torch.jit.cuda.Event(True, True, False) + e_tok = torch.jit.cuda.Event(True, True, False) + + e_tik.record(s0) + tensor1 = torch.rand(1000000000, 1000000000, device="cuda") + with torch.jit.cuda.stream(s0): + tensor2 = torch.mm(tensor1, tensor1).cuda() + e_sync = torch.jit.cuda.Event(True, False, False) + e_sync.record(torch.cuda.current_stream(device_index)) + e_sync.wait(s1) + with torch.jit.cuda.stream(s1): + tensor3 = torch.rand(1000000000, 1000000000, device="cuda") + tensor4 = torch.mm(tensor3, tensor3).cuda() + s1.synchronize() + e_tok.record(torch.cuda.current_stream(device_index)) + e_tok.synchronize() + s0.synchronize() + + if not s0.query() or not s1.query() or not e_sync.query(): + return -1.0 + + # not necessary to check e_tik and e_tok, as elapsed_time would throw + # exception if otherwise. + return e_tik.elapsed_time(e_tok) + self.assertGreater(test_event_wait(), 0) + + # Test for stream wait_event. 
Checks if the stream waits on the event + @torch.jit.script + def test_wait_event(): + d1 = torch.device('cuda:1') + + with torch.jit.cuda.device(d1): + s0 = torch.cuda.current_stream(1) + tensor1 = torch.rand(1000000000, 1000000000, device="cuda") + tensor2 = torch.mm(tensor1, tensor1).to("cuda") + e0 = torch.jit.cuda.Event(False, False, False) + s0.record_event(e0) + + s1 = torch.cuda.current_stream(0) + s1.wait_event(e0) + s1.synchronize() + + return e0.query() and s0.query() and s1.query() + self.assertTrue(test_wait_event()) + + # Test if a scripted module with cuda streams can be saved, loaded and executed + def test_save_load(self): + class Model(torch.nn.Module): + def forward(self): + device_index = torch.cuda._current_device() + s = torch.jit.cuda.Stream(device_index, 0) + a = torch.rand(3, 4, device="cuda") + b = torch.rand(3, 4, device="cuda") + + with torch.jit.cuda.stream(s): + is_stream_s = torch.cuda.current_stream(s.device_index()).id() == s.id() + c = torch.cat((a, b), 0).cuda() + s.synchronize() + return is_stream_s, a, b, c + + model = Model() + + # Script the model and save + script_model = torch.jit.script(model) + is_stream_s, a, b, c = script_model() + # Verify if the output is correct + self.assertTrue(is_stream_s) + self.assertEqual(torch.cat((a, b), 0), c) + + # Save and load scripted model + load_model = self.getExportImportCopy(script_model) + is_stream_s, a_load, b_load, c_load = load_model() + self.assertTrue(is_stream_s) + self.assertEqual(torch.cat((a_load, b_load), 0), c_load) diff --git a/test/jit/test_recursive_script.py b/test/jit/test_recursive_script.py index bd9a2bb32b89..a0dc99a4e463 100644 --- a/test/jit/test_recursive_script.py +++ b/test/jit/test_recursive_script.py @@ -495,6 +495,59 @@ def forward(self, x): self.checkModule(M(), (torch.randn(5, 5),)) + def test_prepare_scriptable_basic(self): + class SeluButReluWhenScripted(torch.nn.SELU): + def __prepare_scriptable__(self): + return nn.ReLU() + + t = torch.randn(5, 5) + m = SeluButReluWhenScripted() + sm = torch.jit.script(m) + eager_out = m(t) + script_out = sm(t) + self.assertNotEqual(eager_out, script_out) + + def test_prepare_scriptable_iterable_modules(self): + class SeluButReluWhenScripted(torch.nn.SELU): + def __prepare_scriptable__(self): + return nn.ReLU() + + class M(torch.nn.Module): + def __init__(self): + super(M, self).__init__() + shared = SeluButReluWhenScripted() + self.sequential = nn.Sequential( + SeluButReluWhenScripted(), + SeluButReluWhenScripted(), + nn.Sequential(SeluButReluWhenScripted(), shared, SeluButReluWhenScripted()), + shared, + ) + self.module_list = nn.ModuleList([SeluButReluWhenScripted(), + shared, + SeluButReluWhenScripted()]) + + def forward(self, x): + for mod in self.module_list: + x += mod(x) + x += self.sequential(x) + return x + + t = torch.randn(5, 5) + m = M() + eager_out = m(t.clone()) + sm = torch.jit.script(m) + script_out = sm(t.clone()) + self.assertNotEqual(eager_out, script_out) + + def test_prepare_scriptable_cycle(self): + t = torch.randn(5, 5) + c = torch.nn.Module() + p = torch.nn.Module() + c.__dict__["_p"] = p + p.__dict__["_c"] = c + + sm = torch.jit.script(p) + def test_attributes(self): @torch.jit.script class Inner2(object): diff --git a/test/jit/test_torchbind.py b/test/jit/test_torchbind.py index 31eec81d480a..7f43b31fe6ec 100644 --- a/test/jit/test_torchbind.py +++ b/test/jit/test_torchbind.py @@ -62,6 +62,32 @@ def f(): return ss1.pop() + ss2.pop() test_equality(f, lambda x: x) + # test nn module with prepare_scriptable function 
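Both the recursive-script tests above and the torchbind wrapper below lean on the `__prepare_scriptable__` hook: when a module defines it, `torch.jit.script` compiles the module returned by the hook rather than the original instance. A minimal sketch of the hook, with an illustrative module name (mirroring the SELU-to-ReLU tests rather than any actual PyTorch source):

```python
import torch
import torch.nn as nn

class EagerOnlyModule(nn.Module):
    """Behaves like SELU eagerly, but swaps itself for ReLU when scripted."""
    def forward(self, x):
        return torch.nn.functional.selu(x)

    def __prepare_scriptable__(self):
        # torch.jit.script compiles this replacement instead of `self`.
        return nn.ReLU()

m = EagerOnlyModule()
sm = torch.jit.script(m)
x = torch.tensor([-1.0, 2.0])
print(m(x))   # SELU output: negative values mapped smoothly, not clamped to zero
print(sm(x))  # tensor([0., 2.]) -- the scripted module is the ReLU replacement
```

The torchbind test below uses the same mechanism to swap a plain Python helper for a `torch.classes._TorchScriptTesting._Foo` instance before scripting.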
+ class NonJitableClass(object): + def __init__(self, int1, int2): + self.int1 = int1 + self.int2 = int2 + + def return_vals(self): + return self.int1, self.int2 + + class CustomWrapper(torch.nn.Module): + def __init__(self, foo): + super(CustomWrapper, self).__init__() + self.foo = foo + + def forward(self) -> None: + self.foo.increment(1) + return + + def __prepare_scriptable__(self): + int1, int2 = self.foo.return_vals() + foo = torch.classes._TorchScriptTesting._Foo(int1, int2) + return CustomWrapper(foo) + + foo = CustomWrapper(NonJitableClass(1, 2)) + jit_foo = torch.jit.script(foo) + def test_torchbind_take_as_arg(self): global StackString # see [local resolution in python] StackString = torch.classes._TorchScriptTesting._StackString diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py index b0c7143d0129..26896bc17863 100644 --- a/test/onnx/test_pytorch_onnx_onnxruntime.py +++ b/test/onnx/test_pytorch_onnx_onnxruntime.py @@ -1554,6 +1554,35 @@ def forward(self, x, update): update = torch.randn(4, 1, 3, 2) self.run_test(IndexPutModel2(), (x, update)) + @skipIfUnsupportedMinOpsetVersion(11) + def test_index_put_loop(self): + @torch.jit.script + def ngram_attention_bias(sequence_length: int, ngram: int, device: torch.device, dtype: torch.dtype): + bias = torch.ones((ngram, sequence_length), device=device, dtype=dtype) * float("-inf") + for stream_idx in range(ngram): + for i in range(sequence_length): + bias[stream_idx, i] = 5 + return bias + + class ScriptModel(torch.nn.Module): + def __init__(self): + super(ScriptModel, self).__init__() + self.ngram = 2 + self.max_target_positions = 512 + + def forward(self, hidden_states): + seq_length, batch_size = hidden_states.shape[:2] + predict_causal_mask = ngram_attention_bias( + self.max_target_positions, self.ngram, hidden_states.device, hidden_states.dtype + ) + predict_causal_mask = predict_causal_mask[:, :seq_length] + return predict_causal_mask + + x = torch.randn(6, 2) + y = torch.randn(4, 1) + self.run_test(ScriptModel(), x, input_names=['x'], + dynamic_axes={'x': {0: 'seq_length', 1: 'batch_size'}}, test_with_inputs=[y]) + @skipIfUnsupportedMinOpsetVersion(11) def test_copy_(self): class CopyModel(torch.nn.Module): @@ -2105,6 +2134,31 @@ def forward(self, input): model = VarianceUnbiased() self.run_test(model, x) + def test_var_mean_mixed_dims(self): + class ReverseDims(torch.nn.Module): + def forward(self, input): + return torch.var_mean(input, dim=(2, 1), unbiased=False) + + x = torch.randn(2, 3, 4) + model = ReverseDims() + self.run_test(model, x) + + class SkipDims(torch.nn.Module): + def forward(self, input): + return torch.var_mean(input, dim=(0, 2), unbiased=False) + + x = torch.randn(2, 3, 4) + model = SkipDims() + self.run_test(model, x) + + class NonZeroDims(torch.nn.Module): + def forward(self, input): + return torch.var_mean(input, dim=(1, 2), unbiased=False) + + x = torch.randn(2, 3, 4) + model = NonZeroDims() + self.run_test(model, x) + def test_var_mean_keepdim(self): class Variance(torch.nn.Module): def forward(self, input): diff --git a/test/quantization/test_quantize.py b/test/quantization/test_quantize.py index 067c35bd3c64..c47982f0c0cc 100644 --- a/test/quantization/test_quantize.py +++ b/test/quantization/test_quantize.py @@ -726,6 +726,20 @@ def forward(self, x): ref_res = ref_m(data) self.assertEqual(res, ref_res) + @skipIfNoFBGEMM + def test_convtranspose_per_channel_fails_early(self): + r""" + Verifies that attempting to quantize a ConvTranspose module with 
per-Channel + weight observers fails in the prepare step, as opposed to the convert step. + """ + m = torch.nn.Sequential(torch.nn.ConvTranspose2d(1, 1, 1)) + m.qconfig = torch.quantization.get_default_qconfig('fbgemm') + with self.assertRaises(AssertionError) as context: + mp = torch.quantization.prepare(m) + self.assertTrue( + str(context.exception) == + 'Per channel weight observer is not supported yet for ConvTranspose{n}d.') + @skipIfNoFBGEMM class TestPostTrainingDynamic(QuantizationTestCase): diff --git a/test/quantization/test_quantize_fx.py b/test/quantization/test_quantize_fx.py index 545e70a2c5e6..d014bd31f02e 100644 --- a/test/quantization/test_quantize_fx.py +++ b/test/quantization/test_quantize_fx.py @@ -1278,6 +1278,21 @@ def test_fp32_input_fp32_output(self): self._test_quantized_inputs_outputs( prepare_custom_config_dict, prepare_count_check, convert_count_check) + @skipIfNoFBGEMM + def test_convtranspose_per_channel_fails_early(self): + r""" + Verifies that attempting to quantize a ConvTranspose module with per-Channel + weight observers fails in the prepare step, as opposed to the convert step. + """ + m = torch.nn.Sequential(torch.nn.ConvTranspose2d(1, 1, 1)) + m.eval() + qconfig_dict = {'': torch.quantization.get_default_qconfig('fbgemm')} + with self.assertRaises(AssertionError) as context: + mp = prepare_fx(m, qconfig_dict) + self.assertTrue( + str(context.exception) == + 'Per channel weight observer is not supported yet for ConvTranspose{n}d.') + @skipIfNoFBGEMM class TestQuantizeFxOps(QuantizationTestCase): """Unit tests for individual ops diff --git a/test/quantization/test_quantized_op.py b/test/quantization/test_quantized_op.py index c676ccc0f793..a192eddca234 100644 --- a/test/quantization/test_quantized_op.py +++ b/test/quantization/test_quantized_op.py @@ -23,7 +23,7 @@ from torch.testing._internal.common_utils import IS_PPC, TEST_WITH_UBSAN, IS_MACOS from torch.testing._internal.common_quantization import skipIfNoFBGEMM from torch.testing._internal.common_quantized import _quantize, _dequantize, _calculate_dynamic_qparams, \ - override_quantized_engine, supported_qengines, override_qengines + override_quantized_engine, supported_qengines, override_qengines, _snr from torch.testing._internal.common_quantized import qengine_is_qnnpack from torch.quantization import PerChannelMinMaxObserver @@ -2314,6 +2314,87 @@ def test_advanced_indexing(self): torch.quantize_per_tensor(x_fp32_s4, scale, zp, dtype) self.assertEqual(x_q_s4, x_fp32_s4_ref) + @override_qengines + def test_custom_module_lstm(self): + qengine = torch.backends.quantized.engine + + batch_size = 4 + seq_len = 8 + input_size = 12 + + hidden_size = 8 + num_layers = 2 + + dropout = 0 # This is not supported + + Bias = [False, True] + Batch_first = [False, True] + Bidirectional = [False, True] + + dtype = np.uint8 + qtype = torch.quint8 + + custom_module_config = { + 'float_to_observed_custom_module_class': { + torch.nn.LSTM: torch.nn.quantizable.LSTM + } + } + + x = np.random.randn(seq_len, batch_size, input_size) + scale, zero_point = _calculate_dynamic_qparams(x, dtype=dtype) + x = torch.from_numpy(x).to(torch.float) + qx = torch.quantize_per_tensor(x, scale=scale, zero_point=zero_point, + dtype=qtype) + x = qx.dequantize() + + with torch.no_grad(): + for bias, batch_first, bidirectional in itertools.product( + Bias, Batch_first, Bidirectional): + # Assume 12dB is sufficient for functional equivalence + # Without the bias, linear performs poorly + min_power = 10 if bias else 5 + max_mse = 5e-6 if bias 
else 5e-1 + + if batch_first: + x = x.reshape(batch_size, seq_len, input_size) + qx = qx.reshape(batch_size, seq_len, input_size) + else: + x = x.reshape(seq_len, batch_size, input_size) + qx = qx.reshape(seq_len, batch_size, input_size) + + lstm = torch.nn.Sequential( + torch.nn.LSTM(input_size, hidden_size, + num_layers=num_layers, + bias=bias, batch_first=batch_first, + dropout=dropout, + bidirectional=bidirectional)) + lstm.eval() + y_ref = lstm(x) + + # Prepare + lstm.qconfig = torch.quantization.get_default_qconfig(qengine) + lstm_prepared = torch.quantization.prepare( + lstm, prepare_custom_config_dict=custom_module_config) + self.assertTrue(hasattr(lstm_prepared[0], 'layers')) + self.assertEqual(num_layers, len(lstm_prepared[0].layers)) + + # Calibrate + y = lstm_prepared(x) + self.assertEqual(y_ref, y) + + # Quantize + lstm_quantized = torch.quantization.convert(lstm_prepared) + qy = lstm_quantized(qx) + + snr = _snr(y, qy) + snr = [snr[0]] + snr[1] + + for signal, mse, power in snr: + self.assertTrue( + power > min_power or mse < max_mse, + msg=(f"Error is too high: SNR(dB): {power}, " + f"Signal: {signal}, MSE: {mse}")) + class TestDynamicQuantizedLinear(TestCase): """Tests the correctness of the dynamic quantized linear and linear_relu op.""" @@ -3346,7 +3427,7 @@ def _make_qconv_tensors( self, batch_size, input_channels_per_group, input_feature_map_shape, output_channels_per_group, groups, kernels, strides, pads, dilations, X_scale, X_zero_point, W_scale, W_zero_point, - use_bias, use_channelwise, use_transpose, memory_format=torch.contiguous_format + use_bias, use_channelwise, use_transpose ): assert not (use_channelwise and use_transpose), \ "Cannot generate channelwise qconv_transpose_tensors " @@ -3394,7 +3475,6 @@ def _make_qconv_tensors( (batch_size, input_channels,) + input_feature_map_shape, ) X = X_scale * (X_init - X_zero_point).float() - X = X.to(memory_format=memory_format) if use_channelwise: W_shape = (-1, 1) + (1,) * len(kernels) @@ -3427,15 +3507,13 @@ def _test_qconv_impl( input_channels_per_group, input_feature_map_shape, output_channels_per_group, groups, kernels, strides, pads, o_pads, dilations, X_scale, X_zero_point, W_scale, W_zero_point, Y_scale, - Y_zero_point, use_bias, use_relu, use_channelwise, use_transpose, - memory_format=torch.contiguous_format + Y_zero_point, use_bias, use_relu, use_channelwise, use_transpose ): (X, W), (X_q, W_q), bias_float = self._make_qconv_tensors( batch_size, input_channels_per_group, input_feature_map_shape, output_channels_per_group, groups, kernels, strides, pads, dilations, X_scale, X_zero_point, W_scale, - W_zero_point, use_bias, use_channelwise, use_transpose, - memory_format) + W_zero_point, use_bias, use_channelwise, use_transpose) # Assign weights W = W_q.dequantize() X = X_q.dequantize() @@ -3483,14 +3561,6 @@ def _test_qconv_impl( pads: {pads}, o_pads: {o_pads}, dilations: {dilations}, groups: {groups}, y_s: {Y_scale}, y_zp: {Y_zero_point}''') - # fbgemm for now forces output to be NHWC (channels last) to opportunistically - # improve performance - if torch.backends.quantized.engine == 'qnnpack': - # Make sure memory format is preserved - self.assertEqual( - X_q.is_contiguous(memory_format=memory_format), - Y_q.is_contiguous(memory_format=memory_format)) - # Return the quantized data for later reuse return X_q, W_q, bias_float @@ -3563,14 +3633,12 @@ def test_qconv2d( dilations, groups, ) - for memory_format in (torch.contiguous_format, torch.channels_last): - self._test_qconv_impl( - qconv, qconv_prepack, 
conv_op, batch_size, - input_channels_per_group, (height, width), - output_channels_per_group, groups, kernels, strides, pads, None, - dilations, X_scale, X_zero_point, W_scale, W_zero_point, - Y_scale, Y_zero_point, use_bias, use_relu, use_channelwise, False, - memory_format) + self._test_qconv_impl( + qconv, qconv_prepack, conv_op, batch_size, + input_channels_per_group, (height, width), + output_channels_per_group, groups, kernels, strides, pads, None, + dilations, X_scale, X_zero_point, W_scale, W_zero_point, + Y_scale, Y_zero_point, use_bias, use_relu, use_channelwise, False) """Tests the correctness of quantized convolution op.""" @given(batch_size=st.integers(1, 3), @@ -4163,7 +4231,6 @@ def test_qconv3d_unpack( (stride_d, stride_h, stride_w), (pad_d, pad_h, pad_w), (o_pad, o_pad, o_pad), channelwise) - class TestPadding(TestCase): @given(batch_size=st.integers(1, 64), channels=st.integers(1, 64), diff --git a/test/quantization/test_workflow_module.py b/test/quantization/test_workflow_module.py index 22751697cd1d..8a70ae149c29 100644 --- a/test/quantization/test_workflow_module.py +++ b/test/quantization/test_workflow_module.py @@ -10,6 +10,7 @@ PlaceholderObserver, NoopObserver, FakeQuantize, + FixedQParamsFakeQuantize, default_debug_qconfig, default_observer, default_per_channel_weight_observer, @@ -504,6 +505,20 @@ def test_observer_qparams_respects_device_affinity(self): self.assertEqual(x.device, scale.device) self.assertEqual(x.device, zero_point.device) + def test_zero_numel(self): + obs_list = [MinMaxObserver, MovingAverageMinMaxObserver, + PerChannelMinMaxObserver, + MovingAveragePerChannelMinMaxObserver, HistogramObserver, + FakeQuantize, FixedQParamsFakeQuantize] + for obs_cls in obs_list: + if obs_cls is FixedQParamsFakeQuantize: + obs = obs_cls(0.1, 0) + else: + obs = obs_cls() + x = torch.Tensor() + # verify no crash + x = obs(x) + # HistogramObserver that works like it does on master class _ReferenceHistogramObserver(HistogramObserver): diff --git a/test/run_test.py b/test/run_test.py index e13753e93348..93484f7a583e 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -19,6 +19,7 @@ from typing import Dict, Optional TESTS = [ + 'test_type_hints', 'test_autograd', 'benchmark_utils/test_benchmark_utils', 'test_binary_ufuncs', @@ -72,7 +73,6 @@ 'test_testing', 'test_torch', 'test_type_info', - 'test_type_hints', 'test_unary_ufuncs', 'test_utils', 'test_view_ops', diff --git a/test/test_autograd.py b/test/test_autograd.py index 2107bfb3eb15..9f5925212757 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -1946,60 +1946,6 @@ def test_slice_expanded_v(self): expected[3:5] = v_expanded self.assertEqual(result, expected) - def test_stack(self): - x = torch.randn(10, 10, requires_grad=True) - y = torch.randn(10, 10, requires_grad=True) - z = torch.randn(10, 10, requires_grad=True) - stacked = torch.stack([x, y, z], 0) - grad = torch.randn(3, 10, 10) - stacked.backward(grad) - self.assertEqual(x.grad, grad[0]) - self.assertEqual(y.grad, grad[1]) - self.assertEqual(z.grad, grad[2]) - - def test_hstack(self): - x = torch.randn(10, 10, requires_grad=True) - y = torch.randn(10, 10, requires_grad=True) - z = torch.randn(10, 10, requires_grad=True) - stacked = torch.hstack([x, y, z]) - grad = torch.randn(10, 30) - stacked.backward(grad) - self.assertEqual(x.grad, grad[:, 0:10]) - self.assertEqual(y.grad, grad[:, 10:20]) - self.assertEqual(z.grad, grad[:, 20:30]) - - x = torch.randn(10, requires_grad=True) - y = torch.randn(10, requires_grad=True) - z = 
torch.randn(10, requires_grad=True) - stacked = torch.hstack([x, y, z]) - grad = torch.randn(30) - stacked.backward(grad) - self.assertEqual(x.grad, grad[0:10]) - self.assertEqual(y.grad, grad[10:20]) - self.assertEqual(z.grad, grad[20:30]) - - def test_vstack(self): - x = torch.randn(10, 10, requires_grad=True) - y = torch.randn(10, 10, requires_grad=True) - z = torch.randn(10, 10, requires_grad=True) - stacked = torch.vstack([x, y, z]) - grad = torch.randn(30, 10) - stacked.backward(grad) - self.assertEqual(x.grad, grad[0:10]) - self.assertEqual(y.grad, grad[10:20]) - self.assertEqual(z.grad, grad[20:30]) - - def test_dstack(self): - x = torch.randn(10, 10, requires_grad=True) - y = torch.randn(10, 10, requires_grad=True) - z = torch.randn(10, 10, requires_grad=True) - stacked = torch.dstack([x, y, z]) - grad = torch.randn(10, 10, 3) - stacked.backward(grad) - self.assertEqual(x.grad, grad[:, :, 0]) - self.assertEqual(y.grad, grad[:, :, 1]) - self.assertEqual(z.grad, grad[:, :, 2]) - def test_unbind(self): stacked = torch.randn(3, 10, 10, requires_grad=True) x, y, z = stacked.unbind() @@ -2971,6 +2917,20 @@ def run_test(input_size, norm_deg): run_test((10,), 3) run_test((10,), 1) run_test((10,), 1.5) + run_test((10,), inf) + + def test_norm_inf_subgradient(self): + def run_test(input, expected, dim=None): + x = torch.tensor(input, requires_grad=True) + out = x.norm(inf, dim=dim, keepdim=True) + out.backward(torch.ones(out.size())) + self.assertEqual(x.grad, expected) + + run_test([0., 0., 0.], [0., 0., 0.]) + run_test([1., 0., 1.], [0.5, 0., 0.5]) + run_test([[1., 0., 1.], [0., 1., 1.]], [[0.25, 0., 0.25], [0., 0.25, 0.25]]) + run_test([[1., 0., 1.], [0., 1., 0.]], [[0.5, 0., 0.5], [0., 1., 0.]], (1,)) + run_test(torch.ones((2, 2, 2)), torch.full((2, 2, 2), 0.25), (0, 2)) def test_pow_zero_tensor_gradient(self): def run_test(input_size, exponent): @@ -4993,14 +4953,6 @@ def test_linalg_qr_r(self): "linalg_qr_backward: cannot compute backward"): b.backward() - -def index_variable(shape, max_indices): - if not isinstance(shape, tuple): - shape = (shape,) - index = torch.rand(*shape).mul_(max_indices).floor_().long() - return index - - def index_perm_variable(shape, max_indices): if not isinstance(shape, tuple): shape = (shape,) @@ -5008,20 +4960,6 @@ def index_perm_variable(shape, max_indices): index = torch.randperm(max_indices).narrow(0, 0, reduce(mul, shape)).view(shape) return index - -def gather_variable(shape, index_dim, max_indices, duplicate=False): - assert len(shape) == 2 - assert index_dim < 2 - batch_dim = 1 - index_dim - index = torch.LongTensor(*shape) - for i in range(shape[index_dim]): - index.select(index_dim, i).copy_( - torch.randperm(max_indices)[:shape[batch_dim]]) - if duplicate: - index.select(batch_dim, 0).copy_(index.select(batch_dim, 1)) - return index - - def bernoulli_scalar(): return torch.tensor(0, dtype=torch.uint8).bernoulli_() @@ -5097,7 +5035,8 @@ def run_functional_checks(test_case, test_name, name, apply_fn, run_grad_checks, 'cosh', '__rmul__', 'sgn', 'abs', 'dot', 'vdot', 'tensor_split', 'matmul', 'bmm', 'mv', 'ger', 'diagonal', 'atan', 'angle', 'tanh', 'fill_', 'sub', 'exp', 'mean', 'inverse', 'triangular_solve', 'solve', 'addcmul', - 'addcdiv', 'linalg.tensorinv', 'matrix_exp', 'qr', ] + separate_complex_tests + 'addcdiv', 'linalg.tensorinv', 'matrix_exp', 'qr', + 'narrow', 'swapaxes', 'swapdims', 'tensor_split', 'tile'] + separate_complex_tests def add_test( name, @@ -7369,18 +7308,6 @@ def test_strided_leaf_grad_layout(self, device): (c * 
d).sum().backward() self.assertEqual(c.grad.stride(), (2, 1)) - def test_movedim(self, device): - for fn in [torch.movedim, torch.moveaxis]: - x = torch.randn(4, 3, 2, 1, dtype=torch.double, device=device, requires_grad=True) - - # Positive axis - gradcheck(lambda x: fn(x, (0, 1, 2, 3), (3, 2, 1, 0)), x) - gradgradcheck(lambda x: fn(x, (0, 1, 2, 3), (3, 2, 1, 0)), x) - - # Negative axis - gradcheck(lambda x: fn(x, (0, -1, -2, -3), (-3, -2, -1, -0)), x) - gradgradcheck(lambda x: fn(x, (0, -1, -2, -3), (-3, -2, -1, -0)), x) - def _test_atleast(self, device, torch_fn): # 0-dim s = torch.tensor(0.5, dtype=torch.double, requires_grad=True) diff --git a/test/test_binary_ufuncs.py b/test/test_binary_ufuncs.py index 5739fb569628..2ff12396701e 100644 --- a/test/test_binary_ufuncs.py +++ b/test/test_binary_ufuncs.py @@ -1082,13 +1082,13 @@ def test_maximum_minimum_cross_device(self, device): ops = (torch.maximum, torch.minimum) for torch_op in ops: - with self.assertRaisesRegex(RuntimeError, + with self.assertRaisesRegex(RuntimeError, "Expected all tensors to be on the same device"): torch_op(a, b) - with self.assertRaisesRegex(RuntimeError, + with self.assertRaisesRegex(RuntimeError, "Expected all tensors to be on the same device"): - torch_op(b, a) + torch_op(b, a) # test cuda tensor and cpu scalar ops = ((torch.maximum, np.maximum), (torch.minimum, np.minimum)) @@ -2560,6 +2560,17 @@ def inplace_variant_helper(x, y): self.compare_with_numpy(torch_fn, reference_fn, t, exact_dtype=False) out_variant_helper(torch.xlogy, 0, t) + def test_xlogy_scalar_type_promotion(self, device): + # Test that python numbers don't participate in type promotion at the same + # priority level as 0-dim tensors + t = torch.randn((), dtype=torch.float32, device=device) + + self.assertEqual(t.dtype, torch.xlogy(t, 5).dtype) + self.assertEqual(t.dtype, torch.xlogy(t, 5.).dtype) + + self.assertEqual(t.dtype, torch.xlogy(5, t).dtype) + self.assertEqual(t.dtype, torch.xlogy(5., t).dtype) + @skipIf(not TEST_SCIPY, "Scipy required for the test.") def test_xlogy_bfloat16(self, device): def _compare_helper(x, y): diff --git a/test/test_fx.py b/test/test_fx.py index 5e285039a6dd..65d5aa3f0101 100644 --- a/test/test_fx.py +++ b/test/test_fx.py @@ -1196,5 +1196,18 @@ def forward(self, x): input = torch.rand(3, 4) self.assertEqual(traced(input), Pair(input, input)) + def test_return_type_exists(self): + class ReturnTypeModule(torch.nn.Module): + def other(self, x: List[str]) -> List[str]: + return x + + def forward(self, x: List[str]) -> List[str]: + return self.other(x) + + traced = symbolic_trace(ReturnTypeModule()) + self.assertIn("-> typing.List[str]", traced._code) + scripted = torch.jit.script(traced) + self.assertIn("-> List[str]", scripted.code) + if __name__ == '__main__': run_tests() diff --git a/test/test_jit.py b/test/test_jit.py index ff89429534ac..a683a8eb0b8c 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -35,6 +35,7 @@ from jit.test_slice import TestSlice # noqa: F401 from jit.test_warn import TestWarn # noqa: F401 from jit.test_isinstance import TestIsinstance # noqa: F401 +from jit.test_cuda import TestCUDA # noqa: F401 from jit.test_hash import TestHash # noqa: F401 # Torch diff --git a/test/test_multiprocessing.py b/test/test_multiprocessing.py index 75b486043c42..81b33c5900db 100644 --- a/test/test_multiprocessing.py +++ b/test/test_multiprocessing.py @@ -3,7 +3,6 @@ import os import sys import time -import subprocess import unittest import copy from sys import platform @@ -525,7 +524,7 @@ def 
test_cuda_bad_call(self): @unittest.skipIf(IS_WINDOWS, 'not applicable to Windows (only fails with fork)') @unittest.skipIf(not torch.cuda.is_available(), 'CUDA not available') def test_wrong_cuda_fork(self): - results = self.run_process_no_exception("""\ + stderr = TestCase.runWithPytorchAPIUsageStderr("""\ import torch from torch.multiprocessing import Process def run(rank): @@ -542,7 +541,7 @@ def run(rank): for p in processes: p.join() """) - self.assertRegex(results[1].decode('ascii'), "Cannot re-initialize CUDA in forked subprocess.") + self.assertRegex(stderr, "Cannot re-initialize CUDA in forked subprocess.") @unittest.skipIf(NO_MULTIPROCESSING_SPAWN, "Disabled for environments that \ don't support multiprocessing with spawn start method") @@ -831,15 +830,6 @@ def test_cuda_parameter_sharing(self): param = Parameter(torch.arange(1., 26, device='cuda').view(5, 5)) self._test_autograd_sharing(param, mp.get_context('spawn'), is_parameter=True) - @staticmethod - def run_process_no_exception(code): - popen = subprocess.Popen( - [sys.executable, '-c', code], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - pipes = popen.communicate() - return pipes - @unittest.skipIf(NO_MULTIPROCESSING_SPAWN, "Disabled for environments that \ don't support multiprocessing with spawn start method") def test_integer_parameter_serialization(self): diff --git a/test/test_nn.py b/test/test_nn.py index 1d63be6e3075..386ba369dca6 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -9283,18 +9283,19 @@ def test_flatten(self): def test_unflatten(self): tensor_input = torch.randn(2, 50) - # Unflatten Tensor + # Unflatten Tensor (unflattened_size as a tuple of ints and list of ints) - unflatten = nn.Unflatten(dim=1, unflattened_size=(2, 5, 5)) - tensor_output = unflatten(tensor_input) - self.assertEqual(tensor_output.size(), torch.Size([2, 2, 5, 5])) + for us in ((2, 5, 5), [2, 5, 5]): + unflatten = nn.Unflatten(dim=1, unflattened_size=us) + tensor_output = unflatten(tensor_input) + self.assertEqual(tensor_output.size(), torch.Size([2, 2, 5, 5])) # Unflatten NamedTensor unflatten = nn.Unflatten(dim='features', unflattened_size=(('C', 2), ('H', 5), ('W', 5))) named_tensor_input = tensor_input.refine_names('N', 'features') named_tensor_output = unflatten(named_tensor_input) - self.assertEqual(tensor_output.size(), torch.Size([2, 2, 5, 5])) + self.assertEqual(named_tensor_output.size(), torch.Size([2, 2, 5, 5])) def test_unflatten_invalid_arg(self): # Wrong type for unflattened_size (tuple of floats) @@ -9304,6 +9305,13 @@ def test_unflatten_invalid_arg(self): r"unflattened_size must be tuple of ints, but found element of type float at pos 2"): nn.Unflatten(dim=1, unflattened_size=(2, 5, 5.0)) + # Wrong type for unflattened_size (list of lists and list of tuples) + for us in ([['C', 2], ['W', 5], ['H', 5]], [('C', 2), ('W', 5), ('H', 5)]): + with self.assertRaisesRegex( + TypeError, + r"unflattened_size must be a tuple of tuples, but found type list"): + nn.Unflatten(dim='features', unflattened_size=us) + # Wrong type for unflattened_size (tuple of lists) with self.assertRaisesRegex( @@ -9311,19 +9319,12 @@ def test_unflatten_invalid_arg(self): r"unflattened_size must be tuple of tuples, but found element of type list at pos 0"): nn.Unflatten(dim='features', unflattened_size=(['C', 2], ['W', 5], ['H', 5])) - # Wrong type for unflattened_size (list of ints) - - with self.assertRaisesRegex( - TypeError, - r"unflattened_size must be a tuple of ints, but found type list"): - nn.Unflatten(dim=1, unflattened_size=[2, 
5, 5]) - - # Wrong type for unflattened_size (list of lists) + # Wrong type for unflattened_size (tuple of dicts) with self.assertRaisesRegex( TypeError, - r"unflattened_size must be a tuple of tuples, but found type list"): - nn.Unflatten(dim='features', unflattened_size=[['C', 2], ['W', 5], ['H', 5]]) + r"unflattened_size must be tuple of tuples, but found element of type dict at pos 0"): + nn.Unflatten(dim='features', unflattened_size=({'C': 2}, {'W': 5}, {'H': 5})) def test_layer_norm_grads_with_create_graph_flag(self): atol = 1e-5 diff --git a/test/test_overrides.py b/test/test_overrides.py index 95f94504d84e..f32b04cb2e53 100644 --- a/test/test_overrides.py +++ b/test/test_overrides.py @@ -563,6 +563,8 @@ def instance_gen(): func_args.append(instance_gen()) elif t == 'TensorList': func_args.append([instance_gen(), instance_gen()]) + elif t == 'c10::List>': + func_args.append([instance_gen(), instance_gen()]) elif t == 'IntArrayRef': size = arg.get('size', 2) if size == 1: diff --git a/test/test_quantization.py b/test/test_quantization.py index f68bfcd058b6..1c370913c6d0 100644 --- a/test/test_quantization.py +++ b/test/test_quantization.py @@ -15,6 +15,7 @@ from quantization.test_quantized_op import TestPadding # noqa: F401 from quantization.test_quantized_op import TestQuantizedEmbeddingOps # noqa: F401 from quantization.test_quantized_op import TestDynamicQuantizedRNNOp # noqa: F401 + # Quantized Functional from quantization.test_quantized_functional import TestQuantizedFunctional # noqa: F401 diff --git a/test/test_shape_ops.py b/test/test_shape_ops.py index 43321508e0e2..f7da08eb24d7 100644 --- a/test/test_shape_ops.py +++ b/test/test_shape_ops.py @@ -378,21 +378,31 @@ def test_flip(self, device): self.assertEqual(size, list(data.flip(ds).size())) # test rectangular case - data = torch.tensor([1, 2, 3, 4, 5, 6]).view(2, 3).to(device) - flip0_result = torch.tensor([[4, 5, 6], [1, 2, 3]]).to(device) - flip1_result = torch.tensor([[3, 2, 1], [6, 5, 4]]).to(device) + data = torch.tensor([1, 2, 3, 4, 5, 6], device=device).view(2, 3) + flip0_result = torch.tensor([[4, 5, 6], [1, 2, 3]], device=device) + flip1_result = torch.tensor([[3, 2, 1], [6, 5, 4]], device=device) self.assertEqual(flip0_result, data.flip(0)) self.assertEqual(flip1_result, data.flip(1)) # test empty tensor, should just return an empty tensor of the same shape - data = torch.tensor([]) + data = torch.tensor((), device=device) self.assertEqual(data, data.flip(0)) # test bool tensor - a = torch.tensor([False, True]) + a = torch.tensor([False, True], device=device) self.assertEqual(a.flip(0), torch.tensor([True, False])) + # case: dims=() + a = torch.randn(3, 2, 1, device=device) + if device == 'cpu': + self.assertEqual(a.flip(dims=()), a) + else: + # Reference: https://github.com/pytorch/pytorch/issues/49982 + with self.assertRaisesRegex(IndexError, + "flip dims size out of range, got flip dims size=0"): + a.flip(dims=()) + def _rand_shape(self, dim, min_size, max_size): shape = [] for i in range(dim): diff --git a/test/test_sparse.py b/test/test_sparse.py index 6daf3f1931d2..4e982b8333d9 100644 --- a/test/test_sparse.py +++ b/test/test_sparse.py @@ -3168,6 +3168,14 @@ def different_dtypes(): test_sparse_matmul(2, 0, [0, 10], [10, 0]) test_error_cases() + def test_assign(self): + def assign_to(a): + a, i_a, v_a = self._gen_sparse(2, 5, [2, 3]) + a[0] = 100 + + self.assertRaises(TypeError, assign_to) + + class TestUncoalescedSparse(TestSparse): def setUp(self): super(TestUncoalescedSparse, self).setUp() diff --git 
a/test/test_spectral_ops.py b/test/test_spectral_ops.py index 6192d6c4d6b6..085af5294a04 100644 --- a/test/test_spectral_ops.py +++ b/test/test_spectral_ops.py @@ -225,13 +225,13 @@ def test_empty_fft(self, device, dtype): def test_fft_invalid_dtypes(self, device): t = torch.randn(64, device=device, dtype=torch.complex128) - with self.assertRaisesRegex(RuntimeError, "Expected a real input tensor"): + with self.assertRaisesRegex(RuntimeError, "rfft expects a real input tensor"): torch.fft.rfft(t) with self.assertRaisesRegex(RuntimeError, "rfftn expects a real-valued input tensor"): torch.fft.rfftn(t) - with self.assertRaisesRegex(RuntimeError, "Expected a real input tensor"): + with self.assertRaisesRegex(RuntimeError, "ihfft expects a real input tensor"): torch.fft.ihfft(t) @skipCUDAIfRocm @@ -332,6 +332,27 @@ def test_fft_backward(self, device, dtype): args = args[1:] self._fft_grad_check_helper(fname, input, args) + @skipCPUIfNoMkl + @skipCUDAIfRocm + @onlyOnCPUAndCUDA + def test_fft_invalid_out_types(self, device): + + complex_fft_funcs = [torch.fft.fft, torch.fft.ifft, torch.fft.fftn, torch.fft.ifftn, + torch.fft.rfft, torch.fft.rfftn, torch.fft.ihfft] + real_fft_funcs = [torch.fft.irfft, torch.fft.irfftn, torch.fft.hfft] + fft_funcs = complex_fft_funcs + real_fft_funcs + + # Test errors on invalid out dtypes + x = torch.rand(10, device=device, dtype=torch.float32) + for out_dtype, funcs in [(torch.int16, fft_funcs), + (torch.float32, complex_fft_funcs), + (torch.complex64, real_fft_funcs)]: + out = torch.empty((), device=device, dtype=out_dtype) + + for func in funcs: + with self.assertRaisesRegex(RuntimeError, "expects a .* output tensor"): + func(x, out=out) + # nd-fft tests @skipCPUIfNoMkl @@ -463,10 +484,10 @@ def test_fftn_invalid(self, device): torch.fft.rfftn, torch.fft.irfftn) for func in fft_funcs: - with self.assertRaisesRegex(RuntimeError, "FFT dims must be unique"): + with self.assertRaisesRegex(RuntimeError, "dims must be unique"): func(a, dim=(0, 1, 0)) - with self.assertRaisesRegex(RuntimeError, "FFT dims must be unique"): + with self.assertRaisesRegex(RuntimeError, "dims must be unique"): func(a, dim=(2, -1)) with self.assertRaisesRegex(RuntimeError, "dim and shape .* same length"): @@ -578,10 +599,10 @@ def test_fft2_invalid(self, device): torch.fft.rfft2, torch.fft.irfft2) for func in fft_funcs: - with self.assertRaisesRegex(RuntimeError, "FFT dims must be unique"): + with self.assertRaisesRegex(RuntimeError, "dims must be unique"): func(a, dim=(0, 0)) - with self.assertRaisesRegex(RuntimeError, "FFT dims must be unique"): + with self.assertRaisesRegex(RuntimeError, "dims must be unique"): func(a, dim=(2, -1)) with self.assertRaisesRegex(RuntimeError, "dim and shape .* same length"): @@ -623,6 +644,19 @@ def test_fftfreq_numpy(self, device, dtype): actual = torch_fn(*args, device=device, dtype=dtype) self.assertEqual(actual, expected, exact_dtype=False) + @skipCPUIfNoMkl + @skipCUDAIfRocm + @onlyOnCPUAndCUDA + @dtypes(torch.float, torch.double) + def test_fftfreq_out(self, device, dtype): + for func in (torch.fft.fftfreq, torch.fft.rfftfreq): + expect = func(n=100, d=.5, device=device, dtype=dtype) + actual = torch.empty((), device=device, dtype=dtype) + with self.assertWarnsRegex(UserWarning, "out tensor will be resized"): + func(n=100, d=.5, out=actual) + self.assertEqual(actual, expect) + + @skipCPUIfNoMkl @skipCUDAIfRocm @onlyOnCPUAndCUDA @@ -1066,10 +1100,12 @@ def test_complex_stft_onesided(self, device): with self.assertRaisesRegex(RuntimeError, 'complex'): 
x.stft(10, pad_mode='constant', onesided=True) + # stft is currently warning that it requires return-complex while an upgrader is written def test_stft_requires_complex(self, device): x = torch.rand(100) - with self.assertRaisesRegex(RuntimeError, 'stft requires the return_complex parameter'): - y = x.stft(10, pad_mode='constant') + y = x.stft(10, pad_mode='constant') + # with self.assertRaisesRegex(RuntimeError, 'stft requires the return_complex parameter'): + # y = x.stft(10, pad_mode='constant') @skipCUDAIfRocm @skipCPUIfNoMkl diff --git a/test/test_testing.py b/test/test_testing.py index 8cdca871185b..4ff215233fe2 100644 --- a/test/test_testing.py +++ b/test/test_testing.py @@ -442,10 +442,9 @@ def test_assert_messages(self, device): @slowTest def test_cuda_assert_should_stop_test_suite(self, device): # This test is slow because it spawn another process to run another test suite. - import subprocess - import sys - problematic_test_script = """\ + # Test running of cuda assert test suite should early terminate. + stderr = TestCase.runWithPytorchAPIUsageStderr("""\ #!/usr/bin/env python import torch @@ -479,14 +478,12 @@ def test_trivial_passing_test_case_on_cpu_cuda(self, device): if __name__ == '__main__': run_tests() -""" - - # Test running of cuda assert test suite should early terminate. - p = subprocess.run([sys.executable, '-c', problematic_test_script], stderr=subprocess.PIPE, timeout=120) +""") # should capture CUDA error - self.assertIn('CUDA error: device-side assert triggered', p.stderr.decode('ascii')) + self.assertIn('CUDA error: device-side assert triggered', stderr) # should run only 1 test because it throws unrecoverable error. - self.assertIn('Ran 1 test', p.stderr.decode('ascii')) + self.assertIn('Ran 1 test', stderr) + instantiate_device_type_tests(TestTesting, globals()) diff --git a/test/test_torch.py b/test/test_torch.py index 6532c2e5e17d..1f85ed2fff54 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -937,10 +937,6 @@ def test_index_add_all_dtypes(self): # index_add calls atomicAdd on cuda. zeros = torch.zeros(size, dtype=dtype, device=device) - # index_add is not supported for complex dtypes on cuda yet - if device.startswith('cuda') and dtype.is_complex: - continue - added = zeros.index_add(0, torch.arange(0, size[0], dtype=idx_dtype, device=device), tensor) self.assertEqual(added, tensor) @@ -6870,7 +6866,6 @@ def inner(self, device, dtype): ('rot90', 'k1_d12', _small_3d, lambda t, d: [1, [1, 2]], 1e-5, 1e-5, 1e-5, _types + _complex_types, _cpu_types, False), ('rot90', 'k1_neg_d', _small_3d, lambda t, d: [1, [1, -1]], 1e-5, 1e-5, 1e-5, _types + _complex_types, _cpu_types, False), ('rot90', 'default', _small_3d, lambda t, d: [], 1e-5, 1e-5, 1e-5, _types + _complex_types, _cpu_types, False), - ('rsqrt', '', lambda t, d: _small_3d(t, d) + 1, lambda t, d: [], 1e-2, 1e-5, 1e-4, _float_types_no_half), ('sinh', '', lambda t, d: _small_3d(t, d).clamp(-1, 1), lambda t, d: [], 1e-3, 1e-5, 1e-5, _float_types), ('tan', '', lambda t, d: _small_3d(t, d).clamp(-1, 1), lambda t, d: [], 1e-3, 1e-5, 1e-5, _float_types), ('tan', 'complex', lambda t, d: _small_3d(t, d), lambda t, d: [], 1e-3, 1e-5, 1e-5, _complex_types), diff --git a/test/test_unary_ufuncs.py b/test/test_unary_ufuncs.py index 776482306f4d..960991a4820b 100644 --- a/test/test_unary_ufuncs.py +++ b/test/test_unary_ufuncs.py @@ -642,14 +642,6 @@ def test_sign_complex_assert_raises(self, device): size = [5, 5] tensor = torch.rand(size, dtype=dtype, device=device) - # index_add calls atomicAdd on cuda. 
- zeros = torch.zeros(size, dtype=dtype, device=device) - - # index_add is not supported for complex dtypes on cuda yet - if device.startswith('cuda') and dtype.is_complex: - self.assertRaises(RuntimeError, - lambda: zeros.index_add(0, torch.arange(0, size[0], dtype=torch.long, device=device), tensor)) - with self.assertRaisesRegex(RuntimeError, (r'Unlike NumPy, torch.sign is not intended to support complex numbers\. ' r'Please use torch.sgn instead\.')): @@ -1715,7 +1707,6 @@ def _medium_2d(dtype, device): _TorchMathTestMeta('ceil'), _TorchMathTestMeta('rad2deg'), _TorchMathTestMeta('deg2rad'), - _TorchMathTestMeta('rsqrt', reffn=lambda x: np.reciprocal(np.sqrt(x))), _TorchMathTestMeta('frac', reffn='fmod', refargs=lambda x: (x.numpy(), 1)), _TorchMathTestMeta('trunc'), _TorchMathTestMeta('round'), diff --git a/test/test_view_ops.py b/test/test_view_ops.py index 3a1411d1a167..be33aa1ab44a 100644 --- a/test/test_view_ops.py +++ b/test/test_view_ops.py @@ -100,6 +100,12 @@ def is_view_of(self, base, other): return True + # Returns true if v1 and v2 are views of the same base + def is_view_of_same_base(self, v1, v2): + if (not v1._is_view() or v1 is v2): + return False + return self.is_view_of(v1._base, v2) + # Performs transpose if contiguous=True, else returns the input tensor as is def _do_transpose(self, x, contiguous=False, dim0=0, dim1=1): if contiguous: @@ -457,6 +463,64 @@ def test_reshape_nonview(self, device): nv[6] = 0 self.assertNotEqual(t[1, 1], nv[6]) + def test_flatten_view(self, device): + def test_writes_propagate(t, v): + idx_t = (0,) * t.ndim + idx_v = (0,) * v.ndim + v[idx_v] = 0 + self.assertEqual(t[idx_t], v[idx_v]) + + t = torch.ones(1, 2, 3, 4, device=device) + v = t.flatten() + self.assertTrue(self.is_view_of(t, v)) + test_writes_propagate(t, v) + + # zero-dimensional tensor + t = torch.tensor(1, device=device) + v = t.flatten() + test_writes_propagate(t, v) + self.assertTrue(self.is_view_of(t, v)) + + t = torch.ones(1, 2, 3, 4, device=device).transpose(2, 3) + v = t.flatten(0, 1) + test_writes_propagate(t, v) + self.assertTrue(self.is_view_of_same_base(t, v)) + + # stride[i] = stride[i + 1] * size[i + 1] is satisfied for 3 groups: + t = torch.ones(720, device=device) \ + .as_strided((2, 3, 2, 3, 5, 4), (6, 2, 15, 5, 1, 0)) + # [--1--|---2---|-3-] [--1--|----2---|-3-] + v1 = t.flatten(0, 1) + v2 = v1.flatten(1, 3) + v3 = v2.flatten(2, 2) + test_writes_propagate(t, v1) + self.assertTrue(self.is_view_of_same_base(t, v1)) + test_writes_propagate(t, v2) + self.assertTrue(self.is_view_of_same_base(t, v2)) + test_writes_propagate(t, v3) + self.assertTrue(self.is_view_of_same_base(t, v3)) + + @onlyOnCPUAndCUDA + def test_flatten_nonview(self, device): + def assert_is_nonview(t, nv): + idx_t = (0,) * t.ndim + idx_nv = (0,) * nv.ndim + self.assertTrue(not nv._is_view()) + nv[idx_nv] = 0 + self.assertNotEqual(t[idx_t], nv[idx_nv]) + t = torch.ones(2, 3, 2, 3, device=device).transpose(2, 3) + nv = t.flatten(1, 3) + assert_is_nonview(t, nv) + + t = torch.ones(2, 2, device=device).T + nv = t.flatten() + assert_is_nonview(t, nv) + + # flatten returns the original object if start_dim=end_dim + t = t = torch.ones(2, 2, device=device) + nv = t.flatten(1, 1) + self.assertTrue(t is nv) + def test_basic_indexing_slice_view(self, device): t = torch.ones(5, 5, device=device) v = t[:2, :3] diff --git a/test/test_vmap.py b/test/test_vmap.py index cc25dff3b306..b722fc126b24 100644 --- a/test/test_vmap.py +++ b/test/test_vmap.py @@ -1907,6 +1907,16 @@ def test_split(self): 
test(vmap(vmap(lambda t: op(t, [4] * 8 + [8] * 4, 1), in_dims=2)), (torch.rand(B1, 2, B0, 64, B2),), in_dims=2) + def test_trace(self): + op = torch.trace + test = self._vmap_test + B0, B1, B2 = 7, 11, 13 + + test(op, (torch.rand(B0, 2, 5),)) + test(op, (torch.rand(2, B0, 5),), in_dims=1) + test(vmap(op), (torch.rand(B1, 2, B0, 5),), in_dims=2) + test(vmap(vmap(op, in_dims=2)), (torch.rand(B1, 2, B0, 5, B2),), in_dims=2) + def test_transpose(self): op = torch.transpose test = self._vmap_view_test @@ -2313,6 +2323,10 @@ def test_slice(self, device): self._batched_grad_test(lambda x: x[:, 1:3], (x,)) self._batched_grad_test(lambda x: x[..., 1:3], (x,)) + def test_trace(self, device): + x = torch.randn(2, 3, device=device, requires_grad=True) + self._batched_grad_test(Tensor.trace, (x,)) + @allowVmapFallbackUsage def test_symeig(self, device): def op(x): @@ -2322,6 +2336,11 @@ def op(x): self._batched_grad_test(op, (x,), {}) self._batched_grad_grad_test(op, (x,), {}) + def test_threshold(self, device): + x = torch.randn(2, 3, device=device, requires_grad=True) + self._batched_grad_test(lambda x: F.threshold(x, 0.5, 0.0), (x,)) + + @allowVmapFallbackUsage def test_inplace_view(self, device): leaf = torch.randn(4, 5, requires_grad=True) diff --git a/tools/autograd/gen_autograd.py b/tools/autograd/gen_autograd.py index 88c00e0ba71a..b930aca504df 100644 --- a/tools/autograd/gen_autograd.py +++ b/tools/autograd/gen_autograd.py @@ -23,9 +23,6 @@ import argparse import os -import yaml -import re -from .utils import YamlLoader, op_name_with_overload from tools.codegen.selective_build.selector import SelectiveBuilder # See NOTE [ Autograd View Variables ] in variable.h for details. @@ -89,84 +86,14 @@ 'tensor_split', 'swapdims', 'swapaxes' }) -def format_return_type(returns): - if len(returns) == 0: - return 'void' - elif len(returns) == 1: - return returns[0]['type'] - else: - return_types = [r['type'] for r in returns] - return 'std::tuple<{}>'.format(','.join(return_types)) - - -def get_simple_type(arg): - simple_type = arg['type'] - simple_type = simple_type.replace(' &', '').replace('const ', '') - simple_type = simple_type.replace('Generator *', 'Generator') - - opt_match = re.match(r'c10::optional<(.+)>', simple_type) - if opt_match: - simple_type = '{}?'.format(opt_match.group(1)) - return simple_type - -def has_tensoroptions_argument(declaration): - for argument in declaration['arguments']: - if 'TensorOptions' == argument['dynamic_type']: - return True - return False - - -def load_aten_declarations(path): - with open(path, 'r') as f: - declarations = yaml.load(f, Loader=YamlLoader) - - # enrich declarations with additional information - selected_declarations = [] - for declaration in declarations: - if declaration.get('deprecated'): - continue - - for arg in declaration['arguments']: - arg['simple_type'] = get_simple_type(arg) - for arg in declaration['schema_order_arguments']: - arg['simple_type'] = get_simple_type(arg) - for ret in declaration['returns']: - ret['simple_type'] = get_simple_type(ret) - - declaration['formals'] = [arg['type'] + ' ' + arg['name'] - for arg in declaration['arguments']] - declaration['schema_order_formals'] = [arg['type'] + ' ' + arg['name'] - for arg in declaration['schema_order_arguments']] - declaration['args'] = [arg['name'] for arg in declaration['arguments']] - declaration['schema_order_args'] = [arg['name'] for arg in declaration['schema_order_arguments']] - declaration['api_name'] = declaration['name'] - if declaration.get('overload_name'): - 
declaration['type_wrapper_name'] = "{}_{}".format( - declaration['name'], declaration['overload_name']) - else: - declaration['type_wrapper_name'] = declaration['name'] - declaration['operator_name_with_overload'] = declaration['schema_string'].split('(')[0] - declaration['unqual_operator_name_with_overload'] = declaration['operator_name_with_overload'].split('::')[1] - declaration['return_type'] = format_return_type(declaration['returns']) - - declaration['base_name'] = declaration['name'] - selected_declarations.append(declaration) - - return selected_declarations - - -def gen_autograd(aten_path, native_functions_path, out, autograd_dir, operator_selector: SelectiveBuilder, disable_autograd=False): - full_aten_decls = load_aten_declarations(aten_path) - - def filter_decls(aten_decls, operator_selector): - def is_operator_selected_for_training(decl): - op_name = op_name_with_overload(decl) - return operator_selector.is_operator_selected_for_training(op_name) - - return [decl for decl in aten_decls if is_operator_selected_for_training(decl)] - - aten_decls = filter_decls(full_aten_decls, operator_selector) - +def gen_autograd( + aten_path: str, + native_functions_path: str, + out: str, + autograd_dir: str, + operator_selector: SelectiveBuilder, + disable_autograd: bool = False, +) -> None: # Parse and load derivatives.yaml from .load_derivatives import load_derivatives differentiability_infos = load_derivatives( @@ -175,13 +102,13 @@ def is_operator_selected_for_training(decl): template_path = os.path.join(autograd_dir, 'templates') # Generate VariableType.h/cpp + from .gen_trace_type import gen_trace_type + from .gen_variable_type import gen_variable_type if not disable_autograd: - from .gen_variable_type import gen_variable_type - gen_variable_type(out, aten_decls, differentiability_infos, template_path) + gen_variable_type(out, native_functions_path, differentiability_infos, template_path, operator_selector) - from . 
import gen_trace_type # operator filter not applied as tracing sources are excluded in selective build - gen_trace_type.gen_trace_type(out, native_functions_path, template_path) + gen_trace_type(out, native_functions_path, template_path) # Generate Functions.h/cpp from .gen_autograd_functions import gen_autograd_functions_lib @@ -193,7 +120,12 @@ def is_operator_selected_for_training(decl): gen_variable_factories(out, native_functions_path, template_path) -def gen_autograd_python(aten_path, native_functions_path, out, autograd_dir): +def gen_autograd_python( + aten_path: str, + native_functions_path: str, + out: str, + autograd_dir: str, +) -> None: from .load_derivatives import load_derivatives differentiability_infos = load_derivatives( os.path.join(autograd_dir, 'derivatives.yaml'), native_functions_path) @@ -212,7 +144,7 @@ def gen_autograd_python(aten_path, native_functions_path, out, autograd_dir): out, native_functions_path, deprecated_path, template_path) -def main(): +def main() -> None: parser = argparse.ArgumentParser( description='Generate autograd C++ files script') parser.add_argument('declarations', metavar='DECL', diff --git a/tools/autograd/gen_autograd_functions.py b/tools/autograd/gen_autograd_functions.py index a22154b5c01d..4724b99a8742 100644 --- a/tools/autograd/gen_autograd_functions.py +++ b/tools/autograd/gen_autograd_functions.py @@ -141,7 +141,7 @@ def process_function(info: DifferentiabilityInfo, template: CodeTemplate) -> str compute_index_ranges: List[str] = [] for arg in info.args_with_derivatives: - if arg.type == 'TensorList': + if arg.type == 'TensorList' or arg.type == 'const c10::List> &': size = f'{arg.name}_size_' saved_list_sizes.append(f'size_t {arg.name}_size_;') else: @@ -166,6 +166,15 @@ def save_var(var: SavedAttribute, is_output: bool) -> None: release_variables.append(f'{name}_released_ = true;') unpack.append(f'auto {name} = unpack_list({name}_);') asserts.append(f'TORCH_CHECK(!{name}_released_, ERR_BACKWARD_TWICE);') + elif var.type == 'c10::List>': + saved_variables.append(f'std::vector {name}_;') + saved_variables.append(f'bool {name}_released_ = false;') + # Just clear() is sufficient, we don't need to loop and clear each variable. + # Because the SavedVariable owns a tensor and a grad_fn, removing the SavedVariable makes them go away as well. + release_variables.append(f'{name}_.clear();') + release_variables.append(f'{name}_released_ = true;') + unpack.append(f'auto {name} = unpack_opt_list({name}_);') + asserts.append(f'TORCH_CHECK(!{name}_released_, ERR_BACKWARD_TWICE);') elif var.type == 'IntArrayRef': saved_variables.append(f'std::vector {name};') elif var.type == 'c10::optional': diff --git a/tools/autograd/gen_python_functions.py b/tools/autograd/gen_python_functions.py index 1f61ce3dfa20..0450983a8e41 100644 --- a/tools/autograd/gen_python_functions.py +++ b/tools/autograd/gen_python_functions.py @@ -230,7 +230,7 @@ def signature_original(f: NativeFunction) -> str: opname += '_out' if f.func.name.name.inplace and pyi: opname += '_' - args = CppSignatureGroup.from_schema(f.func, method=False).signature.arguments() + args = CppSignatureGroup.from_native_function(f, method=False).signature.arguments() # Simply ignore TensorOptionsArguments as it does not exist in deprecated.yaml. 
types = ', '.join(argument_type_str(a.argument.type) for a in args if isinstance(a.argument, Argument)) diff --git a/tools/autograd/gen_trace_type.py b/tools/autograd/gen_trace_type.py index b2dfe2667128..d8d42762e4fb 100644 --- a/tools/autograd/gen_trace_type.py +++ b/tools/autograd/gen_trace_type.py @@ -112,9 +112,8 @@ def dispatch_trace_input(arg: Union[Argument, TensorOptionsArguments]) -> Sequen ] else: name = arg.name - # XXX: For arg that have type of Tensor?[], tracer will pass allow_undefined to addInputs if str(arg.type) == 'Tensor?[]': - return [f'jit::tracer::addInputs(node, "{name}", {name}, true);'] + return [f'jit::tracer::addInputs(node, "{name}", {name});'] else: return [ADD_TRACE_INPUT.substitute(name=name, input=name)] @@ -122,7 +121,7 @@ def dispatch_trace_input(arg: Union[Argument, TensorOptionsArguments]) -> Sequen if f.use_c10_dispatcher.dispatcher_uses_new_style(): args = list(f.func.schema_order_arguments()) else: - sig_group = CppSignatureGroup.from_schema(f.func, method=False) + sig_group = CppSignatureGroup.from_native_function(f, method=False) args = [cpp_args.argument for cpp_args in sig_group.signature.arguments() if not isinstance(cpp_args.argument, SelfArgument)] @@ -381,7 +380,7 @@ def method_definition(f: NativeFunction) -> Optional[str]: for a in f.func.schema_order_arguments() ) else: - sig_group = CppSignatureGroup.from_schema(f.func, method=False) + sig_group = CppSignatureGroup.from_native_function(f, method=False) formals = ', '.join(f'{a.type} {a.name}' for a in sig_group.signature.arguments()) return METHOD_DEFINITION.substitute( @@ -423,7 +422,7 @@ def gen_trace_type_shard( fm: FileManager, native_functions: Sequence[NativeFunction], suffix: str ) -> None: fm.write_with_template('TraceType%s.cpp' % suffix, 'TraceType.cpp', lambda: { - 'generated_comment': f'@generated from {fm.template_dir}/TraceType.cpp', + 'generated_comment': '@' + f'generated from {fm.template_dir}/TraceType.cpp', 'trace_method_definitions': list(mapMaybe(method_definition, native_functions)), 'trace_wrapper_registrations': list(mapMaybe(method_registration, native_functions)), }) diff --git a/tools/autograd/gen_variable_factories.py b/tools/autograd/gen_variable_factories.py index a8c07aef4181..f8ab30dc4580 100644 --- a/tools/autograd/gen_variable_factories.py +++ b/tools/autograd/gen_variable_factories.py @@ -48,7 +48,7 @@ def process_function(f: NativeFunction) -> Optional[str]: if Variant.function not in f.variants or not is_factory: return None - sig = CppSignatureGroup.from_schema(f.func, method=False).signature + sig = CppSignatureGroup.from_native_function(f, method=False).signature formals: List[str] = [] exprs: List[str] = [] requires_grad = 'false' diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index 1d75ae46e9c9..f49f5e15845b 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -22,20 +22,24 @@ # which will in turn dispatch back to VariableType for its # differentiable subcomponents. 
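Put in plain terms, a kernel generated into VariableType does three things: record what backward will need, redispatch to the underlying implementation with the variable layer switched off, and wire the result into the autograd graph. A rough Python analogy (all helper names here, such as make_add_backward, auto_non_variable_type_mode, base_type_add and set_history, are illustrative stand-ins, not real APIs):

```python
# Sketch of the shape of a generated VariableType kernel, in Python terms.
def variable_type_add(self, other):
    grad_fn = make_add_backward(self, other)     # save whatever backward needs
    with auto_non_variable_type_mode():          # treat Variables as plain tensors
        result = base_type_add(self, other)      # redispatch to the base kernel
    set_history(result, grad_fn)                 # attach the node to the output
    return result
```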
# +from dataclasses import dataclass -from .utils import CodeTemplate, nested_dict, write, make_out_api_name_faithful from .gen_autograd import VIEW_FUNCTIONS, VIEW_FUNCTIONS_WITH_METADATA_CHANGE, \ MULTI_OUTPUT_SAFE_FUNCTIONS, RETURNS_VIEWS_OF_INPUT from .gen_autograd_functions import uses_single_grad -from .gen_trace_type import MANUAL_BACKEND, MANUAL_AUTOGRAD_AND_TRACER, MANUAL_AUTOGRAD +from .gen_trace_type import ( + MANUAL_BACKEND, MANUAL_AUTOGRAD_AND_TRACER, MANUAL_AUTOGRAD, + declare_returned_variables, tie_return_values, get_return_value, type_wrapper_name, +) from tools.codegen.api.types import * from tools.codegen.api.autograd import * import tools.codegen.api.cpp as cpp -import tools.codegen.api.python as python -from tools.codegen.gen import with_native_function +from tools.codegen.code_template import CodeTemplate +from tools.codegen.gen import with_native_function, parse_native_yaml, FileManager, mapMaybe from tools.codegen.model import * -from typing import Dict, Optional, List, Sequence, Any, Callable +from tools.codegen.selective_build.selector import SelectiveBuilder +from typing import Callable, List, Optional, Sequence, Tuple, Union # We don't set or modify grad_fn on these methods. Generally, they return # tensors that have requires_grad=False. In-place functions listed here will @@ -78,8 +82,8 @@ 'bmm', 'diagonal', 'alias', 'atan', 'log', 'log10', 'log1p', 'log2', 'reciprocal', 'tan', 'pow', 'rsqrt', 'tanh', 'tanh_backward', 'asinh', 'acosh', 'take', 'fill_', 'exp', 'nonzero', 'mean', 'inverse', 'solve', 'linalg_cholesky', 'addcmul', 'addcdiv', - 'matrix_exp', 'linalg_eigh', 'cholesky_solve', 'linalg_qr', 'svd', - '_fft_c2c', '_fft_r2c', 'linalg_solve', 'sqrt' + 'matrix_exp', 'linalg_eigh', 'cholesky_solve', 'linalg_qr', 'svd', '_fft_c2c', '_fft_r2c', + 'linalg_solve', 'sqrt', 'stack', 'gather', 'index_select', 'index_add_' } # Some operators invalidate the grad_accumulator. Let's reset it. @@ -118,6 +122,21 @@ } """) +SAVE_OPTIONALTENSORLIST_STORAGE = CodeTemplate("""\ +std::vector> ${tensorlist_name}_storage_saved(${tensorlist_name}.size()); +for (const c10::optional& tensor : ${tensorlist_name}) + ${tensorlist_name}_storage_saved.push_back( + tensor.has_value() && tensor->has_storage() ? c10::optional(tensor->storage()) : c10::nullopt); +""") + +ENFORCE_SAME_OPTIONALTENSORLIST_STORAGE = CodeTemplate("""\ +for (size_t i=0; i<${tensorlist_name}.size(); i++) { + if (${tensorlist_name}_storage_saved[i].has_value()) + AT_ASSERT(${tensorlist_name}_storage_saved[i].value().is_alias_of( + static_cast>(${tensorlist_name}[i])->storage())); +} +""") + SAVE_TENSOR_IMPL = CodeTemplate("""\ c10::intrusive_ptr ${tensor_name}_impl_saved; if (${tensor_name}.defined()) ${tensor_name}_impl_saved = ${tensor_name}.getIntrusivePtr(); @@ -140,6 +159,21 @@ } """) +SAVE_OPTIONALTENSORLIST_IMPL = CodeTemplate("""\ +std::vector> ${tensorlist_name}_impl_saved(${tensorlist_name}.size()); +for (size_t i=0; i<${tensorlist_name}.size(); i++) { + c10::optional t = ${tensorlist_name}[i]; + if (t.has_value() && t->defined()) ${tensorlist_name}_impl_saved[i] = t->getIntrusivePtr(); +} +""") + +ENFORCE_SAME_OPTIONALTENSORLIST_IMPL = CodeTemplate("""\ +for (size_t i=0; i<${tensorlist_name}.size(); i++) { + if (${tensorlist_name}_impl_saved[i]) + AT_ASSERT(${tensorlist_name}_impl_saved[i] == static_cast>(${tensorlist_name}[i])->getIntrusivePtr()); +} +""") + # The following list contains functions that we don't enforce the invariant on. 
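These templates are the debug-only sanity check applied further down via RUN_ONLY_IN_DEBUG_MODE: pointers to each argument's Storage and TensorImpl are recorded before the base call and asserted unchanged afterwards, and the new OPTIONALTENSORLIST variants extend that per element of an optional tensor list, skipping empty optionals. The idea, as a Python analogy (check_storage_unchanged is a hypothetical helper, and data_ptr() stands in for the real is_alias_of test):

```python
# Python analogy of the generated debug check, not the real C++.
def check_storage_unchanged(optional_tensors, run_kernel):
    saved = [t.data_ptr() if t is not None else None for t in optional_tensors]
    result = run_kernel()
    for t, ptr in zip(optional_tensors, saved):
        # A kernel must not silently swap out the storage backing its inputs.
        assert ptr is None or t.data_ptr() == ptr
    return result
```

The list that follows names the functions exempted from this check.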
DONT_ENFORCE_SAME_TENSOR_IMPL_OR_STORAGE = { # These functions are expected to change impl or storage of input tensors @@ -179,9 +213,6 @@ UNPACK_TENSOR = CodeTemplate("""\ auto${ref} ${arg_name}_ = unpack${suffix}(${arg_name}, "${arg_name}", ${arg_pos});""") -LEGACY_WRAP_OPTIONS = CodeTemplate("""\ -auto ${arg_name}_ = TensorOptions(${arg_name});""") - DECLARE_GRAD_FN = CodeTemplate("""\ std::shared_ptr<${op}> grad_fn; """) @@ -274,49 +305,18 @@ #endif """) -# Methods shared by TraceType and VariableType to handle return variable declaration, tie and tuple. -def format_return_variables(declaration): - name = declaration['name'] - arguments = declaration['arguments'] - inplace = declaration['inplace'] - is_out_fn = name.endswith('_out') - modifies_arguments = inplace or is_out_fn - - def declare_returned_variables(): - if modifies_arguments: - return '' - if len(declaration['returns']) == 1: - return '' - # TODO: this will be ugly - names = [ret['type'] + ' ' + ret['name'] + ';' for ret in declaration['returns']] - return '\n'.join(names) - - def tie_return_values(): - if len(declaration['returns']) == 1: - return 'auto {}'.format(declaration['returns'][0]['name']) - names = [ret['name'] for ret in declaration['returns']] - return 'std::tie({})'.format(', '.join(names)) - - def get_return_value(): - if inplace: - return 'self' - if is_out_fn: - return_names = [arg['name'] for arg in arguments - if arg.get('output', False)] - if len(return_names) == 1: - return return_names[0] - return 'std::forward_as_tuple({})'.format(', '.join(return_names)) - - returns = declaration['returns'] - if len(returns) == 1: - return returns[0]['name'] - moved = ['std::move({})'.format(r['name']) for r in returns] - return 'std::make_tuple({})'.format(', '.join(moved)) - - return (declare_returned_variables(), tie_return_values(), get_return_value()) - +@dataclass(frozen=True) +class NativeFunctionWithDifferentiabilityInfo: + func: NativeFunction + info: Optional[DifferentiabilityInfo] -def gen_variable_type(out, aten_declarations, differentiability_infos, template_path): +def gen_variable_type( + out: str, + native_yaml_path: str, + differentiability_infos: Sequence[DifferentiabilityInfo], + template_path: str, + operator_selector: SelectiveBuilder, +) -> None: """VariableType.h and VariableType.cpp body @@ -324,153 +324,202 @@ def gen_variable_type(out, aten_declarations, differentiability_infos, template_ implementation of each function dispatches to the base tensor type to compute the output. The grad_fn is attached to differentiable functions. """ + fns = list(sorted(filter( + operator_selector.is_native_function_selected_for_training, + parse_native_yaml(native_yaml_path)), key=lambda f: cpp.name(f.func))) + fns_with_infos = match_differentiability_info(fns, differentiability_infos) - aten_declarations = list(sorted(aten_declarations, key=lambda decl: decl['name'])) - match_declarations_with_differentiability_info(aten_declarations, differentiability_infos) - - gen_variable_type_shard(out, aten_declarations, template_path, None, True) + fm = FileManager(install_dir=out, template_dir=template_path, dry_run=False) + gen_variable_type_shard(fm, fns_with_infos, 'VariableType.h', 'VariableType.h') # NOTE: see Note [Sharded File] at the top of the VariableType.cpp # template regarding sharding of the generated files. 
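Concretely, the sharding referred to here (and implemented just below) assigns every operator to one of the generated VariableType_<i>.cpp files by a stable checksum of its name, so regenerating the code or adding unrelated operators never moves an existing operator to a different shard. The same arithmetic, in isolation:

```python
NUM_SHARDS = 5  # matches num_shards below

def shard_of(op_name: str) -> int:
    # Stable and order-independent: depends only on the characters of the name.
    return sum(ord(c) for c in op_name) % NUM_SHARDS

print(shard_of('add'))  # (97 + 100 + 100) % 5 == 2
```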
num_shards = 5 - shards = [[] for _ in range(num_shards)] + shards: List[List[NativeFunctionWithDifferentiabilityInfo]] = [[] for _ in range(num_shards)] # functions are assigned arbitrarily but stably to a file based on hash - for decl in aten_declarations: - x = sum(ord(c) for c in decl['name']) % num_shards - shards[x].append(decl) + for fn in fns_with_infos: + x = sum(ord(c) for c in cpp.name(fn.func.func)) % num_shards + shards[x].append(fn) for i, shard in enumerate(shards): - gen_variable_type_shard(out, shard, template_path, '_%d' % i, False) - gen_variable_type_shard(out, aten_declarations, template_path, 'Everything', False) - - -def gen_variable_type_shard(out, aten_declarations, template_path, suffix, header): - VARIABLE_TYPE_H = CodeTemplate.from_file(template_path + '/VariableType.h') - VARIABLE_TYPE_CPP = CodeTemplate.from_file(template_path + '/VariableType.cpp') + gen_variable_type_shard(fm, shard, 'VariableType.cpp', f'VariableType_{i}.cpp') - type_declarations = [] - type_definitions = [] - wrapper_registrations = [] + gen_variable_type_shard(fm, fns_with_infos, 'VariableType.cpp', 'VariableTypeEverything.cpp') - for declaration in aten_declarations: - if declaration['use_c10_dispatcher'] in ['full', 'hacky_wrapper_for_legacy_signatures']: - formals = declaration['schema_order_formals'] - else: - assert declaration['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper' - formals = declaration['formals'] - type_declarations.append(METHOD_DECLARATION.substitute(declaration, formals=formals)) - strategy = dispatch_strategy(declaration) - if declaration['name'] not in MANUAL_AUTOGRAD and strategy == 'use_derived': - body = emit_body(declaration) +@with_native_function +def gen_formals(f: NativeFunction) -> str: + if f.use_c10_dispatcher.dispatcher_uses_new_style(): + formals = ', '.join( + f'{cpp.argument_type(a, binds="__placeholder__").cpp_type()} {a.name}' + for a in f.func.schema_order_arguments() + ) + else: + sig_group = CppSignatureGroup.from_native_function(f, method=False) + formals = ', '.join(f'{a.type} {a.name}' for a in sig_group.signature.arguments()) + return formals +@with_native_function +def gen_wrapper_registration(f: NativeFunction) -> str: + if f.use_c10_dispatcher.dispatcher_uses_new_style(): + return WRAPPER_REGISTRATION.substitute( + unqual_operator_name_with_overload=f.func.name, + type_wrapper_name=type_wrapper_name(f), + class_type='VariableType', + ) + else: + return UNBOXEDONLY_WRAPPER_REGISTRATION.substitute( + unqual_operator_name_with_overload=f.func.name, + type_wrapper_name=type_wrapper_name(f), + class_type='VariableType', + ) + +def gen_variable_type_shard( + fm: FileManager, + fns_with_infos: List[NativeFunctionWithDifferentiabilityInfo], + template_name: str, + output_name: str, +) -> None: + type_declarations: List[str] = [] + type_definitions: List[str] = [] + wrapper_registrations: List[str] = [] + + for fn in fns_with_infos: + f = fn.func + name = cpp.name(f.func) + formals = gen_formals(f) + + type_declarations.append(METHOD_DECLARATION.substitute( + return_type=cpp.returns_type(f.func.returns), + type_wrapper_name=type_wrapper_name(f), + formals=formals, + )) + + if name not in MANUAL_AUTOGRAD and dispatch_strategy(fn) == 'use_derived': type_definitions.append(METHOD_DEFINITION.substitute( - declaration, type_definition_body=body, formals=formals)) - if declaration['use_c10_dispatcher'] in ['full', 'hacky_wrapper_for_legacy_signatures']: - wrapper_registrations.append(WRAPPER_REGISTRATION.substitute( - declaration, 
class_type='VariableType')) - else: - assert declaration['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper' - wrapper_registrations.append(UNBOXEDONLY_WRAPPER_REGISTRATION.substitute( - declaration, class_type='VariableType')) + return_type=cpp.returns_type(f.func.returns), + type_wrapper_name=type_wrapper_name(f), + type_definition_body=emit_body(fn), + formals=formals, + )) + wrapper_registrations.append(gen_wrapper_registration(f)) # See Note [Manual Backend kernels] - assert (declaration['name'] in MANUAL_BACKEND) == declaration['manual_kernel_registration'] + assert (name in MANUAL_BACKEND) == f.manual_kernel_registration # If you want to register a kernel to Autograd, you must make the op abstract. # In other words, this op must have dispatch section in native_functions.yaml. - if declaration['name'] in MANUAL_AUTOGRAD_AND_TRACER or declaration['derivative']: - msg = (f'There\'s a formula for {declaration["name"]}(or its functional variant) in derivatives.yaml. ' + if name in MANUAL_AUTOGRAD_AND_TRACER or (fn.info and fn.info.has_derivatives): + msg = (f'There\'s a formula for {name}(or its functional variant) in derivatives.yaml. ' f'It\'s required to add a dispatch section for it with explicit supported backends e.g CPU/CUDA ' f'or DefaultBackend in native_functions.yaml. Please see ' f'https://github.com/pytorch/pytorch/tree/master/aten/src/ATen/native#choosing-the-right-dispatch-keyword ' f'for instructions to choose the right dispatch keyword.') - assert declaration['abstract'], msg + assert f.is_abstract, msg - env = { + fm.write_with_template(output_name, template_name, lambda: { + 'generated_comment': '@' + f'generated from {fm.template_dir}/{template_name}', 'type_derived_method_declarations': type_declarations, 'type_derived_method_definitions': type_definitions, 'wrapper_registrations': wrapper_registrations, - } - if header: - write(out, 'VariableType.h', VARIABLE_TYPE_H, env) - else: - write(out, 'VariableType%s.cpp' % suffix, VARIABLE_TYPE_CPP, env) - - -def emit_body(declaration): - assert dispatch_strategy(declaration) == 'use_derived' - - arguments = declaration['arguments'] - returns = declaration['returns'] - func = declaration['derivative'] - name = declaration['name'] - inplace = declaration['inplace'] - is_out_fn = name.endswith('_out') - modifies_arguments = inplace or is_out_fn - returns_void = len(returns) == 0 - - base_name = name[:-1] if inplace else name[:-4] if is_out_fn else name + }) + +def emit_body(fn: NativeFunctionWithDifferentiabilityInfo) -> List[str]: + assert dispatch_strategy(fn) == 'use_derived' + f = fn.func + info = fn.info + + name = cpp.name(f.func) + inplace = f.func.kind() == SchemaKind.inplace + is_out_fn = f.func.kind() == SchemaKind.out + returns_void = len(f.func.returns) == 0 + base_name = f.func.name.name.base # TODO: should be str(f.func.name.name)? 
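The flags computed above all fall out of the parsed schema. A short sketch of how a couple of overloads of a hypothetical add operator classify, assuming the tools.codegen.model API used throughout this file and run from the repository root:

```python
from tools.codegen.model import FunctionSchema, SchemaKind

functional = FunctionSchema.parse(
    'add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor')
inplace = FunctionSchema.parse(
    'add_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!)')

assert functional.kind() == SchemaKind.functional
assert inplace.kind() == SchemaKind.inplace
# name.name.base drops the trailing underscore, so both overloads share the
# base name that VIEW_FUNCTIONS and DONT_REQUIRE_DERIVATIVE are keyed on.
assert functional.name.name.base == inplace.name.name.base == 'add'
```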
view_info = VIEW_FUNCTIONS.get(base_name, None) if view_info is None and base_name in RETURNS_VIEWS_OF_INPUT: view_info = "self" - def is_differentiable(arg): - if 'TensorOptions' in arg['type']: - return False - if 'Tensor' not in arg['type']: - return False - if arg['name'] in declaration.get('non_differentiable_arg_names', []): - return False - return True - - def find_args_with_derivatives(differentiable_inputs): + def is_differentiable(name: str, type: Type) -> bool: + return type.is_tensor_like() and (info is None or name not in info.non_differentiable_arg_names) + + def gen_differentiable_input( + arg: Union[Argument, SelfArgument, TensorOptionsArguments] + ) -> Optional[DifferentiableInput]: + if isinstance(arg, TensorOptionsArguments): + return None + a: Argument = arg.argument if isinstance(arg, SelfArgument) else arg + + # TODO: `cpp_type` is only to keep it byte-for-byte compatible with the old codegen, should remove. + # NB: This is not a clone of cpp.argument() - TensorOptionsArguments / faithful / binds are + # not handled properly as they are irrelevant for this codegen. + cpp_type = cpp.argument_type(a, binds=a.name).cpp_type() + + if not is_differentiable(a.name, a.type): + return None + return DifferentiableInput( + name=a.name, + type=a.type, + cpp_type=cpp_type, + ) + + @with_native_function + def gen_differentiable_inputs(f: NativeFunction) -> List[DifferentiableInput]: + return list(mapMaybe(gen_differentiable_input, f.func.arguments.non_out)) + + def find_args_with_derivatives(differentiable_inputs: List[DifferentiableInput]) -> List[DifferentiableInput]: """Find arguments that have derivative definitions""" - if func is None: + if info is None or not info.has_derivatives: return differentiable_inputs - names = set(name for d in func.derivatives for name in d.var_names) - differentiable = [arg for arg in differentiable_inputs if arg['name'] in names] + names = set(name for d in info.derivatives for name in d.var_names) + differentiable = [arg for arg in differentiable_inputs if arg.name in names] if len(differentiable) != len(names): - missing = names - set(arg['name'] for arg in differentiable) - raise RuntimeError(f'Missing arguments for derivatives: {missing} in {func.name}') + missing = names - set(arg.name for arg in differentiable) + raise RuntimeError(f'Missing arguments for derivatives: {missing} in {info.name}') return differentiable - inputs = [arg for arg in arguments if not arg.get('output', False)] - differentiable_inputs = list(filter(is_differentiable, inputs)) + def gen_differentiable_outputs(f: NativeFunction) -> List[DifferentiableOutput]: + outputs: List[DifferentiableOutput] = [ + DifferentiableOutput(name=name, type=ret.type, cpp_type=cpp.return_type(ret)) + for name, ret in zip(cpp.return_names(f), f.func.returns)] + + output_differentiability = info.output_differentiability if info else None + if output_differentiability is not None: + differentiable_outputs: List[DifferentiableOutput] = [] + if False in output_differentiability and f.func.kind() == SchemaKind.inplace: + raise RuntimeError("output_differentiability=False for inplace operation (version_counter won't get updated)") + for differentiable, output in zip(output_differentiability, outputs): + if differentiable: + differentiable_outputs.append(output) + return differentiable_outputs + + candidate_differentiable_outputs = list(filter(lambda r: is_differentiable(r.name, r.type), outputs)) + + if uses_single_grad(info): + return candidate_differentiable_outputs[:1] + else: + return 
candidate_differentiable_outputs + + differentiable_inputs = gen_differentiable_inputs(f) args_with_derivatives = find_args_with_derivatives(differentiable_inputs) - non_differentiable_arg_names = declaration.get('non_differentiable_arg_names', []) - candidate_differentiable_outputs = list(filter(is_differentiable, returns)) - - if declaration['output_differentiability'] is not None: - differentiable_outputs = [] - output_differentiability = declaration['output_differentiability'] - if False in output_differentiability and inplace: - raise RuntimeError("output_differentiability=False for inplace operation (version_counter won't get updated)") - for differentiable, output in zip(output_differentiability, returns): - if differentiable: - differentiable_outputs.append(output) - elif uses_single_grad(func): - differentiable_outputs = candidate_differentiable_outputs[:1] - else: - differentiable_outputs = candidate_differentiable_outputs + differentiable_outputs = gen_differentiable_outputs(f) requires_derivative = ( base_name not in DONT_REQUIRE_DERIVATIVE and name not in DONT_REQUIRE_DERIVATIVE and len(differentiable_inputs) > 0 and len(differentiable_outputs) > 0) - if func is not None and not requires_derivative: - raise RuntimeError('ERROR: derivative ignored for {} -- specified an autograd function without derivative' - .format(name)) + if info is not None and info.has_derivatives and not requires_derivative: + raise RuntimeError(f'ERROR: derivative ignored for {name} -- specified an autograd function without derivative') - def emit_save_inputs(): - setup = [] - if func is None: + def emit_save_inputs() -> List[str]: + setup: List[str] = [] + if info is None or not info.has_derivatives: return setup - has_tensorlist_arg = any(arg.type == 'TensorList' for arg in func.args_with_derivatives) + has_tensorlist_arg = any(is_tensor_list_type(arg.type) for arg in args_with_derivatives) # We don't want to save tensors if we know that they will never be used # when computing the derivative, so we add guards to those statements def guard_for(arg: SavedAttribute) -> Optional[str]: + assert info is not None + # It's hard to determine the edge offset if we have TensorLists if has_tensorlist_arg: return None @@ -481,12 +530,12 @@ def guard_for(arg: SavedAttribute) -> Optional[str]: # require_grad if the backward function even gets executed. I don't # have any good ideas for detecting those cases, so I simply disabled the # checks. 
- if 'backward' in func.name: + if 'backward' in info.name: return None # If there's a single derivative we could compute, we already have # a requires_grad check that is sufficient - if len(func.args_with_derivatives) <= 1: + if len(args_with_derivatives) <= 1: return None # We really only care about trimming down the amount of tensors we save @@ -495,7 +544,7 @@ def guard_for(arg: SavedAttribute) -> Optional[str]: # We want to emit simple guards, so we only allow that if checking one # input is enough to determine whether we need that value - used_in = [d for d in func.derivatives if arg in d.saved_inputs] + used_in = [d for d in info.derivatives if arg in d.saved_inputs] assert len(used_in) > 0 if len(used_in) != 1: return None @@ -505,75 +554,76 @@ def guard_for(arg: SavedAttribute) -> Optional[str]: derivative_var_name = derivative.var_names[0] # Figure out the offset of the edge that uses this variable - for edge_off, arg in enumerate(func.args_with_derivatives): - if arg.name == derivative_var_name: + for edge_off, a in enumerate(args_with_derivatives): + if a.name == derivative_var_name: break else: raise AssertionError() return f'grad_fn->should_compute_output({edge_off})' - setup.extend(save_variables(func.all_saved_inputs, False, guard_for)) - for arg in func.args_with_derivatives: - if arg.type == 'TensorList': + setup.extend(save_variables(info.all_saved_inputs, False, guard_for)) + for arg in args_with_derivatives: + if is_tensor_list_type(arg.type): setup.append(f'grad_fn->{arg.name}_size_ = {arg.name}.size();') return setup - def setup_derivative(differentiable_inputs): - env = {} - env['args_with_derivatives'] = [arg['name'] for arg in args_with_derivatives] - env['op'] = func.op if func is not None else 'NotImplemented' - env['op_ctor'] = '' if func is not None else '"{}"'.format(declaration['api_name']) - + def setup_derivative(differentiable_inputs: List[DifferentiableInput]) -> List[str]: + body: List[str] = [] if is_out_fn: # For out functions, ensure that no input or output requires grad - body = [] body.append(DECLARE_GRAD_FN.substitute(op='Node')) body.append(SETUP_NONE_REQUIRES_GRAD.substitute( base_name=base_name, - args_to_check=[arg['name'] for arg in differentiable_inputs])) + args_to_check=[arg.name for arg in differentiable_inputs])) body.append(SETUP_NONE_REQUIRES_GRAD.substitute( base_name=base_name, - args_to_check=[arg['name'] for arg in differentiable_outputs])) + args_to_check=[arg.name for arg in differentiable_outputs])) return body + op = info.op if info is not None and info.has_derivatives else 'NotImplemented' setup = [] - setup.extend(ASSIGN_GRAD_FN.substitute(env).split('\n')) + setup.extend(ASSIGN_GRAD_FN.substitute( + op=op, + op_ctor='' if info is not None and info.has_derivatives else f'"{cpp.name(f.func)}"', + args_with_derivatives=[arg.name for arg in args_with_derivatives], + ).split('\n')) setup.extend(emit_save_inputs()) - body = [] body.extend(emit_check_no_requires_grad(differentiable_inputs, args_with_derivatives)) - body.append(DECLARE_GRAD_FN.substitute(env)) + body.append(DECLARE_GRAD_FN.substitute(op=op)) body.append(SETUP_DERIVATIVE.substitute(setup=setup)) return body - def emit_check_if_in_complex_autograd_allowlist(): - body = [] + def emit_check_if_in_complex_autograd_allowlist() -> List[str]: + body: List[str] = [] if base_name in GRADIENT_IMPLEMENTED_FOR_COMPLEX: return body for arg in differentiable_outputs: - name = arg['name'] - if arg['type'] == 'Tensor' or arg['type'] == 'TensorList': - 
body.append('throw_error_for_complex_autograd({}, "{}");'.format(name, base_name)) + name = arg.name + # TODO: should be `arg.type.is_tensor_like()`? + if arg.cpp_type in ['Tensor', 'TensorList', 'const c10::List> &']: + body.append(f'throw_error_for_complex_autograd({name}, "{base_name}");') return body - def emit_check_no_requires_grad(tensor_args, args_with_derivatives): + def emit_check_no_requires_grad( + tensor_args: List[DifferentiableInput], + args_with_derivatives: List[DifferentiableInput], + ) -> List[str]: """Checks that arguments without derivatives don't require grad""" - body = [] + body: List[str] = [] for arg in tensor_args: if arg in args_with_derivatives: continue - name = arg['name'] - if name in non_differentiable_arg_names: + name = arg.name + if info and name in info.non_differentiable_arg_names: continue if name == 'output': # Double-backwards definitions sometimes take in 'input' and # 'output', but only define the derivative for input. continue - if arg['dynamic_type'] in {'IndexTensor', 'ByteTensor', 'BoolTensor'}: - continue - body.append('check_no_requires_grad({}, "{}");'.format(name, name)) + body.append(f'check_no_requires_grad({name}, "{name}");') return body def save_variables( @@ -599,7 +649,7 @@ def save_variables( expr = f'SavedVariable({var}, {str(is_output).lower()}, {is_inplace_view})' else: expr = f'SavedVariable({var}, {str(is_output).lower()})' - elif arg.type == 'TensorList': + elif arg.type in ['TensorList', 'c10::List>']: name += '_' expr = f'make_saved_variable_list({arg.name})' elif arg.type == 'IntArrayRef': @@ -613,42 +663,40 @@ def save_variables( stmts.append('}') return stmts - def emit_dispatch_call(api_name, input_base, unpacked_args): + def emit_dispatch_call(f: NativeFunction, input_base: str, unpacked_args: Sequence[str]) -> str: """ Dispatch call via function in a namespace or method on Tensor.""" - if 'namespace' in declaration['method_of']: - if declaration['use_c10_dispatcher'] in ['hacky_wrapper_for_legacy_signatures', 'full']: - dispatcher_api_name = make_out_api_name_faithful(api_name) - else: - assert declaration['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper' - dispatcher_api_name = api_name + if Variant.function in f.variants: call = CALL_DISPATCH_VIA_NAMESPACE.substitute( - api_name=dispatcher_api_name, + api_name=cpp.name( + f.func, + faithful_name_for_out_overloads=f.use_c10_dispatcher.dispatcher_uses_new_style(), + ), unpacked_args=unpacked_args) else: call = CALL_DISPATCH_VIA_METHOD.substitute( - api_name=api_name, + api_name=cpp.name(f.func), var=input_base, unpacked_method_args=unpacked_args[1:]) return call - def emit_view_lambda(): + def emit_view_lambda(unpacked_bindings: List[Binding]) -> str: """ Generate an additional lambda function to recover views in backward when as_strided is not supported. 
See Note [View + Inplace update for base tensor] and [View + Inplace update for view tensor] for more details.""" input_base = 'input_base' replay_view_func = '' - updated_unpacked_args = [] - combined = nested_dict(env, declaration) - known_view_arg_simple_types = ['int64_t', 'int64_t?', 'bool', 'IntArrayRef'] - for arg in combined['unpacked_args']: + updated_unpacked_args: List[str] = [] + known_view_arg_simple_types: List[str] = ['int64_t', 'c10::optional', 'bool', 'IntArrayRef'] + for unpacked_binding in unpacked_bindings: + arg, arg_type = unpacked_binding.name, unpacked_binding.type if arg == 'self_': updated_unpacked_args.append(input_base) continue - arg_type = combined['unpacked_args_simple_type'][arg] if arg_type not in known_view_arg_simple_types: - raise TypeError('You are adding an {} {} argument to op {} in addition to known types: {}. ' - 'Please update the list or materialize it so that it can be closed over by value, ' - 'also add a test in pytorch/xla/test/test_operations.py where this code is exercised.' - .format(arg_type, arg, declaration['name'], ', '.join(known_view_arg_simple_types))) + known_types_str = ', '.join(known_view_arg_simple_types) + raise TypeError(f'You are adding an {arg_type} {arg} argument to op {cpp.name(f.func)} in addition to known types: ' + f'{known_types_str}. Please update the list or materialize it so that it can be closed ' + 'over by value, also add a test in pytorch/xla/test/test_operations.py where this code ' + 'is exercised.') if arg_type == 'IntArrayRef': # It's not safe to close over IntArrayRef by value, since this is a @@ -656,7 +704,7 @@ def emit_view_lambda(): arg_vec = arg + '_vec' replay_view_func += ARRAYREF_TO_VEC.substitute(arg=arg, vec=arg_vec) updated_unpacked_args.append(arg_vec) - elif arg_type == 'int64_t?': + elif arg_type == 'c10::optional': # Materialize int64_t? to int64_t arg_value = arg + '_val' replay_view_func += OPTIONAL_TO_VAL.substitute(arg=arg, val=arg_value, default='0') @@ -664,7 +712,7 @@ def emit_view_lambda(): else: updated_unpacked_args.append(arg) - replay_view_call = emit_dispatch_call(combined['api_name'], input_base, updated_unpacked_args) + replay_view_call = emit_dispatch_call(f, input_base, updated_unpacked_args) replay_view_func += REPLAY_VIEW_LAMBDA_FUNC.substitute( input_base=input_base, replay_view_call=replay_view_call) @@ -675,17 +723,17 @@ def emit_view_lambda(): is_view_with_metadata_change=is_view_with_metadata_change, replay_view_func=replay_view_func) - def wrap_output(return_values, var): + def wrap_output(f: NativeFunction, unpacked_bindings: List[Binding], var: str) -> str: call = '' - rhs_value = None - if 'Tensor' not in declaration['return_type']: + rhs_value: Optional[str] = None + if not any(r.type.is_tensor_like() for r in f.func.returns): rhs_value = var elif view_info is not None: # See NOTE [ Autograd View Variables ] in variable.h for details. 
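wrap_output is where outputs get registered as autograd views of their base; the closure built by emit_view_lambda above is what it hands to as_view so that, if the view or its base is later modified in place, autograd can rebuild the view from a fresh base without going through as_strided. In Python terms that closure behaves roughly like the sketch below (narrow is only an example view op; the arguments are captured by value because the lambda must outlive the original call):

```python
# Python analogy of the generated view-replay closure, not the actual C++.
def make_replay_view_func(dim, start, length):
    def replay(input_base):
        # Re-create the same view, but on a (possibly new) base tensor.
        return input_base.narrow(dim, start, length)
    return replay
```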
- differentiable_output_vars = {r['name'] for r in differentiable_outputs} + differentiable_output_vars = {r.name for r in differentiable_outputs} if not isinstance(view_info, str): - raise TypeError("The view info should be a string for {}, but it is: {}".format(base_name, view_info)) + raise TypeError(f'The view info should be a string for {base_name}, but it is: {view_info}') if len(differentiable_output_vars) == 0: # no output is differentiable (.indices() for SparseTensors for example) @@ -694,49 +742,55 @@ def wrap_output(return_values, var): # Single differentiable output (Tensor or Tensor[]) return_info = differentiable_outputs[0] # We only support simple Tensor or a TensorList for functions that return views - if not return_info['dynamic_type'] in ['Tensor', 'TensorList']: - raise RuntimeError("{} that return differentiable views can only return Tensor or Tensor[]".format(base_name)) + if not is_tensor_type(return_info.type) and not is_tensor_list_type(return_info.type): + raise RuntimeError(f'{base_name} that return differentiable views can only return Tensor or Tensor[]') # Only allow rebasing of the history if we return a single Tensor # If we are in a no grad block, raise a warning # See NOTE [ View + Inplace detection ] for more details about this logic - if return_info['dynamic_type'] == 'TensorList': + if is_tensor_list_type(return_info.type): if base_name in MULTI_OUTPUT_SAFE_FUNCTIONS: - creation_meta = "CreationMeta::MULTI_OUTPUT_SAFE" + creation_meta = 'CreationMeta::MULTI_OUTPUT_SAFE' else: - creation_meta = "CreationMeta::MULTI_OUTPUT_NODE" - call += ("as_view(/* base */ {}, /* output */ {}, /* is_bw_differentiable */ true, " - "/* is_fw_differentiable */ true, " - "/* creation_meta */ {});").format(view_info, var, creation_meta) - rhs_value = 'std::move({})'.format(var) + creation_meta = 'CreationMeta::MULTI_OUTPUT_NODE' + call += (f'as_view(/* base */ {view_info}, /* output */ {var}, /* is_bw_differentiable */ true, ' + '/* is_fw_differentiable */ true, ' + f'/* creation_meta */ {creation_meta});') + rhs_value = f'std::move({var})' else: - call += emit_view_lambda() - creation_meta = "GradMode::is_enabled() ? CreationMeta::DEFAULT: CreationMeta::NO_GRAD_MODE" - rhs_value = ("as_view(/* base */ {}, /* output */ {}, /* is_bw_differentiable */ true, " - "/* is_fw_differentiable */ true, " - "/* view_func */ func, /* creation_meta */ {})").format(view_info, var, creation_meta) + call += emit_view_lambda(unpacked_bindings) + creation_meta = 'GradMode::is_enabled() ? CreationMeta::DEFAULT: CreationMeta::NO_GRAD_MODE' + rhs_value = (f'as_view(/* base */ {view_info}, /* output */ {var}, /* is_bw_differentiable */ true, ' + '/* is_fw_differentiable */ true, ' + f'/* view_func */ func, /* creation_meta */ {creation_meta})') else: # This could be supported but we don't need it at the moment, so keeping things simple. 
- raise RuntimeError("Function that return multiple differentiable output " - "when at least one of them is view is not supported.") + raise RuntimeError('Function that return multiple differentiable output ' + 'when at least one of them is view is not supported.') else: - rhs_value = 'std::move({})'.format(var) + rhs_value = f'std::move({var})' assert rhs_value is not None - call += ASSIGN_RETURN_VALUE.substitute(return_values=return_values, + call += ASSIGN_RETURN_VALUE.substitute(return_values=tie_return_values(f), rhs_value=rhs_value) return call - def enforce_same_tensorimpl_and_storage(env, call): - save_ptrs_stmts = [] - enforce_same_ptrs_stmts = [] - if declaration['name'] not in DONT_ENFORCE_SAME_TENSOR_IMPL_OR_STORAGE: - for arg in env.get('unpacked_args', []): - simple_type = env['unpacked_args_simple_type'][arg] - if simple_type == 'TensorList': + def enforce_same_tensorimpl_and_storage(call: str, unpacked_bindings: List[Binding]) -> str: + save_ptrs_stmts: List[str] = [] + enforce_same_ptrs_stmts: List[str] = [] + if cpp.name(f.func) not in DONT_ENFORCE_SAME_TENSOR_IMPL_OR_STORAGE: + for unpacked_binding in unpacked_bindings: + arg = unpacked_binding.name + noref_cpp_type = unpacked_binding.ctype.cpp_type(strip_ref=True) + if noref_cpp_type == 'TensorList': save_ptrs_stmts += [SAVE_TENSORLIST_STORAGE.substitute(tensorlist_name=arg), SAVE_TENSORLIST_IMPL.substitute(tensorlist_name=arg)] enforce_same_ptrs_stmts += [ENFORCE_SAME_TENSORLIST_STORAGE.substitute(tensorlist_name=arg), ENFORCE_SAME_TENSORLIST_IMPL.substitute(tensorlist_name=arg)] - elif simple_type == 'Tensor': + elif noref_cpp_type == 'c10::List>': + save_ptrs_stmts += [SAVE_OPTIONALTENSORLIST_STORAGE.substitute(tensorlist_name=arg), + SAVE_OPTIONALTENSORLIST_IMPL.substitute(tensorlist_name=arg)] + enforce_same_ptrs_stmts += [ENFORCE_SAME_OPTIONALTENSORLIST_STORAGE.substitute(tensorlist_name=arg), + ENFORCE_SAME_OPTIONALTENSORLIST_IMPL.substitute(tensorlist_name=arg)] + elif noref_cpp_type == 'Tensor': save_ptrs_stmts += [SAVE_TENSOR_STORAGE.substitute(tensor_name=arg), SAVE_TENSOR_IMPL.substitute(tensor_name=arg)] enforce_same_ptrs_stmts += [ENFORCE_SAME_TENSOR_STORAGE.substitute(tensor_name=arg), @@ -748,74 +802,69 @@ def enforce_same_tensorimpl_and_storage(env, call): RUN_ONLY_IN_DEBUG_MODE.substitute(statements=enforce_same_ptrs_stmts) return call - def emit_call(env, tie_return_values): - combined = nested_dict(env, declaration) + def emit_call(f: NativeFunction, unpacked_bindings: List[Binding]) -> str: # We only care about adding `at::AutoNonVariableTypeMode` guard for non-variable dispatch # (which corresponds to 'use_derived' strategy). The purpose of this guard is to make sure # the baseType operations still dispatch to non-Variable type, even if the arguments passed # in are now Variables. # See NOTE [ Treating Variables as non-Variables in type dispatch ] for details. 
- base_type_call = emit_dispatch_call(combined['api_name'], 'self_', combined['unpacked_args']) - if not modifies_arguments and not returns_void: + unpacked_args = [b.name for b in unpacked_bindings] + base_type_call = emit_dispatch_call(f, 'self_', unpacked_args) + if not modifies_arguments(f) and not returns_void: call = DISPATCH_TO_NON_VAR_TYPE_WITH_TMP_RETURN_VALUES.substitute( base_type_call=base_type_call) - call += wrap_output(tie_return_values, 'tmp') + call += wrap_output(f, unpacked_bindings, 'tmp') else: call = DISPATCH_TO_NON_VAR_TYPE_WITHOUT_RETURN_VALUES.substitute( base_type_call=base_type_call) - call = enforce_same_tensorimpl_and_storage(env, call) + call = enforce_same_tensorimpl_and_storage(call, unpacked_bindings) return call - def emit_history(): - fn = 'rebase' if modifies_arguments and view_info is None else 'set' - output_names = [r['name'] for r in differentiable_outputs] + def emit_history() -> str: + fn = 'rebase' if modifies_arguments(f) and view_info is None else 'set' + output_names = [r.name for r in differentiable_outputs] # TODO: flatten allocates a std::vector, which could be expensive outs = CodeTemplate("flatten_tensor_args( ${outs} )").substitute(outs=output_names) return SET_HISTORY.substitute(fn=fn, differentiable_outputs=outs) - def emit_save_outputs(): + def emit_save_outputs() -> str: if is_out_fn: # out functions don't currently support differentiation return '' - func = declaration['derivative'] - if func is not None: - stmts = save_variables(func.all_saved_outputs, True) + if info is not None and info.has_derivatives: + stmts = save_variables(info.all_saved_outputs, True) if len(stmts) == 0: return '' return CONDITIONAL.substitute(cond='grad_fn', statements=stmts) return '' - def emit_any_requires_grad(): + def emit_any_requires_grad() -> List[str]: return [SETUP_ANY_REQUIRES_GRAD.substitute( - args_with_derivatives=[arg['name'] for arg in args_with_derivatives]), ] + args_with_derivatives=[arg.name for arg in args_with_derivatives]), ] - def emit_check_inplace(): + def emit_check_inplace() -> List[str]: if not inplace: return [] - return ['check_inplace({}, _any_requires_grad);'.format(arg['name']) for arg in differentiable_outputs] + return [f'check_inplace({arg.name}, _any_requires_grad);' for arg in differentiable_outputs] - def emit_increment_version(): - if not modifies_arguments: + def emit_increment_version(f: NativeFunction) -> List[str]: + if not modifies_arguments(f): return [] - return ['increment_version({});'.format(arg['name']) for arg in returns] - - env = {} - combined = nested_dict(env, declaration) + return [f'increment_version({r});' for r in cpp.return_names(f)] - body = [] + body: List[str] = [] + unpack_args_stats, unpacked_bindings = unpack_args(f) - declare_returned_variables, tie_return_values, get_return_value = format_return_variables(declaration) - - body.extend(unpack_args(env, declaration)) + body.extend(unpack_args_stats) if requires_derivative: body.extend(emit_any_requires_grad()) body.extend(emit_check_inplace()) body.extend(setup_derivative(differentiable_inputs)) - body.append(declare_returned_variables) + body.append(declare_returned_variables(f)) - body.append(emit_call(env, tie_return_values)) - body.extend(emit_increment_version()) + body.append(emit_call(f, unpacked_bindings)) + body.extend(emit_increment_version(f)) if requires_derivative: # set_flags has to appear after version_counter, because rebase_history # requires that the counter is incremented before it is called @@ -830,57 +879,54 @@ def 
emit_increment_version(): assert inplace body.append('reset_grad_accumulator(self);') if not returns_void: - body.append('return {};'.format(get_return_value)) + body.append(f'return {get_return_value(f)};') return body - -def unpack_args(env, declaration): - def requires_unpack(arg): - return 'Tensor' in arg['dynamic_type'] - - body = [] - unpacked_args = [] - unpacked_args_simple_type = {} - if declaration['use_c10_dispatcher'] in ['full', 'hacky_wrapper_for_legacy_signatures']: - arguments = declaration['schema_order_arguments'] +@with_native_function +def unpack_args(f: NativeFunction) -> Tuple[List[str], List[Binding]]: + body: List[str] = [] + unpacked_bindings: List[Binding] = [] + + if f.use_c10_dispatcher.dispatcher_uses_new_style(): + bindings = [r for a in f.func.schema_order_arguments() + for r in cpp.argument(a, + method=False, + cpp_no_default_args=set(), + faithful=False, + has_tensor_options=False)] else: - assert declaration['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper' - arguments = declaration['arguments'] - for i, arg in enumerate(arguments): - if not requires_unpack(arg): - unpacked_args.append(arg['name']) - unpacked_args_simple_type[arg['name']] = arg['simple_type'] - continue + sig_group = CppSignatureGroup.from_native_function(f, method=False) + bindings = list(sig_group.signature.arguments()) - dynamic_type = arg['dynamic_type'] - if 'TensorOptions' not in dynamic_type: - is_nullable = arg.get('is_nullable', False) - ref = (not is_nullable) and dynamic_type not in ['TensorList'] - suffix = '_opt' if is_nullable and dynamic_type != 'TensorList' else '' - - body.append(UNPACK_TENSOR.substitute( - arg_name=arg['name'], - arg_pos=i, - suffix=suffix, - ref='&' if ref else '', - )) - else: - # Okay, we are abusing the definition of 'unpack' here a bit, - # although it's still getting the non-variable from the variable - # (in this case via TensorOptions rather than Variable/Tensor). - assert declaration['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper', \ - "VariableKernel shouldn't take TensorOptions if the op is c10-full" - body.append(LEGACY_WRAP_OPTIONS.substitute(arg_name=arg['name'])) - - unpacked_args.append(arg['name'] + '_') - unpacked_args_simple_type[arg['name'] + '_'] = arg['simple_type'] - - env['unpacked_args'] = unpacked_args - env['unpacked_args_simple_type'] = unpacked_args_simple_type - return body + for i, binding in enumerate(bindings): + assert not isinstance(binding.argument, SelfArgument) + if isinstance(binding.argument, TensorOptionsArguments): + raise RuntimeError("VariableKernel shouldn't take TensorOptions") + is_nullable = binding.argument.type.is_nullable() + if not binding.argument.type.is_tensor_like() or is_nullable: + unpacked_bindings.append(binding) + continue -def dispatch_strategy(declaration): + is_tensor_list = is_tensor_list_type(binding.argument.type) + ref = (not is_nullable) and not is_tensor_list + suffix = '_opt' if is_nullable and not is_tensor_list else '' + body.append(UNPACK_TENSOR.substitute( + arg_name=binding.name, + arg_pos=i, + suffix=suffix, + ref='&' if ref else '', + )) + unpacked_bindings.append(Binding( + name=binding.name + '_', + ctype=binding.ctype, + argument=binding.argument, + default=binding.default, + )) + + return body, unpacked_bindings + +def dispatch_strategy(fn: NativeFunctionWithDifferentiabilityInfo) -> str: """How are we going to call the underlying implementation of a declaration? 
There are two strategies: @@ -900,7 +946,7 @@ def dispatch_strategy(declaration): get dispatched back to VariableType (which will ensure that they are differentiable.) """ - if declaration['abstract'] or declaration['derivative'] is not None: + if fn.func.is_abstract or (fn.info is not None and fn.info.has_derivatives): # If the function is abstract (not implemented on at::Type), we must # call the implementation on the derived type with unpacked tensors. @@ -924,62 +970,47 @@ def dispatch_strategy(declaration): # assumption might not hold, but then you'll see gradcheck fail.) return 'use_type' -def get_decl_signature(declaration: Dict[Any, Any], use_base_variant: bool = False) -> str: - name = declaration['name'] - arguments = declaration['arguments'] - if use_base_variant: - if declaration['inplace']: - assert name.endswith('_') - name = name[:-1] - elif name.endswith('_out'): - name = name[:-4] - arguments = [arg for arg in arguments if not arg.get('output', False)] - simple_types = ', '.join(arg['simple_type'] for arg in arguments) - return f'{name}({simple_types})' +def is_tensor_type(t: Type) -> bool: + # TODO: Should handle optional here? + return t.is_tensor_like() and t.is_list_like() is None -@with_native_function -def get_func_signature(f: NativeFunction) -> str: - args = CppSignatureGroup.from_schema(f.func, method=False).signature.arguments() - types = ', '.join(python.argument_type_str(a.argument.type, simple_type=True) - if isinstance(a.argument, Argument) else 'TensorOptions' - for a in args) - return f'{cpp.name(f.func)}({types})' - -def match_declarations_with_differentiability_info( - declarations: Dict[Any, Any], +def is_tensor_list_type(t: Type) -> bool: + # TODO: Should handle optional here? + return t.is_tensor_like() and t.is_list_like() is not None + +def modifies_arguments(f: NativeFunction) -> bool: + return f.func.kind() in [SchemaKind.inplace, SchemaKind.out] + +def match_differentiability_info( + native_functions: List[NativeFunction], differentiability_infos: Sequence[DifferentiabilityInfo], -) -> None: +) -> List[NativeFunctionWithDifferentiabilityInfo]: """Sets the "derivative" key on declarations to matching autograd function In-place functions will use the out-of-place derivative definition if there is no in-place specific derivative. """ - info_by_signature = {get_func_signature(info.func): info for info in differentiability_infos} + info_by_schema = {info.func.func: info for info in differentiability_infos} + functional_info_by_signature = { + info.func.func.signature(strip_default=True): info + for info in differentiability_infos + if info.func.func.kind() == SchemaKind.functional} - def find_info(declaration: Dict[Any, Any]) -> Optional[DifferentiabilityInfo]: - signature = get_decl_signature(declaration) - if signature in info_by_signature: - return info_by_signature[signature] + def find_info(f: NativeFunction) -> Tuple[Optional[DifferentiabilityInfo], bool]: + if f.func in info_by_schema: + return info_by_schema[f.func], True # if there is no exact match look for the out-of-place signature. # i.e mul() for mul_() or mul_out() - signature = get_decl_signature(declaration, use_base_variant=True) - return info_by_signature.get(signature) - - for declaration in declarations: - info = find_info(declaration) - declaration['derivative'] = info if info and info.args_with_derivatives else None - - # Currently, the '.strides()' to 'strides_or_error' replacement does not support - # 'self' derivatives of an inplace function, so we must check for this case. 
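The fallback in find_info relies on FunctionSchema.signature() erasing exactly the things that distinguish an in-place or out= overload from its functional form (the name suffix, alias annotations, out arguments), so the two compare equal. A small sketch under that assumption, using illustrative mul schemas:

```python
from tools.codegen.model import FunctionSchema

mul = FunctionSchema.parse('mul.Tensor(Tensor self, Tensor other) -> Tensor')
mul_ = FunctionSchema.parse('mul_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)')

assert mul != mul_                          # the exact-schema lookup misses for mul_
assert mul.signature() == mul_.signature()  # ...but the functional fallback hits,
# so mul_ reuses the derivative formula written for mul in derivatives.yaml.
```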
- if declaration['inplace'] and (info is not None): - for derivative in info.derivatives: - if 'self' in derivative.var_names: - for saved_input in derivative.saved_inputs: - assert 'strides_or_error' not in saved_input.expr, ( - "Calling '.strides()' in the 'self' derivative formula of an " - f"in-place function is not supported: {declaration['name']}") - - declaration['non_differentiable_arg_names'] = info.non_differentiable_arg_names if info else [] - declaration['output_differentiability'] = info.output_differentiability if info else None + return functional_info_by_signature.get(f.func.signature(strip_default=True)), False + + result: List[NativeFunctionWithDifferentiabilityInfo] = [] + for f in native_functions: + info, is_exact_match = find_info(f) + result.append(NativeFunctionWithDifferentiabilityInfo( + func=f, + info=info, + )) + + return result diff --git a/tools/autograd/load_derivatives.py b/tools/autograd/load_derivatives.py index bc2de6bb14d7..d5c742bb6fa5 100644 --- a/tools/autograd/load_derivatives.py +++ b/tools/autograd/load_derivatives.py @@ -62,7 +62,7 @@ def load_derivatives(derivatives_yaml_path: str, native_yaml_path: str) -> Seque @with_native_function def cpp_arguments(f: NativeFunction) -> Sequence[Binding]: - return CppSignatureGroup.from_schema(f.func, method=False).signature.arguments() + return CppSignatureGroup.from_native_function(f, method=False).signature.arguments() def create_derivative(f: NativeFunction, formula: str, var_names: Tuple[str, ...]) -> Derivative: arguments = cpp_arguments(f) diff --git a/tools/autograd/templates/Functions.h b/tools/autograd/templates/Functions.h index 03240e2a5a2b..0540bb65b33b 100644 --- a/tools/autograd/templates/Functions.h +++ b/tools/autograd/templates/Functions.h @@ -32,6 +32,15 @@ inline std::vector unpack_list(at::ArrayRef xs) { }); } +inline c10::List> unpack_opt_list(at::ArrayRef xs) { + torch::List> result; + result.reserve(xs.size()); + for (const SavedVariable& v : xs) { + result.push_back(v.unpack()); + } + return result; +} + struct TypeAndSize { TypeAndSize() : options(at::TensorOptions()) {} /* implicit */ diff --git a/tools/autograd/templates/VariableType.h b/tools/autograd/templates/VariableType.h index 9062a4d08e34..fc8ffa5799c1 100644 --- a/tools/autograd/templates/VariableType.h +++ b/tools/autograd/templates/VariableType.h @@ -49,7 +49,6 @@ namespace VariableType { at::Tensor & unpack(Tensor & t, const char * name, int pos); const at::Tensor & unpack(const Tensor & t, const char * name, int pos); at::Tensor unpack_opt(const Tensor & t, const char * name, int pos); - c10::optional unpack_opt(const c10::optional & t, const char * name, int pos); std::vector unpack(at::TensorList tl, const char *name, int pos); }; diff --git a/tools/autograd/templates/python_fft_functions.cpp b/tools/autograd/templates/python_fft_functions.cpp index 49be92d30d35..a77547a6cc07 100644 --- a/tools/autograd/templates/python_fft_functions.cpp +++ b/tools/autograd/templates/python_fft_functions.cpp @@ -8,6 +8,7 @@ #include "torch/csrc/autograd/utils/wrap_outputs.h" #include "torch/csrc/autograd/utils/python_arg_parsing.h" #include "torch/csrc/autograd/generated/variable_factories.h" +#include "torch/csrc/utils/out_types.h" #include "torch/csrc/utils/pycfunction_helpers.h" #include "torch/csrc/utils/python_arg_parser.h" #include "torch/csrc/utils/structseq.h" @@ -30,6 +31,7 @@ using at::TensorList; using at::Dimname; using at::DimnameList; +using torch::utils::check_out_type_matches; using namespace torch::autograd::utils; 
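check_out_type_matches, now pulled out into torch/csrc/utils/out_types.* and included both here and in python_torch_functions.cpp below, is the guard behind the usual out= mismatch errors. Seen from Python it behaves roughly like this (the exact message comes from the C++ helper):

```python
import torch

out = torch.empty(4, dtype=torch.float64)
try:
    # An explicit dtype that disagrees with out.dtype trips the check.
    torch.arange(4, dtype=torch.float32, out=out)
except RuntimeError as e:
    print(e)  # e.g. "dtype ... does not match dtype of out parameter (...)"
```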
namespace torch { namespace autograd { diff --git a/tools/autograd/templates/python_torch_functions.cpp b/tools/autograd/templates/python_torch_functions.cpp index e05e6fbe1975..c42a869b3a98 100644 --- a/tools/autograd/templates/python_torch_functions.cpp +++ b/tools/autograd/templates/python_torch_functions.cpp @@ -19,6 +19,7 @@ #include "torch/csrc/Dtype.h" #include "torch/csrc/DynamicTypes.h" #include "torch/csrc/Exceptions.h" +#include "torch/csrc/utils/out_types.h" #include "torch/csrc/utils/pybind.h" #include "torch/csrc/utils/pycfunction_helpers.h" #include "torch/csrc/utils/python_arg_parser.h" @@ -53,43 +54,13 @@ using at::Dimname; using at::DimnameList; using at::ArrayRef; +using torch::utils::check_out_type_matches; using namespace torch::autograd::utils; namespace torch { namespace autograd { static PyObject* THPVariableFunctionsModule = NULL; -static void check_out_type_matches(Tensor result, - ScalarType scalarType, bool scalarType_is_none, - c10::optional layout, - const Device& device, bool device_is_none) { - if (scalarType_is_none && !layout && device_is_none) { // common case - return; - } - if (!scalarType_is_none && result.scalar_type() != scalarType) { - AT_ERROR( - "dtype ", scalarType, - " does not match dtype of out parameter (", result.scalar_type(), ")"); - } - auto scalarType_arg = scalarType_is_none ? result.scalar_type() : scalarType; - auto device_type_arg = device_is_none ? result.device().type() : device.type(); - if (result.scalar_type() != scalarType_arg) { - AT_ERROR( - "scalar type ", scalarType_arg, - " does not match scalar type of out parameter (", result.scalar_type(), ")"); - } - if (layout && result.layout() != *layout) { - AT_ERROR( - "layout ", *layout, - " does not match layout of out parameter (", result.layout(), ")"); - } - if (result.device().type() != device_type_arg) { - AT_ERROR( - "device type ", device_type_arg, - " does not match device type of out parameter (", result.device().type(), ")"); - } -} - inline Tensor dispatch_arange(Scalar end, Tensor result) { pybind11::gil_scoped_release no_gil; return at::arange_out(result, end); diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index a214684ab29c..dc05ace7c542 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -351,6 +351,7 @@ libtorch_extra_sources = libtorch_core_jit_sources + [ "torch/csrc/jit/serialization/export_module.cpp", "torch/csrc/jit/serialization/import_legacy.cpp", "torch/csrc/utils/byte_order.cpp", + "torch/csrc/utils/out_types.cpp", ] def libtorch_sources(gencode_pattern = ":generate-code[{}]"): @@ -408,6 +409,7 @@ libtorch_cuda_core_sources = [ "torch/csrc/jit/codegen/cuda/transform_rfactor.cpp", "torch/csrc/jit/codegen/cuda/type.cpp", "torch/csrc/jit/tensorexpr/cuda_codegen.cpp", + "torch/csrc/jit/runtime/register_cuda_ops.cpp", ] libtorch_cuda_sources = libtorch_cuda_core_sources + [ @@ -503,7 +505,6 @@ libtorch_python_core_sources = [ "torch/csrc/MemoryFormat.cpp", "torch/csrc/QScheme.cpp", "torch/csrc/Module.cpp", - "torch/csrc/PtrWrapper.cpp", "torch/csrc/python_dimname.cpp", "torch/csrc/Size.cpp", "torch/csrc/Storage.cpp", diff --git a/tools/codegen/api/autograd.py b/tools/codegen/api/autograd.py index 58fb75bb7c07..6f58eea6d1ea 100644 --- a/tools/codegen/api/autograd.py +++ b/tools/codegen/api/autograd.py @@ -87,3 +87,36 @@ class DifferentiabilityInfo: # Raw data read from derivatives.yaml. 
output_differentiability: Optional[List[bool]] + + @property + def has_derivatives(self) -> bool: + return len(self.args_with_derivatives) > 0 + +# Represents a differentiable `Argument`. +# How is it different from the `Argument` type? +# - It's processed Arguments which are differentiable and only used in the +# context of the autograd codegen; +# - It can represent SelfArgument or regular Argument but not TensorOptionsArgument; +@dataclass(frozen=True) +class DifferentiableInput: + name: str + type: Type + + # TODO: only to keep it byte-for-byte compatible with the old codegen, should remove. + cpp_type: str + +# Represents a differentiable `Return`. +# How is it different from the `Return` type? +# - The name in `Return` is optional. Here it is always populated using the same +# `cpp.return_names()` method. +# TODO: some cpp naming logic (e.g. resolving name conflict) might be irrelevant? +# - It's processed Returns which are differentiable, in compliance with the +# `output_differentiability` field defined in derivatives.yaml (if specified), +# and are only used in the context of the autograd codegen; +@dataclass(frozen=True) +class DifferentiableOutput: + name: str + type: Type + + # TODO: only to keep it byte-for-byte compatible with the old codegen, should remove. + cpp_type: str diff --git a/tools/codegen/api/cpp.py b/tools/codegen/api/cpp.py index ffd9626601a0..8a1d2a5272f5 100644 --- a/tools/codegen/api/cpp.py +++ b/tools/codegen/api/cpp.py @@ -1,7 +1,7 @@ from tools.codegen.model import * from tools.codegen.api.types import * import tools.codegen.local as local -from typing import Optional, Sequence, Union, List +from typing import Optional, Sequence, Union, List, Set # This file describes the translation of JIT schema to the public C++ # API, which is what people use when they call functions like at::add.
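The `has_derivatives` property added above is what the updated `dispatch_strategy` check (`fn.func.is_abstract or (fn.info is not None and fn.info.has_derivatives)`) consumes. A rough, self-contained sketch of that pairing, with heavily stripped-down stand-ins for the real dataclasses (only the fields needed for the check are kept, so treat the class shapes as illustrative):

```python
from dataclasses import dataclass, field
from typing import List, Optional

@dataclass(frozen=True)
class ToyDifferentiabilityInfo:
    # derivatives exist iff at least one argument has a formula
    args_with_derivatives: List[str] = field(default_factory=list)

    @property
    def has_derivatives(self) -> bool:
        return len(self.args_with_derivatives) > 0

@dataclass(frozen=True)
class ToyFunctionWithInfo:
    is_abstract: bool
    info: Optional[ToyDifferentiabilityInfo]

def dispatch_strategy(fn: ToyFunctionWithInfo) -> str:
    # same shape as the decision quoted earlier in this patch
    if fn.is_abstract or (fn.info is not None and fn.info.has_derivatives):
        return "use_derived"
    return "use_type"

print(dispatch_strategy(ToyFunctionWithInfo(False, ToyDifferentiabilityInfo(["self"]))))  # use_derived
print(dispatch_strategy(ToyFunctionWithInfo(False, None)))                                # use_type
```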
@@ -104,9 +104,11 @@ def argumenttype_type(t: Type, *, mutable: bool, binds: ArgName) -> CType: return BaseCType("TensorList", binds) elif str(t.elem) == 'Dimname': return BaseCType("DimnameList", binds) - # TODO: do something reasonable about lists of optional tensors - elif (not local.use_c10_dispatcher().dispatcher_uses_new_style()) and str(t.elem) == 'Tensor?': - return BaseCType("TensorList", binds) + elif str(t.elem) == 'Tensor?': + if local.use_c10_dispatcher().dispatcher_uses_new_style(): + return ConstRefCType(BaseCType("c10::List>", binds)) + else: + return BaseCType("TensorList", binds) elem = argumenttype_type(t.elem, mutable=mutable, binds=binds) # TODO: explicitly qualify namespace here return BaseCType(f"ArrayRef<{elem.cpp_type()}>", binds) @@ -237,26 +239,37 @@ def default_expr(d: str, t: Type) -> str: def argument( a: Union[Argument, TensorOptionsArguments, SelfArgument], - *, method: bool = False, faithful: bool = False, - has_tensor_options: bool = False + *, cpp_no_default_args: Set[str], method: bool, faithful: bool, + has_tensor_options: bool ) -> List[Binding]: + def sub_argument(a: Union[Argument, TensorOptionsArguments, SelfArgument]) -> List[Binding]: + return argument( + a, cpp_no_default_args=cpp_no_default_args, method=method, faithful=faithful, + has_tensor_options=has_tensor_options) + if isinstance(a, Argument): binds: ArgName if a.name == "memory_format" and has_tensor_options: binds = SpecialArgName.possibly_redundant_memory_format else: binds = a.name + default: Optional[str] = None + if a.name not in cpp_no_default_args and a.default is not None: + default = default_expr(a.default, a.type) return [Binding( ctype=argument_type(a, binds=binds), name=a.name, - default=default_expr(a.default, a.type) if a.default is not None else None, + default=default, argument=a, )] elif isinstance(a, TensorOptionsArguments): if faithful: - return argument(a.dtype) + argument(a.layout) + argument(a.device) + argument(a.pin_memory) + return sub_argument(a.dtype) + sub_argument(a.layout) + \ + sub_argument(a.device) + sub_argument(a.pin_memory) else: default = None + # Enforced by NativeFunction.__post_init__ + assert 'options' not in cpp_no_default_args if all(x.default == "None" for x in a.all()): default = '{}' elif a.dtype.default == "long": @@ -272,13 +285,13 @@ def argument( # Caller is responsible for installing implicit this in context! 
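The `cpp_no_default_args` plumbing above reduces to a single rule: a C++ default is emitted only when the argument has a default in the schema and is not listed in the exclusion set. A toy rendering of that rule (the `Binding`/`Argument` machinery is collapsed into plain strings here):

```python
from typing import Optional, Set

def render_cpp_param(name: str, cpp_type: str,
                     schema_default: Optional[str],
                     cpp_no_default_args: Set[str]) -> str:
    # mirrors the `if a.name not in cpp_no_default_args and a.default is not None` check
    default: Optional[str] = None
    if name not in cpp_no_default_args and schema_default is not None:
        default = schema_default
    return f"{cpp_type} {name}" + (f"={default}" if default is not None else "")

# Normal case: the schema default survives into the C++ declaration.
print(render_cpp_param("alpha", "const Scalar &", "1", set()))
# const Scalar & alpha=1

# Listed in cpp_no_default_args: the default is dropped to avoid overload ambiguity.
print(render_cpp_param("alpha", "const Scalar &", "1", {"alpha"}))
# const Scalar & alpha
```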
return [] else: - return argument(a.argument) + return sub_argument(a.argument) else: assert_never(a) def arguments( arguments: Arguments, - *, faithful: bool, method: bool + *, faithful: bool, method: bool, cpp_no_default_args: Set[str] ) -> List[Binding]: args: List[Union[Argument, TensorOptionsArguments, SelfArgument]] = [] if faithful: @@ -289,5 +302,8 @@ def arguments( args.extend(arguments.non_out) return [ r.no_default() if faithful else r for a in args - for r in argument(a, faithful=faithful, method=method, has_tensor_options=arguments.tensor_options is not None) + for r in argument( + a, faithful=faithful, method=method, + has_tensor_options=arguments.tensor_options is not None, + cpp_no_default_args=cpp_no_default_args) ] diff --git a/tools/codegen/api/native.py b/tools/codegen/api/native.py index 3b793527edd9..936500b560db 100644 --- a/tools/codegen/api/native.py +++ b/tools/codegen/api/native.py @@ -4,7 +4,7 @@ import tools.codegen.api.cpp as cpp from tools.codegen import local -from typing import Union, Sequence, List +from typing import Union, Sequence, List, Optional # This file describes the translation of JIT schema to the native functions API. # This looks a lot like the C++ API (which makes historical sense, because the @@ -34,7 +34,7 @@ def argumenttype_type(t: Type, *, mutable: bool, binds: ArgName) -> CType: else: return ConstRefCType(BaseCType('Tensor', binds)) elif str(t) == 'Tensor?[]': - return BaseCType('TensorList', binds) + return BaseCType('const c10::List> &', binds) return cpp.argumenttype_type(t, mutable=mutable, binds=binds) def returns_type(rs: Sequence[Return]) -> str: @@ -43,26 +43,36 @@ def returns_type(rs: Sequence[Return]) -> str: def argument_type(a: Argument, *, binds: ArgName) -> CType: return argumenttype_type(a.type, mutable=a.is_write, binds=binds) -def argument(a: Union[Argument, SelfArgument, TensorOptionsArguments]) -> List[Binding]: +def argument(a: Union[Argument, SelfArgument, TensorOptionsArguments], *, is_out: bool) -> List[Binding]: + # Ideally, we NEVER default native functions. However, there are a number + # of functions that call native:: directly and rely on the defaulting + # existing. 
So for BC, we generate defaults for non-out variants (but not + # for out variants, where it is impossible to generate an appropriate + # default) + should_default = not is_out or local.use_c10_dispatcher() is not UseC10Dispatcher.full if isinstance(a, Argument): + default: Optional[str] = None + if should_default and a.default is not None: + default = cpp.default_expr(a.default, a.type) return [Binding( ctype=argument_type(a, binds=a.name), name=a.name, - default=cpp.default_expr(a.default, a.type) if a.default is not None else None, + default=default, argument=a, )] elif isinstance(a, SelfArgument): # Erase SelfArgument from the distinction - return argument(a.argument) + return argument(a.argument, is_out=is_out) elif isinstance(a, TensorOptionsArguments): if local.use_c10_dispatcher() in [UseC10Dispatcher.hacky_wrapper_for_legacy_signatures, UseC10Dispatcher.with_codegenerated_unboxing_wrapper]: # TODO: expunge this logic entirely default = None - if all(x.default == "None" for x in a.all()): - default = '{}' - elif a.dtype.default == "long": - default = 'at::kLong' # TODO: this is wrong + if should_default: + if all(x.default == "None" for x in a.all()): + default = '{}' + elif a.dtype.default == "long": + default = 'at::kLong' # TODO: this is wrong return [Binding( ctype=ConstRefCType(BaseCType('TensorOptions', 'options')), name='options', @@ -71,29 +81,35 @@ def argument(a: Union[Argument, SelfArgument, TensorOptionsArguments]) -> List[B )] else: assert local.use_c10_dispatcher() == UseC10Dispatcher.full + default = None + if should_default: + default = '{}' + # TODO: Not sure why the arguments assigned here are for + # TensorOptionsArguments and not the constituent pieces. It seems + # to matter return [ Binding( ctype=OptionalCType(BaseCType('ScalarType', 'dtype')), name='dtype', - default='{}', + default=default, argument=a, ), Binding( ctype=OptionalCType(BaseCType('Layout', 'layout')), name='layout', - default='{}', + default=default, argument=a, ), Binding( ctype=OptionalCType(BaseCType('Device', 'device')), name='device', - default='{}', + default=default, argument=a, ), Binding( ctype=OptionalCType(BaseCType('bool', 'pin_memory')), name='pin_memory', - default='{}', + default=default, argument=a, )] else: @@ -107,4 +123,4 @@ def arguments(func: FunctionSchema) -> List[Binding]: else: args.extend(func.arguments.out) args.extend(func.arguments.non_out) - return [r for arg in args for r in argument(arg)] + return [r for arg in args for r in argument(arg, is_out=func.is_out_fn())] diff --git a/tools/codegen/api/python.py b/tools/codegen/api/python.py index 059032869675..bc5cbb440b98 100644 --- a/tools/codegen/api/python.py +++ b/tools/codegen/api/python.py @@ -228,7 +228,7 @@ class PythonArgument: # Compute argument formal for python argument parsing. # Needs to be consistent with torch/csrc/utils/python_arg_parser.h. 
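The native-functions defaulting change above boils down to: keep defaults on non-out variants for backward compatibility, but never default an out variant under `use_c10_dispatcher: full`, because the out argument follows the defaulted ones in the argument list. A condensed sketch of that predicate (dispatcher modes reduced to plain strings for brevity):

```python
from typing import Optional

def native_default(schema_default: Optional[str], *,
                   is_out: bool, dispatcher: str) -> Optional[str]:
    # mirrors `should_default = not is_out or use_c10_dispatcher is not full`
    should_default = (not is_out) or dispatcher != "full"
    return schema_default if (should_default and schema_default is not None) else None

print(native_default("1", is_out=False, dispatcher="full"))  # '1'  -> kept for BC
print(native_default("1", is_out=True,  dispatcher="full"))  # None -> out variant gets no default
print(native_default("1", is_out=True,  dispatcher="hacky_wrapper_for_legacy_signatures"))  # '1'
```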
def argument_str(self, *, method: bool = False) -> str: - type_str = argument_type_str(self.type) + type_str = argument_type_str(self.type).replace('const ', '').replace(' &', '') name = self.name # s/self/input/ outside method bindings @@ -566,7 +566,7 @@ class DispatchLambdaArgumentExprs: # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # def _cpp_signature(f: NativeFunction, *, method: bool = False) -> CppSignature: - return CppSignatureGroup.from_schema(f.func, method=method).signature + return CppSignatureGroup.from_native_function(f, method=method).signature def has_tensor_options(f: NativeFunction) -> bool: return f.func.arguments.tensor_options is not None @@ -624,10 +624,9 @@ def argument_type_str(t: Type, *, simple_type: bool = False) -> str: return f'ScalarList[{size}]' if size is not None else 'ScalarList' elif str(t.elem) == 'Tensor?': if simple_type: - return 'TensorList' + return 'c10::List>' else: - # TODO: clone the old codegen behavior but does it make sense? - return 'TensorList?' + return 'const c10::List> &' elif str(t.elem) == 'Dimname': return f'DimnameList[{size}]' if size is not None else 'DimnameList' elem = argument_type_str(t.elem, simple_type=simple_type) @@ -1051,12 +1050,14 @@ def arg_parser_unpack_method(t: Type, has_default: bool) -> str: return 'toDimnameListOptional' elif isinstance(t, ListType): - if str(t.elem) == 'Tensor' or str(t.elem) == 'Tensor?': + if str(t.elem) == 'Tensor': # accept and use definite size if t.size is not None: return f'tensorlist_n<{t.size}>' else: return 'tensorlist' + elif str(t.elem) == 'Tensor?': + return 'list_of_optional_tensors' elif str(t.elem) == 'Dimname': # accept definite size return 'dimnamelist' diff --git a/tools/codegen/api/types.py b/tools/codegen/api/types.py index 5532c35b4ed2..39fb8bef3846 100644 --- a/tools/codegen/api/types.py +++ b/tools/codegen/api/types.py @@ -1,6 +1,6 @@ from tools.codegen.model import * from dataclasses import dataclass -from typing import Optional, Union, Sequence, TypeVar, List +from typing import Optional, Union, Sequence, TypeVar, List, Set from enum import Enum _T = TypeVar('_T') @@ -31,14 +31,16 @@ class BaseCType: type: str name: ArgName - def cpp_type(self) -> str: + def cpp_type(self, *, strip_ref: bool = False) -> str: return self.type @dataclass(frozen=True) class ConstRefCType: elem: 'CType' - def cpp_type(self) -> str: + def cpp_type(self, *, strip_ref: bool = False) -> str: + if strip_ref: + return self.elem.cpp_type(strip_ref=strip_ref) return f'const {self.elem.cpp_type()} &' @property @@ -49,7 +51,9 @@ def name(self) -> ArgName: class MutRefCType: elem: 'CType' - def cpp_type(self) -> str: + def cpp_type(self, *, strip_ref: bool = False) -> str: + if strip_ref: + return self.elem.cpp_type(strip_ref=strip_ref) return f'{self.elem.cpp_type()} &' @property @@ -60,7 +64,8 @@ def name(self) -> ArgName: class OptionalCType: elem: 'CType' - def cpp_type(self) -> str: + def cpp_type(self, *, strip_ref: bool = False) -> str: + # Do not pass `strip_ref` recursively. return f'c10::optional<{self.elem.cpp_type()}>' @property @@ -128,13 +133,22 @@ class CppSignature: # (i.e. with a potential TensorOptions argument and out arguments in the front) faithful: bool + # The set of C++ arguments which should not have defaults applied to them + cpp_no_default_args: Set[str] + + # Is this a fallback C++ binding? 
Fallback bindings are enabled by + # manual_cpp_binding: True and are alternate, non-public API that + # lets manual C++ binding implementors access the binding that would + # have been automatically generated fallback_binding: bool = False # Return the unpacked argument structure of this signature, # discarding information about which arguments are semantically # related to each other. def arguments(self) -> Sequence[Binding]: - return cpp.arguments(self.func.arguments, faithful=self.faithful, method=self.method) + return cpp.arguments( + self.func.arguments, faithful=self.faithful, + method=self.method, cpp_no_default_args=self.cpp_no_default_args) def name(self) -> str: n = cpp.name(self.func, faithful_name_for_out_overloads=self.faithful) @@ -168,13 +182,26 @@ class CppSignatureGroup: faithful_signature: Optional[CppSignature] @staticmethod - def from_schema(func: FunctionSchema, *, method: bool, fallback_binding: bool = False) -> 'CppSignatureGroup': + def from_native_function(f: NativeFunction, *, method: bool, fallback_binding: bool = False) -> 'CppSignatureGroup': + func = f.func faithful_signature: Optional[CppSignature] if func.arguments.tensor_options is not None or len(func.arguments.out) > 0: - faithful_signature = CppSignature(func=func, faithful=True, method=method, fallback_binding=fallback_binding) + faithful_signature = CppSignature( + func=func, + faithful=True, + method=method, + fallback_binding=fallback_binding, + cpp_no_default_args=f.cpp_no_default_args + ) else: faithful_signature = None - signature = CppSignature(func=func, faithful=False, method=method, fallback_binding=fallback_binding) + signature = CppSignature( + func=func, + faithful=False, + method=method, + fallback_binding=fallback_binding, + cpp_no_default_args=f.cpp_no_default_args + ) return CppSignatureGroup( func=func, signature=signature, diff --git a/tools/codegen/gen.py b/tools/codegen/gen.py index 37f4ea7cc174..8f521e6651bc 100644 --- a/tools/codegen/gen.py +++ b/tools/codegen/gen.py @@ -203,8 +203,7 @@ class RegisterSchema: @method_with_native_function def __call__(self, f: NativeFunction) -> Optional[str]: - op_name = f"aten::{f.func.name}" - if not self.selector.is_operator_selected(op_name): + if not self.selector.is_native_function_selected(f): return None return f'm.def({cpp_string(str(f.func))});\n' @@ -388,6 +387,7 @@ def gen_structured(self, g: StructuredNativeFunctions) -> List[str]: @with_native_function def gen_one(f: NativeFunction) -> Optional[str]: assert self.target is not Target.DECLARATION + assert not f.manual_kernel_registration # TODO: put this into StructuredNativeFunctions itself functional_func = g.out.func.signature() @@ -398,8 +398,7 @@ def gen_one(f: NativeFunction) -> Optional[str]: e.expr for e in translate(functional_sig.arguments(), dispatcher.arguments(functional_func), method=False) ) - op_name = f"aten::{f.func.name}" - if self.target is Target.REGISTRATION and not self.selector.is_operator_selected(op_name): + if self.target is Target.REGISTRATION and not self.selector.is_native_function_selected(f): return None k = f.func.kind() @@ -431,7 +430,7 @@ def gen_one(f: NativeFunction) -> Optional[str]: if self.dispatch_key == 'Meta': impl_call = "" else: - impl_call = f"op.impl({out_expr}, {functional_exprs});" + impl_call = f"op.impl({functional_exprs}, {out_expr});" # For an overview of what this template code looks like, see # https://github.com/pytorch/rfcs/pull/9 @@ -454,19 +453,8 @@ def gen_one(f: NativeFunction) -> Optional[str]: elif self.target is 
Target.REGISTRATION: dispatcher_sig = DispatcherSignature.from_schema(f.func) - if local.use_c10_dispatcher() is UseC10Dispatcher.full: - payload = f"TORCH_FN({sig.name()})" - elif local.use_c10_dispatcher() is UseC10Dispatcher.hacky_wrapper_for_legacy_signatures: - payload = f""" -c10::impl::hacky_wrapper_for_legacy_signatures< - {dispatcher_sig.type()}, - {len(f.func.arguments.out)} ->(TORCH_FN({sig.name()})) -""" - else: - assert local.use_c10_dispatcher() is UseC10Dispatcher.with_codegenerated_unboxing_wrapper - payload = f"torch::CppFunction::makeUnboxedOnly(&{sig.name()})" - return f'm.impl("{f.func.name}", {payload});' + assert local.use_c10_dispatcher() is UseC10Dispatcher.full + return f'm.impl("{f.func.name}", TORCH_FN({sig.name()}));' else: assert_never(self.target) # Silence mypy's "Missing return statement" error @@ -487,9 +475,10 @@ def gen_unstructured(self, f: NativeFunction) -> Optional[str]: if self.dispatch_key not in f.dispatch: return None + if f.manual_kernel_registration: + return None - op_name = f"aten::{f.func.name}" - if self.target is Target.REGISTRATION and not self.selector.is_operator_selected(op_name): + if self.target is Target.REGISTRATION and not self.selector.is_native_function_selected(f): return None name = native.name(f.func) @@ -589,14 +578,12 @@ class ComputeFunction: @method_with_native_function def __call__(self, f: NativeFunction) -> Optional[str]: - if f.manual_kernel_registration: - return None if Variant.function not in f.variants: return None name = cpp.name(f.func) - sig_group = CppSignatureGroup.from_schema(f.func, method=False, fallback_binding=f.manual_cpp_binding) + sig_group = CppSignatureGroup.from_native_function(f, method=False, fallback_binding=f.manual_cpp_binding) if self.target is Target.DECLARATION: result = f"TORCH_API {sig_group.signature.decl()};\n" @@ -650,7 +637,7 @@ def __call__(self, f: NativeFunction) -> Optional[str]: name = cpp.name(f.func) - sig_group = CppSignatureGroup.from_schema(f.func, method=True, fallback_binding=f.manual_cpp_binding) + sig_group = CppSignatureGroup.from_native_function(f, method=True, fallback_binding=f.manual_cpp_binding) if self.target is Target.DECLARATION: result = f"{sig_group.signature.decl()} const;\n" @@ -729,17 +716,7 @@ def compute_native_function_declaration(g: Union[StructuredNativeFunctions, Nati if is_structured_dispatch_key(k): continue seen.add(n) - if f.func.is_out_fn() and local.use_c10_dispatcher() is UseC10Dispatcher.full: - # out overloads don't get default arguments because - # defaulted arguments would be before the out argument - # in the argument list and that doesn't work. 
- # TODO We should consider if we just want to remove - # default arguments from all at::native functions - # but that would be a larger change because we need - # to change a lot of call sites - args_str = ', '.join(a.defn() for a in args) - else: - args_str = ', '.join(a.decl() for a in args) + args_str = ', '.join(a.decl() for a in args) rs.append(f"TORCH_API {returns_type} {n}({args_str});") return rs @@ -769,7 +746,7 @@ def compute_meta_function_declaration(g: StructuredNativeFunctions) -> str: sig = g.signature() name = meta.name(g) args = native.arguments(sig) - args_str = ', '.join(a.defn() for a in args) + args_str = ', '.join(a.decl() for a in args) parent_class = g.out.structured_inherits if parent_class is None: parent_class = "at::impl::MetaBase" @@ -1032,7 +1009,7 @@ def compute_declaration_yaml(f: NativeFunction) -> object: kwarg_only_set = set(a.name for a in f.func.arguments.flat_kwarg_only) out_arg_set = set(a.name for a in f.func.arguments.out) - sig_group = CppSignatureGroup.from_schema(f.func, method=False, fallback_binding=False) + sig_group = CppSignatureGroup.from_native_function(f, method=False, fallback_binding=False) cpp_args = sig_group.signature.arguments() arguments = [ compute_cpp_argument_yaml( @@ -1052,7 +1029,9 @@ def compute_declaration_yaml(f: NativeFunction) -> object: cpp_schema_order_types = [ # NB: method here doesn't matter - r.type for a in schema_order_jit_arguments for r in cpp.argument(a, method=False) + r.type for a in schema_order_jit_arguments + for r in cpp.argument( + a, method=False, cpp_no_default_args=set(), faithful=False, has_tensor_options=False) ] cpp_returns = cpp.returns_type(f.func.returns) @@ -1091,7 +1070,7 @@ def compute_registration_declarations(f: NativeFunction) -> str: name = dispatcher.name(f.func) returns_type = dispatcher.returns_type(f.func.returns) args = dispatcher.arguments(f.func) - args_str = ', '.join(a.defn() for a in args) + args_str = ', '.join(a.no_default().decl() for a in args) comment_data : Dict[str, str] = { 'schema': f'aten::{f.func}', # TODO: What exactly is the semantics of the 'dispatch' field? diff --git a/tools/codegen/model.py b/tools/codegen/model.py index a007e1a76f7c..9c8a0d73e815 100644 --- a/tools/codegen/model.py +++ b/tools/codegen/model.py @@ -137,6 +137,10 @@ class NativeFunction: # changes the semantics of set_output to call the parent class. structured_inherits: Optional[str] + # Argument names whose default should be excluded from the C++ interface. + # Intended for resolving overload ambiguities between signatures. 
+ cpp_no_default_args: Set[str] + # Note [Abstract ATen methods] # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # An abstract ATen method is one whose dispatch differs between @@ -169,9 +173,13 @@ def from_yaml(ei: Dict[str, object], loc: 'Location') -> 'NativeFunction': assert isinstance(funcs, str), f'not a str: {funcs}' func = FunctionSchema.parse(funcs) + cpp_no_default_args_list = e.pop('cpp_no_default_args', []) + assert isinstance(cpp_no_default_args_list, list) + cpp_no_default_args = set(cpp_no_default_args_list) + use_c10_dispatcher_s = e.pop('use_c10_dispatcher', None) if use_c10_dispatcher_s is None: - use_c10_dispatcher = UseC10Dispatcher.with_codegenerated_unboxing_wrapper + use_c10_dispatcher = UseC10Dispatcher.full elif use_c10_dispatcher_s == 'full': use_c10_dispatcher = UseC10Dispatcher.full elif use_c10_dispatcher_s == 'hacky_wrapper_for_legacy_signatures': @@ -222,6 +230,9 @@ def from_yaml(ei: Dict[str, object], loc: 'Location') -> 'NativeFunction': assert raw_dispatch is None or isinstance(raw_dispatch, dict), e dispatch: Dict[str, str] = {} if raw_dispatch is not None: + assert not manual_kernel_registration, \ + "cannot specify both manual_kernel_registration and dispatch; with " \ + "manual registration, dispatch has no effect!" for ks, v in raw_dispatch.items(): if ks == '__line__': continue # not worth tracking line numbers for dispatch entries @@ -255,6 +266,7 @@ def from_yaml(ei: Dict[str, object], loc: 'Location') -> 'NativeFunction': dispatch=dispatch, device_guard=device_guard, loc=loc, + cpp_no_default_args=cpp_no_default_args, ) def validate_unstructured(self) -> None: @@ -290,6 +302,13 @@ def __post_init__(self) -> None: # happen assert not (self.structured and self.structured_delegate), \ "Cannot have both structured and structured_delegate on function" + defaulted_arguments = {a.name for a in self.func.schema_order_arguments() + if a.default is not None} + invalid_args = set.difference(self.cpp_no_default_args, defaulted_arguments) + assert len(invalid_args) == 0, f'Invalid cpp_no_default_args: {invalid_args}' + if self.structured or self.structured_delegate: + assert self.use_c10_dispatcher is UseC10Dispatcher.full, \ + "Structured kernels MUST be use_c10_dispatcher: full; port your argument order" SchemaKind = Enum('SchemaKind', ('functional', 'inplace', 'out')) @@ -548,7 +567,7 @@ def kind(self) -> SchemaKind: else: return SchemaKind.functional - def signature(self) -> 'FunctionSchema': + def signature(self, *, strip_default: bool = False) -> 'FunctionSchema': """ Certain schemas are 'related', in that they are simply inplace/out/functional versions of the same function. 
This method @@ -563,11 +582,13 @@ def signature(self) -> 'FunctionSchema': - Out arguments are stripped - Mutability annotations are stripped (this is sound because you cannot overload on mutability annotation) + - Return names are stripped since they are not overloadable and + some variants have return names but some not """ def strip_ret_annotation(r: Return) -> Return: return Return( - name=r.name, + name=None, type=r.type, annotation=None, ) @@ -581,7 +602,7 @@ def strip_ret_annotation(r: Return) -> Return: ), overload_name="", # stripped ), - arguments=self.arguments.signature(), + arguments=self.arguments.signature(strip_default=strip_default), returns=tuple(map(strip_ret_annotation, self.returns)), ) @@ -964,14 +985,14 @@ def kwarg_only(self) -> Sequence[Union[Argument, TensorOptionsArguments]]: ret.extend(self.post_tensor_options_kwarg_only) return ret - def signature(self) -> 'Arguments': + def signature(self, *, strip_default: bool = False) -> 'Arguments': # dataclasses.replace could be used here, but it is less # type safe so for now I've opted to type everything out def strip_arg_annotation(a: Argument) -> Argument: return Argument( name=a.name, type=a.type, - default=a.default, # hmmm + default=a.default if not strip_default else None, annotation=None, ) diff --git a/tools/codegen/selective_build/selector.py b/tools/codegen/selective_build/selector.py index 24e387128b6c..3e80e168d31c 100644 --- a/tools/codegen/selective_build/selector.py +++ b/tools/codegen/selective_build/selector.py @@ -3,6 +3,7 @@ from dataclasses import dataclass +from tools.codegen.model import NativeFunction from tools.codegen.selective_build.operator import * # A SelectiveBuilder holds information extracted from the selective build @@ -96,6 +97,10 @@ def is_operator_selected(self, name: str) -> bool: name = strip_operator_overload_name(name) return name in self.operators and self.operators[name].include_all_overloads + def is_native_function_selected(self, func: NativeFunction) -> bool: + op_name = op_name_from_native_function(func) + return self.is_operator_selected(op_name) + def is_operator_selected_for_training(self, name: str) -> bool: if not self.is_operator_selected(name): return False @@ -123,6 +128,10 @@ def is_operator_selected_for_training(self, name: str) -> bool: (base_op.include_all_overloads and base_op.is_used_for_training) ) + def is_native_function_selected_for_training(self, func: NativeFunction) -> bool: + op_name = op_name_from_native_function(func) + return self.is_operator_selected_for_training(op_name) + def is_root_operator(self, name: str) -> bool: if not self.is_operator_selected(name): return False @@ -158,3 +167,9 @@ def combine_selective_builders(lhs: SelectiveBuilder, rhs: SelectiveBuilder) -> debug_info = merge_debug_info(lhs._debug_info, rhs._debug_info) operators = merge_operator_dicts(lhs.operators, rhs.operators) return SelectiveBuilder(include_all_operators, debug_info, operators) + + +def op_name_from_native_function(f: NativeFunction) -> str: + # This was originally read from the 'operator_name_with_overload' field in the + # declaration dict, which was the part before the first '(' in 'schema_string'. 
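To make the `signature()` changes earlier in this hunk concrete — return names are now dropped and, with `strip_default=True`, argument defaults as well, so that related variants collapse to the same key — here is a very rough string-level illustration. The real code operates on parsed `Return`/`Argument` objects, not regexes, so this is only a demo of the normalization being applied:

```python
import re

def strip_return_names(returns: str) -> str:
    # "(Tensor values, Tensor indices)" -> "(Tensor, Tensor)"
    return re.sub(r"(Tensor|int|bool)\s+\w+", r"\1", returns)

def strip_defaults(args: str) -> str:
    # "Tensor self, Scalar alpha=1" -> "Tensor self, Scalar alpha"
    return re.sub(r"=[^,)]+", "", args)

print(strip_return_names("(Tensor values, Tensor indices)"))  # (Tensor, Tensor)
print(strip_defaults("Tensor self, Scalar alpha=1"))          # Tensor self, Scalar alpha
```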
+ return f'aten::{f.func.name}' diff --git a/tools/jit/gen_unboxing_wrappers.py b/tools/jit/gen_unboxing_wrappers.py index 267b5a3b221a..a52c109c603f 100644 --- a/tools/jit/gen_unboxing_wrappers.py +++ b/tools/jit/gen_unboxing_wrappers.py @@ -22,9 +22,10 @@ import re from itertools import groupby from functools import reduce -from ..autograd.gen_autograd import load_aten_declarations +import yaml + from ..autograd.gen_autograd import RETURNS_VIEWS_OF_INPUT -from ..autograd.utils import CodeTemplate, write, is_out_variant, op_name_with_overload +from ..autograd.utils import CodeTemplate, YamlLoader, write, is_out_variant, op_name_with_overload from tools.codegen.selective_build.selector import SelectiveBuilder # JIT has a type system of @@ -279,6 +280,66 @@ def argument_order(decl): return decl.get('jit_argument_order') or list(range(len(decl['arguments']))) +def format_return_type(returns): + if len(returns) == 0: + return 'void' + elif len(returns) == 1: + return returns[0]['type'] + else: + return_types = [r['type'] for r in returns] + return 'std::tuple<{}>'.format(','.join(return_types)) + + +def get_simple_type(arg): + simple_type = arg['type'] + simple_type = simple_type.replace(' &', '').replace('const ', '') + simple_type = simple_type.replace('Generator *', 'Generator') + + opt_match = re.match(r'c10::optional<(.+)>', simple_type) + if opt_match: + simple_type = '{}?'.format(opt_match.group(1)) + return simple_type + + +def load_aten_declarations(path): + with open(path, 'r') as f: + declarations = yaml.load(f, Loader=YamlLoader) + + # enrich declarations with additional information + selected_declarations = [] + for declaration in declarations: + if declaration.get('deprecated'): + continue + + for arg in declaration['arguments']: + arg['simple_type'] = get_simple_type(arg) + for arg in declaration['schema_order_arguments']: + arg['simple_type'] = get_simple_type(arg) + for ret in declaration['returns']: + ret['simple_type'] = get_simple_type(ret) + + declaration['formals'] = [arg['type'] + ' ' + arg['name'] + for arg in declaration['arguments']] + declaration['schema_order_formals'] = [arg['type'] + ' ' + arg['name'] + for arg in declaration['schema_order_arguments']] + declaration['args'] = [arg['name'] for arg in declaration['arguments']] + declaration['schema_order_args'] = [arg['name'] for arg in declaration['schema_order_arguments']] + declaration['api_name'] = declaration['name'] + if declaration.get('overload_name'): + declaration['type_wrapper_name'] = "{}_{}".format( + declaration['name'], declaration['overload_name']) + else: + declaration['type_wrapper_name'] = declaration['name'] + declaration['operator_name_with_overload'] = declaration['schema_string'].split('(')[0] + declaration['unqual_operator_name_with_overload'] = declaration['operator_name_with_overload'].split('::')[1] + declaration['return_type'] = format_return_type(declaration['returns']) + + declaration['base_name'] = declaration['name'] + selected_declarations.append(declaration) + + return selected_declarations + + def gen_unboxing_wrappers( declarations, out, diff --git a/tools/setup_helpers/cmake.py b/tools/setup_helpers/cmake.py index 46b3befde9f4..f1809552cd40 100644 --- a/tools/setup_helpers/cmake.py +++ b/tools/setup_helpers/cmake.py @@ -7,7 +7,6 @@ import re from subprocess import check_call, check_output import sys -import distutils import distutils.sysconfig from distutils.version import LooseVersion diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in index 
41916b1fb77a..dd877da38106 100644 --- a/torch/_C/__init__.pyi.in +++ b/torch/_C/__init__.pyi.in @@ -345,6 +345,10 @@ def _propagate_and_assign_input_shapes( propagate: _bool ) -> Graph: ... +# Defined in torch/csrc/jit/runtime/graph_executor.h +class GraphExecutorState: + ... + # Defined in torch/torch/csrc/jit/ir/ir.h class Graph: def eraseInput(self, i: _int) -> None: ... diff --git a/torch/_C/_autograd.pyi b/torch/_C/_autograd.pyi index cfcb66896ad7..15a286f2370c 100644 --- a/torch/_C/_autograd.pyi +++ b/torch/_C/_autograd.pyi @@ -25,7 +25,8 @@ class ProfilerConfig: state: ProfilerState, report_input_shapes: bool, profile_memory: bool, - with_stack: bool + with_stack: bool, + with_flops: bool ) -> None: ... ... diff --git a/torch/_C/_distributed_c10d.pyi b/torch/_C/_distributed_c10d.pyi index cd9a0f7d46a9..5ac2c0a8315d 100644 --- a/torch/_C/_distributed_c10d.pyi +++ b/torch/_C/_distributed_c10d.pyi @@ -77,6 +77,7 @@ class ReduceScatterOptions: timeout: timedelta class BarrierOptions: + device_ids: List[int] timeout: timedelta class AllToAllOptions: diff --git a/torch/__init__.py b/torch/__init__.py index 04955623ab2a..9ae1010a3ba8 100644 --- a/torch/__init__.py +++ b/torch/__init__.py @@ -574,6 +574,7 @@ def _assert(condition, message): import torch.futures import torch.nn import torch.nn.intrinsic +import torch.nn.quantizable import torch.nn.quantized import torch.optim import torch.optim._multi_tensor diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index 9c767822b11b..4a1c36df7497 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -1026,7 +1026,6 @@ def merge_dicts(*dicts): tensor([ 0, 1, -4], dtype=torch.int8) """.format(**common_args)) -# TODO: see https://github.com/pytorch/pytorch/issues/43667 add_docstr(torch.bmm, r""" bmm(input, mat2, *, deterministic=False, out=None) -> Tensor @@ -2934,7 +2933,6 @@ def merge_dicts(*dicts): tensor([ 0., 1.]) """.format(**common_args)) -# TODO: see https://github.com/pytorch/pytorch/issues/43667 add_docstr(torch.eye, r""" eye(n, m=None, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor @@ -2944,6 +2942,8 @@ def merge_dicts(*dicts): Args: n (int): the number of rows m (int, optional): the number of columns with default being :attr:`n` + +Keyword arguments: {out} {dtype} {layout} @@ -3095,7 +3095,17 @@ def merge_dicts(*dicts): r""" flatten(input, start_dim=0, end_dim=-1) -> Tensor -Flattens a contiguous range of dims in a tensor. +Flattens :attr:`input` by reshaping it into a one-dimensional tensor. If :attr:`start_dim` or :attr:`end_dim` +are passed, only dimensions starting with :attr:`start_dim` and ending with :attr:`end_dim` are flattened. +The order of elements in :attr:`input` is unchanged. + +Unlike NumPy's flatten, which always copies input's data, this function may return the original object, a view, +or copy. If no dimensions are flattened, then the original object :attr:`input` is returned. Otherwise, if input can +be viewed as the flattened shape, then that view is returned. Finally, only if the input cannot be viewed as the +flattened shape is input's data copied. See :meth:`torch.Tensor.view` for details on when a view will be returned. + +.. note:: + Flattening a zero-dimensional tensor will return a one-dimensional view. 
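A short usage example matching the rewritten `torch.flatten` docstring above; the output shapes follow directly from the text, and whether a view or a copy is returned depends on the input's memory layout, as the docstring notes:

```python
import torch

t = torch.arange(24).reshape(2, 3, 4)

print(torch.flatten(t).shape)                          # torch.Size([24])
print(torch.flatten(t, start_dim=1).shape)             # torch.Size([2, 12])
print(torch.flatten(t, start_dim=0, end_dim=1).shape)  # torch.Size([6, 4])

# Per the note above, a zero-dimensional tensor flattens to a one-dimensional view.
print(torch.flatten(torch.tensor(3.14)).shape)         # torch.Size([1])
```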
Args: {input} @@ -4174,7 +4184,6 @@ def merge_dicts(*dicts): tensor([ 0.5724, 0.0000, -0.1208]) """.format(**common_args)) -# TODO: update kwargs formatting (see https://github.com/pytorch/pytorch/issues/43667) add_docstr(torch.linspace, r""" linspace(start, end, steps, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor @@ -4201,6 +4210,8 @@ def merge_dicts(*dicts): start (float): the starting value for the set of points end (float): the ending value for the set of points steps (int): size of the constructed tensor + +Keyword arguments: {out} {dtype} {layout} @@ -4537,7 +4548,6 @@ def merge_dicts(*dicts): tensor([ True, True, False, False]) """.format(**common_args)) -# TODO: update kwargs formatting (see https://github.com/pytorch/pytorch/issues/43667) add_docstr(torch.logspace, """ logspace(start, end, steps, base=10.0, *, \ out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor @@ -4568,7 +4578,9 @@ def merge_dicts(*dicts): start (float): the starting value for the set of points end (float): the ending value for the set of points steps (int): size of the constructed tensor - base (float): base of the logarithm function. Default: ``10.0``. + base (float, optional): base of the logarithm function. Default: ``10.0``. + +Keyword arguments: {out} {dtype} {layout} @@ -5469,36 +5481,15 @@ def merge_dicts(*dicts): add_docstr(torch.argmin, r""" -argmin(input) -> LongTensor +argmin(input, dim=None, keepdim=False) -> LongTensor -Returns the indices of the minimum value of all elements in the :attr:`input` tensor. +Returns the indices of the minimum value(s) of the flattened tensor or along a dimension This is the second value returned by :meth:`torch.min`. See its documentation for the exact semantics of this method. .. note:: If there are multiple minimal values then the indices of the first minimal value are returned. -Args: - {input} - -Example:: - - >>> a = torch.randn(4, 4) - >>> a - tensor([[ 0.1139, 0.2254, -0.1381, 0.3687], - [ 1.0100, -1.1975, -0.0102, -0.4732], - [-0.9240, 0.1207, -0.7506, -1.0213], - [ 1.7809, -1.2960, 0.9384, 0.1438]]) - >>> torch.argmin(a) - tensor(13) - -.. function:: argmin(input, dim, keepdim=False) -> LongTensor - -Returns the indices of the minimum values of a tensor across a dimension. - -This is the second value returned by :meth:`torch.min`. See its -documentation for the exact semantics of this method. - Args: {input} {dim} If ``None``, the argmin of the flattened input is returned. @@ -5512,8 +5503,15 @@ def merge_dicts(*dicts): [ 1.0100, -1.1975, -0.0102, -0.4732], [-0.9240, 0.1207, -0.7506, -1.0213], [ 1.7809, -1.2960, 0.9384, 0.1438]]) + >>> torch.argmin(a) + tensor(13) >>> torch.argmin(a, dim=1) tensor([ 2, 1, 3, 1]) + >>> torch.argmin(a, dim=1, keepdim=True) + tensor([[2], + [1], + [3], + [1]]) """.format(**single_dim_common)) add_docstr(torch.mm, @@ -6328,7 +6326,6 @@ def merge_dicts(*dicts): """.format(**common_args)) -# TODO: see https://github.com/pytorch/pytorch/issues/43667 add_docstr(torch.ones, r""" ones(*size, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor @@ -6339,6 +6336,8 @@ def merge_dicts(*dicts): Args: size (int...): a sequence of integers defining the shape of the output tensor. Can be a variable number of arguments or a collection like a list or tuple. 
+ +Keyword arguments: {out} {dtype} {layout} @@ -6356,7 +6355,6 @@ def merge_dicts(*dicts): """.format(**factory_common_args)) -# TODO: see https://github.com/pytorch/pytorch/issues/43667 add_docstr(torch.ones_like, r""" ones_like(input, *, dtype=None, layout=None, device=None, requires_grad=False, memory_format=torch.preserve_format) -> Tensor @@ -6372,6 +6370,8 @@ def merge_dicts(*dicts): Args: {input} + +Keyword arguments: {dtype} {layout} {device} @@ -8260,7 +8260,7 @@ def merge_dicts(*dicts): Args: input (Tensor): the input tensor of size :math:`(*, n, n)` where `*` is zero or more batch dimensions consisting of symmetric matrices. - eigenvectors(boolean, optional): controls whether eigenvectors have to be computed + eigenvectors(bool, optional): controls whether eigenvectors have to be computed upper(boolean, optional): controls whether to consider upper-triangular or lower-triangular region Keyword args: @@ -9270,7 +9270,7 @@ def merge_dicts(*dicts): add_docstr(torch.full_like, """ -full_like(input, fill_value, \\*, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, \ +full_like(input, fill_value, \\*, dtype=None, layout=torch.strided, device=None, requires_grad=False, \ memory_format=torch.preserve_format) -> Tensor Returns a tensor with the same size as :attr:`input` filled with :attr:`fill_value`. @@ -9489,9 +9489,10 @@ def merge_dicts(*dicts): Batched version for complex inputs is only supported on the CPU. Arguments: - input (Tensor): The input tensor of size :math:`(*, m, n)` where :math:`*` is zero or more batch dimensions - rcond (float): A floating point value to determine the cutoff for small singular values. - Default: 1e-15 + input (Tensor): The input tensor of size :math:`(*, m, n)` where :math:`*` is + zero or more batch dimensions. + rcond (float, optional): A floating point value to determine the cutoff for + small singular values. Default: ``1e-15``. Returns: The pseudo-inverse of :attr:`input` of dimensions :math:`(*, n, m)` @@ -9887,6 +9888,8 @@ def merge_dicts(*dicts): Arguments: y (Tensor): The values of the function to integrate + +Keyword args: dx (float): The distance between points at which `y` is sampled. dim (int): The dimension along which to integrate. By default, use the last dimension. diff --git a/torch/_vmap_internals.py b/torch/_vmap_internals.py index 67e2ec1a2cd9..26f32cfd9ffd 100644 --- a/torch/_vmap_internals.py +++ b/torch/_vmap_internals.py @@ -137,7 +137,7 @@ def _get_name(func: Callable): # Not all callables have __name__, in fact, only static functions/methods do. # A callable created via functools.partial or an nn.Module, to name some # examples, don't have a __name__. - fn_name = repr(func) + return repr(func) # vmap(func)(inputs) wraps all Tensor inputs to be batched in BatchedTensors, # sends those into func, and then unwraps the output BatchedTensors. 
Operations diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py index a5c078e84f4c..a3d0da1aef9d 100644 --- a/torch/autograd/profiler.py +++ b/torch/autograd/profiler.py @@ -468,7 +468,8 @@ def config(self): self.profiler_kind, self.record_shapes, self.profile_memory, - self.with_stack) + self.with_stack, + self.with_flops) def __enter__(self): if not self.enabled: @@ -746,6 +747,7 @@ def __enter__(self): torch.autograd.ProfilerState.NVTX, self.record_shapes, False, + False, False) ) return self diff --git a/torch/contrib/_tensorboard_vis.py b/torch/contrib/_tensorboard_vis.py index b3039f4cdd4f..b1b8d35a511d 100644 --- a/torch/contrib/_tensorboard_vis.py +++ b/torch/contrib/_tensorboard_vis.py @@ -1,6 +1,7 @@ import time from collections import defaultdict from functools import partial +from typing import DefaultDict import torch @@ -104,7 +105,7 @@ def inline_graph(subgraph, name, node): for out, val in zip(subgraph.outputs(), node.outputs()): value_map[val.unique()] = rec_value_map[out.unique()] - op_id_counter = defaultdict(int) + op_id_counter: DefaultDict[str, int] = defaultdict(int) def name_for(node): kind = node.kind()[node.kind().index('::') + 2:] diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp index ea9812bb360e..ca999652db5c 100644 --- a/torch/csrc/Module.cpp +++ b/torch/csrc/Module.cpp @@ -726,7 +726,6 @@ PyObject* initModule() { methods.data() }; ASSERT_TRUE(module = PyModule_Create(&torchmodule)); - ASSERT_TRUE(THPWrapper_init(module)); ASSERT_TRUE(THPGenerator_init(module)); ASSERT_TRUE(THPException_init(module)); THPSize_init(module); diff --git a/torch/csrc/PtrWrapper.cpp b/torch/csrc/PtrWrapper.cpp deleted file mode 100644 index aa48c49949b9..000000000000 --- a/torch/csrc/PtrWrapper.cpp +++ /dev/null @@ -1,102 +0,0 @@ -#include -#include -#include - -static PyObject* THPWrapperClass = nullptr; - -struct THPWrapper { - PyObject_HEAD - void *data; - void (*destructor)(void*); -}; - -PyObject * THPWrapper_New(void *data, void (*destructor)(void*)) -{ - PyObject *args = PyTuple_New(0); - if (!args) { - return nullptr; - } - PyObject *result = PyObject_Call(THPWrapperClass, args, nullptr); - if (result) { - THPWrapper* wrapper = (THPWrapper*) result; - wrapper->data = data; - wrapper->destructor = destructor; - } - Py_DECREF(args); - return result; -} - -bool THPWrapper_check(PyObject * obj) -{ - return (PyObject*)Py_TYPE(obj) == THPWrapperClass; -} - -void * THPWrapper_get(PyObject * obj) -{ - return ((THPWrapper*)obj)->data; -} - -static PyObject * THPWrapper_pynew(PyTypeObject *type, PyObject *args, PyObject *kwargs) -{ - PyObject* self = type->tp_alloc(type, 0); - THPWrapper* wrapper = (THPWrapper*) self; - wrapper->data = nullptr; - wrapper->destructor = nullptr; - return self; -} - -static void THPWrapper_dealloc(THPWrapper* self) -{ - self->destructor(self->data); - Py_TYPE(self)->tp_free((PyObject*)self); -} - -PyTypeObject THPWrapperType = { - PyVarObject_HEAD_INIT(nullptr, 0) - "torch._C._PtrWrapper", /* tp_name */ - sizeof(THPWrapper), /* tp_basicsize */ - 0, /* tp_itemsize */ - (destructor)THPWrapper_dealloc, /* tp_dealloc */ - 0, /* tp_vectorcall_offset */ - nullptr, /* tp_getattr */ - nullptr, /* tp_setattr */ - nullptr, /* tp_reserved */ - nullptr, /* tp_repr */ - nullptr, /* tp_as_number */ - nullptr, /* tp_as_sequence */ - nullptr, /* tp_as_mapping */ - nullptr, /* tp_hash */ - nullptr, /* tp_call */ - nullptr, /* tp_str */ - nullptr, /* tp_getattro */ - nullptr, /* tp_setattro */ - nullptr, /* tp_as_buffer */ - Py_TPFLAGS_DEFAULT, 
/* tp_flags */ - nullptr, /* tp_doc */ - nullptr, /* tp_traverse */ - nullptr, /* tp_clear */ - nullptr, /* tp_richcompare */ - 0, /* tp_weaklistoffset */ - nullptr, /* tp_iter */ - nullptr, /* tp_iternext */ - nullptr, /* tp_methods */ - nullptr, /* tp_members */ - nullptr, /* tp_getset */ - nullptr, /* tp_base */ - nullptr, /* tp_dict */ - nullptr, /* tp_descr_get */ - nullptr, /* tp_descr_set */ - 0, /* tp_dictoffset */ - nullptr, /* tp_init */ - nullptr, /* tp_alloc */ - THPWrapper_pynew, /* tp_new */ -}; - -bool THPWrapper_init(PyObject *module) -{ - THPWrapperClass = (PyObject*)&THPWrapperType; - if (PyType_Ready(&THPWrapperType) < 0) - return false; - Py_INCREF(&THPWrapperType); - return true; -} diff --git a/torch/csrc/PtrWrapper.h b/torch/csrc/PtrWrapper.h deleted file mode 100644 index 985193c74c9b..000000000000 --- a/torch/csrc/PtrWrapper.h +++ /dev/null @@ -1,16 +0,0 @@ -#ifndef THP_PTR_WRAPPER_H -#define THP_PTR_WRAPPER_H - -#include - -/** - * Python wrapper around arbitrary opaque C++ class - */ - -bool THPWrapper_init(PyObject *module); - -PyObject * THPWrapper_New(void *data, void (*destructor)(void*)); -void * THPWrapper_get(PyObject * obj); -bool THPWrapper_check(PyObject * obj); - -#endif diff --git a/torch/csrc/THP.h b/torch/csrc/THP.h index edf4621765f8..26f6c06b3d20 100644 --- a/torch/csrc/THP.h +++ b/torch/csrc/THP.h @@ -31,7 +31,6 @@ #include #include #include -#include #include #include #include diff --git a/torch/csrc/autograd/FunctionsManual.cpp b/torch/csrc/autograd/FunctionsManual.cpp index 0121fef95155..6558295d58cb 100644 --- a/torch/csrc/autograd/FunctionsManual.cpp +++ b/torch/csrc/autograd/FunctionsManual.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -160,10 +161,21 @@ std::tuple _euclidean_dist_backward(const Tensor & grad, const T x2 * ratio.sum(-2, false).unsqueeze(-1) - ratio.transpose(-2, -1).matmul(x1)}; } -Tensor norm_backward(const Tensor & grad, const Tensor & self, const optional & p_, const Tensor & norm) { +Tensor norm_backward(const Tensor& grad, const Tensor& self, const optional & p_, const Tensor& norm) { + return norm_backward(grad, self, p_, norm, {}, true); +} + +Tensor norm_backward(Tensor grad, const Tensor& self, const optional & p_, Tensor norm, IntArrayRef dim, bool keepdim) { + size_t ndim = self.sizes().size(); double p = p_.value_or(2.0).toDouble(); Tensor self_scaled; Tensor scale_v; + + if (!keepdim && self.dim() != 0) { + grad = unsqueeze_multiple(grad, dim, ndim); + norm = unsqueeze_multiple(norm, dim, ndim); + } + if (p == 0.0) { return at::zeros_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT); } else if (p == 1.0) { @@ -172,8 +184,13 @@ Tensor norm_backward(const Tensor & grad, const Tensor & self, const optional & p_, Tensor norm, IntArrayRef dim, bool keepdim) { - IntArrayRef sizes = self.sizes(); - if (!keepdim && self.dim() != 0) { - if (dim.size()==1) { - grad = grad.unsqueeze(dim[0]); - norm = norm.unsqueeze(dim[0]); - } else { - auto dims_to_unsqueeze = at::dim_list_to_bitset(dim, sizes.size()); - for (size_t i = 0; i < sizes.size(); i++){ - if (dims_to_unsqueeze[i]) { - grad = grad.unsqueeze(i); - norm = norm.unsqueeze(i); - } - } - } - } - return norm_backward(grad, self, p_, norm); -} - -Tensor pow_backward(Tensor grad, const Tensor & self, const Scalar & exponent_) { - auto exponent = (exponent_.isComplex()) ? 
exponent_.toComplexDouble() : exponent_.toDouble(); - if (exponent == 0.0) { +Tensor pow_backward(Tensor grad, const Tensor & self, const Scalar & exponent) { + if (exponent.equal(0.0)) { return at::zeros_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT); } else { - auto out = grad * (exponent * self.pow(exponent - 1)).conj(); + auto grad_lambda = [&](auto exp) { return grad * (exp * self.pow(exp - 1)).conj(); }; + Tensor out = (exponent.isComplex()) ? grad_lambda(exponent.toComplexDouble()) : grad_lambda(exponent.toDouble()); return handle_r_to_c(self, out); } } @@ -243,9 +241,8 @@ Tensor pow_backward_exponent(Tensor grad, const Tensor& self, const Tensor& expo } Tensor pow_backward_exponent(Tensor grad, const Scalar & base, const Tensor& exponent, Tensor result) { - auto base_ = base.isComplex() ? base.toComplexDouble() : base.toDouble(); - auto grad_lambda = [](auto a, auto b) { return (a * std::log(b)).conj(); }; - if (base_ == 0.0) { + auto grad_lambda = [](Tensor a, Scalar b) { return (a * b.log()).conj(); }; + if (base.equal(0.0)) { auto cond = [](auto exp) { if (exp.is_complex()) { return at::logical_and(at::imag(exp) == 0, at::real(exp) >= 0); @@ -255,10 +252,10 @@ Tensor pow_backward_exponent(Tensor grad, const Scalar & base, const Tensor& exp }; auto out = grad * at::where(cond(exponent), at::zeros({}, grad.options()), - grad_lambda(result, base_)); + grad_lambda(result, base)); return handle_r_to_c(exponent, out); } else { - auto out = grad * grad_lambda(result, base_); + auto out = grad * grad_lambda(result, base); return handle_r_to_c(exponent, out); } } @@ -2215,15 +2212,17 @@ Tensor det_backward(const Tensor & grad, const Tensor& self, const Tensor& det) return nonsingular_case_backward(grad, self, det); } } else { - auto nonzero_det_indices = at::where(det); + auto nonzero_det_indices = at::native::toListOfOptionalTensors(at::where(det)); + c10::optional first_nonzero_det_index = nonzero_det_indices[0]; - if (nonzero_det_indices[0].size(0) == det.numel()) { // all determinants are nonzero (non-singular) + if (first_nonzero_det_index->size(0) == det.numel()) { // all determinants are nonzero (non-singular) return nonsingular_case_backward(grad, self, det); } - auto zero_det_indices = at::where(det == 0); + auto zero_det_indices = at::native::toListOfOptionalTensors(at::where(det == 0)); + c10::optional first_zero_det_index = zero_det_indices[0]; - if (zero_det_indices[0].size(0) == det.numel()) { // all determinants are zero (singular) + if (first_zero_det_index->size(0) == det.numel()) { // all determinants are zero (singular) return singular_case_backward(grad, self, det); } @@ -2265,15 +2264,17 @@ Tensor logdet_backward(const Tensor & grad, const Tensor& self, const Tensor& lo return singular_case_backward(grad, self); } } else { - auto finite_logdet_indices = at::where(logdet != -INFINITY); + auto finite_logdet_indices = at::native::toListOfOptionalTensors(at::where(logdet != -INFINITY)); + c10::optional first_finite_logdet_index = finite_logdet_indices[0]; - if (finite_logdet_indices[0].size(0) == logdet.numel()) { // all log determinants are finite (non-singular) + if (first_finite_logdet_index->size(0) == logdet.numel()) { // all log determinants are finite (non-singular) return nonsingular_case_backward(grad, self); } - auto neginf_logdet_indices = at::where(logdet == -INFINITY); + auto neginf_logdet_indices = at::native::toListOfOptionalTensors(at::where(logdet == -INFINITY)); + c10::optional first_neginf_logdet_index = neginf_logdet_indices[0]; - if 
(neginf_logdet_indices[0].size(0) == logdet.numel()) { // all log determinants are -inf (singular) + if (first_neginf_logdet_index->size(0) == logdet.numel()) { // all log determinants are -inf (singular) return singular_case_backward(grad, self); } @@ -2317,15 +2318,17 @@ Tensor slogdet_backward(const Tensor& grad_logabsdet, return nonsingular_case_backward(grad_logabsdet, self); } } else { - auto nonzero_signdet_indices = at::where(signdet); + auto nonzero_signdet_indices = at::native::toListOfOptionalTensors(at::where(signdet)); + c10::optional first_nonzero_signdet_index = nonzero_signdet_indices[0]; - if (nonzero_signdet_indices[0].size(0) == logabsdet.numel()) { // all log determinants are finite (non-singular) + if (first_nonzero_signdet_index->size(0) == logabsdet.numel()) { // all log determinants are finite (non-singular) return nonsingular_case_backward(grad_logabsdet, self); } - auto zero_signdet_indices = at::where(signdet == 0); + auto zero_signdet_indices = at::native::toListOfOptionalTensors(at::where(signdet == 0)); + c10::optional first_zero_signdet_index = zero_signdet_indices[0]; - if (zero_signdet_indices[0].size(0) == logabsdet.numel()) { // all log determinants are -inf (singular) + if (first_zero_signdet_index->size(0) == logabsdet.numel()) { // all log determinants are -inf (singular) return singular_case_backward(grad_logabsdet, self); } @@ -2877,8 +2880,8 @@ Tensor embedding_dense_double_backward(const Tensor & grad, const Tensor & indic return gg_weight.view(size); } -Tensor index_backward(Tensor zeros_like_self, TensorList indices, const Tensor& grad) { - return at::_index_put_impl_(zeros_like_self, indices, grad, true, true); +Tensor index_backward(Tensor zeros_like_self, const torch::List>& indices, const Tensor& grad) { + return at::_index_put_impl_(zeros_like_self, indices, grad, true, true); } Tensor _cudnn_ctc_loss_backward(const Tensor& grad_out, const Tensor& loss, const Tensor& raw_grad, bool zero_infinity) { diff --git a/torch/csrc/autograd/FunctionsManual.h b/torch/csrc/autograd/FunctionsManual.h index 3814e8078b23..30736e13f58a 100644 --- a/torch/csrc/autograd/FunctionsManual.h +++ b/torch/csrc/autograd/FunctionsManual.h @@ -124,7 +124,7 @@ at::Tensor slogdet_backward(const at::Tensor& grad_logabsdet, const at::Tensor& at::Tensor log1p_backward(const at::Tensor& grad, const at::Tensor& self); at::Tensor sparse_constructor_values_backward(const at::Tensor& sparse_grad_out, const at::Tensor& indices, at::IntArrayRef values_shape); at::Tensor embedding_dense_double_backward(const at::Tensor & grad, const at::Tensor & indices, int64_t padding_idx); -at::Tensor index_backward(at::Tensor zeros_like_self, at::TensorList indices, const at::Tensor& grad); +at::Tensor index_backward(at::Tensor zeros_like_self, const torch::List>& indices, const at::Tensor& grad); at::Tensor _cudnn_ctc_loss_backward(const at::Tensor& grad_out, const at::Tensor& loss, const at::Tensor& raw_grad, bool zero_infinity); Tensor svd_backward(const std::vector &grads, const Tensor& self, diff --git a/torch/csrc/autograd/VariableTypeManual.cpp b/torch/csrc/autograd/VariableTypeManual.cpp index 0663d7f46fa8..d1f15fff3669 100644 --- a/torch/csrc/autograd/VariableTypeManual.cpp +++ b/torch/csrc/autograd/VariableTypeManual.cpp @@ -66,10 +66,6 @@ Tensor unpack_opt(const Tensor & t, const char * name, int pos) { return unpack(t, name, pos); } -c10::optional unpack_opt(const c10::optional & t, const char * name, int pos) { - return t; -} - std::vector unpack(at::TensorList tl, const char 
*name, int pos) { std::vector ret(tl.size()); for (size_t i = 0; i < tl.size(); ++i) { @@ -94,7 +90,7 @@ void _backward( // instead of us having to unwrap it to Tensor _gradient here. Tensor _gradient = gradient.has_value() ? *gradient : Tensor(); std::vector input_vars(inputs.begin(), inputs.end()); - torch::autograd::backward({self}, {_gradient}, std::move(keep_graph), create_graph, input_vars); + torch::autograd::backward({self}, {_gradient}, keep_graph, create_graph, input_vars); } void set_data(Tensor & self, const Tensor & new_data) { @@ -230,7 +226,6 @@ Tensor _fw_primal(const Tensor & self, int64_t level) { // We don't have an outplace copy, so this can't be generated automatically Tensor & copy_(Tensor & self, const Tensor & src, bool non_blocking) { - jit::Value* output = nullptr; // TODO: once copy is exposed in Declarations.yaml we may be able to bind // it automatically auto& self_ = unpack(self, "self", 0); @@ -282,7 +277,7 @@ Tensor& resize_( } { at::AutoNonVariableTypeMode non_var_type_mode(true); - self_.resize_(size, std::move(optional_memory_format)); + self_.resize_(size, optional_memory_format); } if (self.fw_grad(/* level */ 0).defined()) { @@ -303,7 +298,7 @@ Tensor& resize_as_( } { at::AutoNonVariableTypeMode non_var_type_mode(true); - at::resize_as_(self_, the_template_, std::move(optional_memory_format)); + at::resize_as_(self_, the_template_, optional_memory_format); } // Handle fw grad diff --git a/torch/csrc/autograd/VariableTypeUtils.h b/torch/csrc/autograd/VariableTypeUtils.h index af02de68fc27..509a12e01140 100644 --- a/torch/csrc/autograd/VariableTypeUtils.h +++ b/torch/csrc/autograd/VariableTypeUtils.h @@ -266,12 +266,31 @@ inline void check_no_requires_grad(TensorList tensors, const char* name) { } } +inline void check_no_requires_grad(const c10::List>& tensors, const char* name) { + for (c10::optional tensor : tensors) { + if (tensor.has_value()) { + check_no_requires_grad(*tensor, name); + } + } +} + // Assumed that saved tensor lists are never inplace outputs inline std::vector make_saved_variable_list(TensorList tensors) { return fmap(tensors, [](const Tensor& tensor) -> SavedVariable { return SavedVariable{tensor, false /* is output */}; }); } +// Assumed that saved tensor lists are never inplace outputs +inline std::vector make_saved_variable_list(const c10::List>& tensors) { + return fmap(tensors, [](const c10::optional& tensor) -> SavedVariable { + if (tensor.has_value()) { + return SavedVariable{*tensor, false /* is output */}; + } else { + return SavedVariable{Tensor(), false /* is output */}; + } + }); +} + inline std::vector> to_args_sizes(TensorList tensors) { std::vector> args_sizes(tensors.size()); for (size_t i = 0; i < tensors.size(); ++i) { diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp index e8d426fd768e..975f1bf954a0 100644 --- a/torch/csrc/autograd/init.cpp +++ b/torch/csrc/autograd/init.cpp @@ -60,7 +60,7 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) { .value("CUDA", ActivityType::CUDA); py::class_(m, "ProfilerConfig") - .def(py::init()); + .def(py::init()); py::class_(m, "ProfilerEvent") .def("kind", &LegacyEvent::kindStr) diff --git a/torch/csrc/autograd/profiler_legacy.cpp b/torch/csrc/autograd/profiler_legacy.cpp index 3b1d254e985b..85272677a06b 100644 --- a/torch/csrc/autograd/profiler_legacy.cpp +++ b/torch/csrc/autograd/profiler_legacy.cpp @@ -226,8 +226,10 @@ void ProfilerThreadLocalState::pushRange( evt.setSequenceNr(fn.seqNr()); 
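// The extra-args collection and FLOP estimation a couple of statements below
// are comparatively expensive, so they are gated on config_.with_flops
// (assumption: this is surfaced on the Python side roughly as
// torch.autograd.profiler.profile(with_flops=True); the exact Python-level
// argument name is not shown in this diff).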
evt.setFwdThreadId(fn.forwardThreadId()); evt.setScope((uint8_t)fn.scope()); - evt.setExtraArgs(saveExtraArgs(fn)); - evt.setFlops(computeFlops(std::string(fn.name().str()), evt.extraArgs())); + if (config_.with_flops) { + evt.setExtraArgs(saveExtraArgs(fn)); + evt.setFlops(computeFlops(std::string(fn.name().str()), evt.extraArgs())); + } #ifndef C10_MOBILE // backward nodes source range corresponds to the forward node // TODO: consider using C++ stack trace diff --git a/torch/csrc/autograd/profiler_legacy.h b/torch/csrc/autograd/profiler_legacy.h index 3e07c8cb541b..23169cd33450 100644 --- a/torch/csrc/autograd/profiler_legacy.h +++ b/torch/csrc/autograd/profiler_legacy.h @@ -387,16 +387,19 @@ struct TORCH_API ProfilerConfig { ProfilerState state, bool report_input_shapes = false, bool profile_memory = false, - bool with_stack = false) + bool with_stack = false, + bool with_flops = false) : state(state), report_input_shapes(report_input_shapes), profile_memory(profile_memory), - with_stack(with_stack) {} + with_stack(with_stack), + with_flops(with_flops) {} ~ProfilerConfig() = default; ProfilerState state; bool report_input_shapes; bool profile_memory; bool with_stack; + bool with_flops; // Returns IValues corresponding to ProfilerConfig struct, to be used for // serialization. diff --git a/torch/csrc/autograd/python_engine.cpp b/torch/csrc/autograd/python_engine.cpp index eee29481bea5..a9c7d709466e 100644 --- a/torch/csrc/autograd/python_engine.cpp +++ b/torch/csrc/autograd/python_engine.cpp @@ -1,7 +1,6 @@ #include #include -#include #include #include #include diff --git a/torch/csrc/autograd/python_variable.h b/torch/csrc/autograd/python_variable.h index 60fa7fa7659d..41a2ccaeaedc 100644 --- a/torch/csrc/autograd/python_variable.h +++ b/torch/csrc/autograd/python_variable.h @@ -25,13 +25,17 @@ bool THPVariable_initModule(PyObject *module); THP_API PyObject * THPVariable_Wrap(torch::autograd::Variable var); static inline bool THPVariable_CheckTypeExact(PyTypeObject* tp) { + // Check that a python object is a `Tensor`, but not a `Tensor` subclass. + // (A subclass could have different semantics.) The one exception is + // Parameter, which is used for Python bookkeeping but is equivalent to + // Tensor as far as C++ is concerned. 
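// For example (illustrative): a plain torch.Tensor and a torch.nn.Parameter
// both satisfy this check, while an instance of a user-defined Tensor
// subclass does not.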
return ( tp == (PyTypeObject*)THPVariableClass || tp == (PyTypeObject*)ParameterClass ); } -inline bool THPVariable_CheckExact(PyObject *obj) { +static inline bool THPVariable_CheckExact(PyObject *obj) { return THPVariable_CheckTypeExact(Py_TYPE(obj)); } diff --git a/torch/csrc/autograd/python_variable_indexing.cpp b/torch/csrc/autograd/python_variable_indexing.cpp index 4b38d924c91b..285161a49ef2 100644 --- a/torch/csrc/autograd/python_variable_indexing.cpp +++ b/torch/csrc/autograd/python_variable_indexing.cpp @@ -351,6 +351,10 @@ int THPVariable_setitem(PyObject* self, PyObject* index, PyObject* py_value) { } auto& self_ = reinterpret_cast(self)->cdata; + if (self_.is_sparse()) + { + throw TypeError("Cannot assign to a sparse tensor"); + } OptionalDeviceGuard device_guard(device_of(self_)); at::Device self_device = self_.device(); Variable value; diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index b31d44a1d295..76b466c91f10 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -345,6 +345,7 @@ They are used in specifying strategies for reduction collectives, e.g., py::class_<::c10d::BarrierOptions>(module, "BarrierOptions") .def(py::init<>()) + .def_readwrite("device_ids", &::c10d::BarrierOptions::device_ids) .def_readwrite("timeout", &::c10d::BarrierOptions::timeout); py::class_<::c10d::AllToAllOptions>(module, "AllToAllOptions") diff --git a/torch/csrc/jit/backends/backend_detail.h b/torch/csrc/jit/backends/backend_detail.h index 2d19f2ed8950..00f0f2f9eb44 100644 --- a/torch/csrc/jit/backends/backend_detail.h +++ b/torch/csrc/jit/backends/backend_detail.h @@ -1,5 +1,6 @@ #pragma once +#include #include namespace torch { diff --git a/torch/csrc/jit/cuda/cuda.h b/torch/csrc/jit/cuda/cuda.h new file mode 100644 index 000000000000..fa92ce22d6e4 --- /dev/null +++ b/torch/csrc/jit/cuda/cuda.h @@ -0,0 +1,179 @@ +#include +#include +#include +#include + +namespace torch { +namespace jit { + +class CUDAEvent; +// This class is a wrapper around c10::cuda::CUDAStream. +// It is needed because TorchBind does not support all of the argument types +// for c10::cuda::CUDAStream. For more details, please refer to +// c10/cuda/CUDAStream.h. +class CUDAStream final : public CustomClassHolder { + public: + CUDAStream(int64_t device = -1, int64_t priority = 0) { + constexpr int64_t PRIORITY_INDEX = 0; + stream_ = std::make_unique( + c10::cuda::getStreamFromPool(priority < PRIORITY_INDEX, device)); + } + + CUDAStream(c10::cuda::CUDAStream s) { + stream_ = std::make_unique(s); + } + + bool query() { + return stream_->query(); + } + + c10::intrusive_ptr recordEvent( + c10::intrusive_ptr event); + + void synchronize() { + stream_->synchronize(); + } + + void waitEvent(c10::intrusive_ptr event); + + void waitStream(c10::intrusive_ptr stream); + + /// Get the CUDA device index that this stream is associated with. + int64_t device_index() const { + return stream_->device_index(); + } + + /// Get the full Device that this stream is associated with. The Device + /// is guaranteed to be a CUDA device. + c10::Device device() const { + return stream_->device(); + } + + /// Return the stream ID corresponding to this particular stream. + int64_t id() const { + return stream_->id(); + } + + /// Pack a CUDAStream to uint64_t representation. + /// The CUDAStream can be unpacked using unpack(). The format of + /// the uint64_t is unspecified and may be changed. 
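/// (Sketch) The packed value can be restored on the C++ side with
/// c10::cuda::CUDAStream::unpack(); the cuda::set_stream operator registered
/// in torch/csrc/jit/runtime/register_cuda_ops.cpp relies on exactly this
/// round trip, since a c10::cuda::CUDAStream cannot be returned directly from
/// a TorchBind class.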
+ int64_t pack() const { + return stream_->pack(); + } + + private: + std::unique_ptr stream_; + friend class CUDAEvent; +}; + +// This class is a wrapper around at::cuda::CUDAStream. +// It is needed because TorchBind does not support all of the argument types +// for at::cuda::CUDAEvent. For more details, please refer to +// aten/src/ATen/cuda/CUDAEvent.h. +class CUDAEvent final : public CustomClassHolder { + public: + CUDAEvent( + bool enable_timing = false, + bool blocking = false, + bool interprocess = false) { + int flags = cudaEventDisableTiming; + if (enable_timing) { + flags = cudaEventDefault; + } + if (blocking) { + flags |= cudaEventBlockingSync; + } + if (interprocess) { + TORCH_CHECK(!enable_timing); + flags |= cudaEventInterprocess; + } + + event_ = std::make_unique(flags); + } + + double elapsedTime(c10::intrusive_ptr end) { + return event_->elapsed_time(*end->event_); + } + + std::string ipcHandle() { + cudaIpcEventHandle_t handle; + event_->ipc_handle(&handle); + std::string str_handle((const char*)&handle, sizeof(handle)); + return str_handle; + } + + bool query() { + return event_->query(); + } + + void record(c10::intrusive_ptr stream); + + void synchronize() { + event_->synchronize(); + } + void wait(c10::intrusive_ptr stream); + + private: + void recordInternal(CUDAStream* stream); + std::unique_ptr event_; + + friend class CUDAStream; +}; + +c10::intrusive_ptr CUDAStream::recordEvent( + c10::intrusive_ptr event) { + if (!event) { + event = c10::make_intrusive(); + } + + event->recordInternal(this); + return event; +} + +void CUDAStream::waitEvent(c10::intrusive_ptr event) { + event->event_->block(*stream_); +} + +void CUDAStream::waitStream(c10::intrusive_ptr stream) { + auto ev = c10::make_intrusive(); + stream->recordEvent(ev); + waitEvent(ev); +} + +void CUDAEvent::record(c10::intrusive_ptr stream) { + event_->record(*stream->stream_); +} + +void CUDAEvent::recordInternal(CUDAStream* stream) { + event_->record(*stream->stream_); +} + +void CUDAEvent::wait(c10::intrusive_ptr stream) { + event_->block(*stream->stream_); +} + +TORCH_LIBRARY(cuda, m) { + auto stream_class = m.class_("Stream").def( + torch::init()); + auto event_class = m.class_("Event").def( + torch::init()); + + stream_class.def("query", &CUDAStream::query) + .def("record_event", &CUDAStream::recordEvent) + .def("synchronize", &CUDAStream::synchronize) + .def("wait_event", &CUDAStream::waitEvent) + .def("wait_stream", &CUDAStream::waitStream) + .def("device_index", &CUDAStream::device_index) + .def("device", &CUDAStream::device) + .def("pack", &CUDAStream::pack) + .def("id", &CUDAStream::id); + + event_class.def("elapsed_time", &CUDAEvent::elapsedTime) + .def("query", &CUDAEvent::query) + .def("record", &CUDAEvent::record) + .def("synchronize", &CUDAEvent::synchronize) + .def("wait", &CUDAEvent::wait); +}; + +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/frontend/script_type_parser.cpp b/torch/csrc/jit/frontend/script_type_parser.cpp index 8b1aa58b5aff..f4c1fa2c920d 100644 --- a/torch/csrc/jit/frontend/script_type_parser.cpp +++ b/torch/csrc/jit/frontend/script_type_parser.cpp @@ -211,6 +211,13 @@ TypePtr ScriptTypeParser::parseTypeFromExprImpl(const Expr& expr) const { } } + // Check if the type is a custom class. This is done by checking + // if type_name starts with "torch.classes." + if (type_name.find("torch.classes.") == 0) { + auto custom_class_type = getCustomClass("__torch__." 
+ type_name); + return custom_class_type; + } + throw ErrorReport(expr) << "Unknown type name '" << type_name << "'"; } else if (auto name = parseBaseTypeName(expr)) { auto itr = string_to_type_lut().find(*name); diff --git a/torch/csrc/jit/frontend/tracer.cpp b/torch/csrc/jit/frontend/tracer.cpp index 72ccd77f2220..1bab391bd393 100644 --- a/torch/csrc/jit/frontend/tracer.cpp +++ b/torch/csrc/jit/frontend/tracer.cpp @@ -103,6 +103,9 @@ void TracingState::delValue(const IValue& var) { Value* getValueTrace(const IValue& var) { return getTracingState()->getValue(var); } +Value* getOptTensorValueTrace(const c10::optional& var) { + return getValueTrace(IValue(var)); +} Value* TracingState::getValue(const IValue& var) { // allow tracing of tuples passed to List[Tensor] or Tuple[Tensor...] // arguments @@ -686,6 +689,16 @@ void addInputs( } n->addInput(list_node->output()); } +TORCH_API void addInputs( + Node* n, + const char* name, + const List>& value) { + Graph* g = n->owningGraph(); + Node* list_node = nullptr; + list_node = g->insertNode(g->createList( + OptionalType::ofTensor(), fmap(value, getOptTensorValueTrace))); + n->addInput(list_node->output()); +} void addInputs( Node* n, diff --git a/torch/csrc/jit/frontend/tracer.h b/torch/csrc/jit/frontend/tracer.h index 61d79cb3efd2..f5cbd821bda4 100644 --- a/torch/csrc/jit/frontend/tracer.h +++ b/torch/csrc/jit/frontend/tracer.h @@ -255,6 +255,10 @@ TORCH_API void addInputs( const char* name, ArrayRef value, bool allow_undefined = false); +TORCH_API void addInputs( + Node* n, + const char* name, + const List>& value); TORCH_API void addInputs( Node* n, const char* name, diff --git a/torch/csrc/jit/ir/alias_analysis.cpp b/torch/csrc/jit/ir/alias_analysis.cpp index 0b3e4a4a7b41..1ca0f48f9e17 100644 --- a/torch/csrc/jit/ir/alias_analysis.cpp +++ b/torch/csrc/jit/ir/alias_analysis.cpp @@ -572,7 +572,8 @@ void AliasDb::analyzeImpl(Node* node) { !aliasAnalysisHasSpecialCaseFor(node->kind()), "Special cases should be handled already if we're here."); - if (node->kind().is_aten() || node->kind().is_prim()) { + if (node->kind().is_aten() || node->kind().is_prim() || + node->kind().is_cuda()) { // TODO There is nothing in the system that relies on aten:: and prim:: // ops using AliasAnalysisKind::FROM_SCHEMA or // AliasAnalysisKind::INTERNAL_SPECIAL_CASE, but this is the intended diff --git a/torch/csrc/jit/ir/ir.cpp b/torch/csrc/jit/ir/ir.cpp index 65b410d82069..eb75928e5952 100644 --- a/torch/csrc/jit/ir/ir.cpp +++ b/torch/csrc/jit/ir/ir.cpp @@ -1079,6 +1079,11 @@ bool Node::hasSideEffects() const { case prim::rpc_sync: // It represents RPC message sent. case prim::rpc_remote: // It represents RPC message sent. case aten::wait: // It can represent RPC message received. 
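// The cuda:: cases below touch global CUDA state (the current stream and the
// current device), so they are reported as side-effecting to keep the
// optimizer from dropping or reordering them.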
+#ifndef __HIP_PLATFORM_HCC__ + case cuda::set_stream: + case cuda::_set_device: + case cuda::_current_device: +#endif case prim::Enter: case prim::Exit: return true; @@ -1094,7 +1099,7 @@ bool Node::hasSideEffects() const { return false; } - if (kind_.is_prim() || kind_.is_aten()) { + if (kind_.is_prim() || kind_.is_aten() || kind_.is_cuda()) { // TODO There is nothing in the system that relies on aten:: and prim:: // ops using AliasAnalysisKind::FROM_SCHEMA, // AliasAnalysisKind::INTERNAL_SPECIAL_CASE, or diff --git a/torch/csrc/jit/ir/ir.h b/torch/csrc/jit/ir/ir.h index 21f172f01465..02867b8639cd 100644 --- a/torch/csrc/jit/ir/ir.h +++ b/torch/csrc/jit/ir/ir.h @@ -72,6 +72,11 @@ using namespace ::c10::attr; namespace aten { using namespace ::c10::aten; } +namespace cuda { +#ifndef __HIP_PLATFORM_HCC__ +using namespace ::c10::cuda; +#endif +} // namespace cuda struct Function; struct MatchedSchema; diff --git a/torch/csrc/jit/mobile/module.h b/torch/csrc/jit/mobile/module.h index 8b7da739df9a..2be75c61b6b5 100644 --- a/torch/csrc/jit/mobile/module.h +++ b/torch/csrc/jit/mobile/module.h @@ -1,5 +1,6 @@ #pragma once //#include +#include #include #include diff --git a/torch/csrc/jit/passes/onnx/remove_inplace_ops_for_onnx.cpp b/torch/csrc/jit/passes/onnx/remove_inplace_ops_for_onnx.cpp index 162e596eb6a7..bc26183a25bb 100644 --- a/torch/csrc/jit/passes/onnx/remove_inplace_ops_for_onnx.cpp +++ b/torch/csrc/jit/passes/onnx/remove_inplace_ops_for_onnx.cpp @@ -271,6 +271,93 @@ std::vector ReshapeToAdvancedIndexingFormat( return indices; } +// Register index_put inputs/outputs through the blocks. +// Eg. The IR before updating: +// = prim::Loop(%10, %27) +// block0(%stream_idx.1 : int): +// = prim::Loop(%9, %27) +// block0(%i.1 : int): +// %36 : Tensor = aten::select(%bias.1, %26, %stream_idx.1) +// %41 : Tensor = aten::copy_(%37, %40, %25) +// -> (%27) +// -> (%27) +// After updating: +// %62 : Tensor = prim::Loop(%10, %27, %bias.2) +// block0(%stream_idx.1 : int, %bias.3 : Tensor): +// %61 : Tensor = prim::Loop(%9, %27, %bias.3) +// block0(%i.1 : int, %bias.1 : Tensor): +// %36 : Tensor = aten::select(%bias.1, %26, %stream_idx.1) +// %59 : Tensor?[] = prim::ListConstruct(%55, %58) +// %60 : Tensor = aten::index_put(%bias.1, %59, %45, %25) +// -> (%27, %60) +// -> (%27, %61) +void RegisterIndexPutInBlocks( + Value* orig_data, + Value* new_index_put, + Node* block_node, + Block* outer_block, + Node* next_node) { + auto cur_node = next_node; + while (nullptr != cur_node) { + if (cur_node->kind() != prim::Loop) + return; + cur_node = cur_node->owningBlock()->owningNode(); + } + + for (auto block_input : outer_block->inputs()) { + if (block_input->debugName() == orig_data->debugName()) { + AT_ERROR( + "More than one aten::index_put in a subblock are not supported."); + } + } + + // Register index_put outputs through the blocks. 
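// (Descriptive note) If the new index_put value is already an output of this
// block there is nothing to do; otherwise it is registered as a block output
// and threaded outward through every enclosing prim::Loop, mirroring the IR
// example above.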
+ for (auto block_output : outer_block->outputs()) { + if (block_output->debugName() == new_index_put->debugName()) + return; + } + outer_block->registerOutput(new_index_put); + std::vector> node_list = { + std::make_pair(outer_block, next_node)}; + next_node->addOutput()->copyMetadata(new_index_put); + auto next_block = next_node->owningBlock(); + while (nullptr != next_block->owningNode()) { + outer_block = next_block; + outer_block->registerOutput(next_node->output(0)); + next_node = outer_block->owningNode(); + next_node->addOutput()->copyMetadata(new_index_put); + next_block = next_node->owningBlock(); + node_list.emplace_back(std::make_pair(outer_block, next_node)); + } + + // Register index_put inputs through the blocks. + auto next_data = orig_data; + while (!node_list.empty()) { + auto cur_pair = node_list.back(); + // Add input to current node. + cur_pair.second->addInput(next_data); + // Add input to current block. + auto cur_input = cur_pair.first->addInput(); + cur_input->copyMetadata(next_data); + next_data = cur_input; + node_list.pop_back(); + } + // Update index_put inputs inside the inner most block. + auto prev_data = block_node->input(0); + for (auto node : block_node->owningBlock()->nodes()) { + size_t idx = 0; + for (auto inputs_ : node->inputs()) { + if (inputs_ == prev_data) { + node->replaceInput(idx, next_data); + idx++; + break; + } + } + } + orig_data->replaceAllUsesAfterNodeWith( + next_node->output(0)->node(), next_node->output(0)); +} + // Trace back all the slice & select nodes associated with the index_put node, // and convert them to associated indices. // E.g. The IR for x[1:3, 0] = update @@ -336,7 +423,16 @@ void SquashSliceAndSelect(Node* index_put_node) { new_index_put->copyMetadata(index_put_node->output()); index_put_node->output()->replaceAllUsesWith(new_index_put); - orig_data->replaceAllUsesAfterNodeWith(new_index_put->node(), new_index_put); + auto block_node = new_index_put->node(); + auto outer_block = block_node->owningBlock(); + auto next_node = outer_block->owningNode(); + if (nullptr == next_node) { + orig_data->replaceAllUsesAfterNodeWith( + new_index_put->node(), new_index_put); + return; + } + RegisterIndexPutInBlocks( + orig_data, new_index_put, block_node, outer_block, next_node); } void PrepareCopyForONNX(Block* block) { diff --git a/torch/csrc/jit/passes/tensorexpr_fuser.cpp b/torch/csrc/jit/passes/tensorexpr_fuser.cpp index 041471bfa077..e8091957ba65 100644 --- a/torch/csrc/jit/passes/tensorexpr_fuser.cpp +++ b/torch/csrc/jit/passes/tensorexpr_fuser.cpp @@ -15,6 +15,11 @@ #include #include +// NOLINTNEXTLINE +C10_DEFINE_bool( + torch_jit_disable_cat, + false, + "disable aten::cat in TE fusion groups"); namespace torch { namespace jit { @@ -202,6 +207,10 @@ bool isSupported(Node* node) { } } + if (FLAGS_torch_jit_disable_cat && node->kind() == aten::cat) { + return false; + } + return true; } diff --git a/torch/csrc/jit/python/python_sugared_value.cpp b/torch/csrc/jit/python/python_sugared_value.cpp index 933d3bb1a867..056e23d06f02 100644 --- a/torch/csrc/jit/python/python_sugared_value.cpp +++ b/torch/csrc/jit/python/python_sugared_value.cpp @@ -217,6 +217,32 @@ std::shared_ptr PythonModuleValue::attr( return toSugaredValue(member, m, loc, /*is_constant=*/true); } +#ifndef __HIP_PLATFORM_HCC__ +std::shared_ptr CUDAPythonModuleValue::attr( + const SourceRange& loc, + Function& m, + const std::string& field) { + // List of all the cuda operators which are supported in JIT + const std::unordered_set cuda_ops = {"current_stream", + 
"default_stream", + "_current_device", + "_set_device", + "device_index", + "device_count", + "set_stream"}; + + if (cuda_ops.find(field) != cuda_ops.end()) { + return std::make_shared(Symbol::cuda(field), c10::nullopt); + } + + py::object member = getattr(loc, field); + // note: is_constant = true because we consider that global properties + // on modules like math.pi or torch.float to be constants + // even though it is possible, though rare, for someone to mutate them + return toSugaredValue(member, m, loc, /*is_constant=*/true); +} +#endif + Value* ModuleValue::asValue(const SourceRange& loc, Function& m) { return self_; } @@ -938,6 +964,12 @@ std::shared_ptr toSugaredValue( if (auto callee = as_function(obj)) { return std::make_shared(callee->function_); } else if (py::isinstance(obj)) { +#ifndef USE_ROCM + std::string obj_name = py::cast(py::getattr(obj, "__name__")); + if (obj_name.compare("torch.cuda") == 0) { + return std::make_shared(obj); + } +#endif return std::make_shared(obj); } else if ( obj.ptr() == py::module::import("torch.jit").attr("_fork").ptr() || diff --git a/torch/csrc/jit/python/python_sugared_value.h b/torch/csrc/jit/python/python_sugared_value.h index b5d8f4490b3e..1edbc6c15cad 100644 --- a/torch/csrc/jit/python/python_sugared_value.h +++ b/torch/csrc/jit/python/python_sugared_value.h @@ -91,6 +91,20 @@ struct VISIBILITY_HIDDEN PythonModuleValue : public PythonValue { const std::string& field) override; }; +// Used for desugaring uses of the torch.cuda module. All the CUDA APIs with +// torch.cuda.* are resolved using CUDAPythonModuleValue. +#ifndef __HIP_PLATFORM_HCC__ +struct VISIBILITY_HIDDEN CUDAPythonModuleValue : public PythonValue { + explicit CUDAPythonModuleValue(py::object mod) + : PythonValue(std::move(mod)) {} + + std::shared_ptr attr( + const SourceRange& loc, + Function& m, + const std::string& field) override; +}; +#endif + // Represents all the parameters of a module as a List[Tensor] struct VISIBILITY_HIDDEN ConstantParameterList : public SugaredValue { ConstantParameterList(Value* the_list) : the_list_(the_list) {} diff --git a/torch/csrc/jit/runtime/interpreter.h b/torch/csrc/jit/runtime/interpreter.h index 120a3ffb7507..a4bb209cd17e 100644 --- a/torch/csrc/jit/runtime/interpreter.h +++ b/torch/csrc/jit/runtime/interpreter.h @@ -5,6 +5,7 @@ #include #include +#include #include #include diff --git a/torch/csrc/jit/runtime/register_cuda_ops.cpp b/torch/csrc/jit/runtime/register_cuda_ops.cpp new file mode 100644 index 000000000000..5cf31d626dd0 --- /dev/null +++ b/torch/csrc/jit/runtime/register_cuda_ops.cpp @@ -0,0 +1,87 @@ +// This file registers special JIT operators used to implement the PyTorch CUDA +// API in TorchScript. 
+#ifndef __HIP_PLATFORM_HCC__ +#include +#include +#include +#include +#include + +namespace torch { +namespace jit { + +namespace { + +c10::AliasAnalysisKind aliasAnalysisFromSchema() { + return c10::AliasAnalysisKind::FROM_SCHEMA; +} + +RegisterOperators const reg({ + Operator( + "cuda::current_stream(int64_t val) -> __torch__.torch.classes.cuda.Stream", + [](Stack* stack) { + auto idx = uint16_t(pop(stack).toInt()); + auto s = c10::cuda::getCurrentCUDAStream(idx); + auto st = make_custom_class(s); + push(stack, IValue(st)); + }, + aliasAnalysisFromSchema()), + Operator( + "cuda::default_stream(int64_t val) -> __torch__.torch.classes.cuda.Stream", + [](Stack* stack) { + auto idx = uint16_t(pop(stack).toInt()); + auto s = c10::cuda::getDefaultCUDAStream(idx); + auto st = make_custom_class(s); + push(stack, IValue(st)); + }, + aliasAnalysisFromSchema()), + Operator( + "cuda::_current_device() -> int", + [](Stack* stack) { + auto v = c10::cuda::current_device(); + push(stack, static_cast(v)); + }, + aliasAnalysisFromSchema()), + Operator( + "cuda::_set_device(int64_t val) -> ()", + [](Stack* stack) { + int64_t idx = -1; + pop(stack, idx); + c10::cuda::set_device(static_cast(idx)); + }, + aliasAnalysisFromSchema()), + Operator( + "cuda::device_index(Device device) -> int", + [](Stack* stack) { + auto device = pop(stack); + auto idx = device.toDevice().index(); + push(stack, idx); + }, + aliasAnalysisFromSchema()), + Operator( + "cuda::device_count() -> int", + [](Stack* stack) { push(stack, at::cuda::device_count()); }, + aliasAnalysisFromSchema()), + Operator( + "cuda::set_stream(__torch__.torch.classes.cuda.Stream stream) -> ()", + [](Stack* stack) { + auto v = pop(stack); + auto s = v.toCustomClass(); + // To set the current CUDA stream using + // c10::cuda::setCurrentCUDAStream, the jit::CUDAStream object needs + // to be converted to c10::cuda::CUDAStream. Since the latter cannot + // be returned from a class registered via TorchBind, this can only be + // achieved by packing the c10::cuda::CUDAStream instance contained + // inside the jit::CUDAStream object to a uint64_t representation, and + // unpacking it inside this operator. The unpacked stream is then used + // to set the current CUDA stream. 
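// (Illustrative sketch, assumptions noted) From TorchScript this operator is
// reached through torch.cuda.set_stream, e.g.:
//
//   @torch.jit.script
//   def use_default_stream(idx: int):
//       s = torch.cuda.default_stream(idx)
//       torch.cuda.set_stream(s)
//
// The user-facing wrappers live in torch/jit/cuda.py and may differ from this
// sketch.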
+ auto packed = s->pack(); + auto unpacked = c10::cuda::CUDAStream::unpack(packed); + c10::cuda::setCurrentCUDAStream(unpacked); + }, + aliasAnalysisFromSchema()), +}); +} // namespace +} // namespace jit +} // namespace torch +#endif diff --git a/torch/csrc/jit/runtime/register_prim_ops.cpp b/torch/csrc/jit/runtime/register_prim_ops.cpp index f23b09dc0e74..fe75ec52046e 100644 --- a/torch/csrc/jit/runtime/register_prim_ops.cpp +++ b/torch/csrc/jit/runtime/register_prim_ops.cpp @@ -908,7 +908,7 @@ RegisterOperators reg( TORCH_SELECTIVE_SCHEMA( "aten::index.Tensor_hacked_twin(Tensor self, Tensor[] indices) -> Tensor"), [](Stack* stack) { - auto indices = pop(stack).toTensorVector(); + auto indices = pop(stack).to>>(); auto self = pop(stack).toTensor(); auto result = at::index(self, indices); push(stack, std::move(result)); @@ -921,7 +921,7 @@ RegisterOperators reg( auto unsafe = pop(stack).toBool(); auto accumulate = pop(stack).toBool(); auto values = pop(stack).toTensor(); - auto indices = pop(stack).toTensorVector(); + auto indices = pop(stack).to>>(); auto self = pop(stack).toTensor(); auto result = at::_index_put_impl_(self, indices, values, accumulate, unsafe); @@ -934,7 +934,7 @@ RegisterOperators reg( [](Stack* stack) { auto accumulate = pop(stack).toBool(); auto values = pop(stack).toTensor(); - auto indices = pop(stack).toTensorVector(); + auto indices = pop(stack).to>>(); auto self = pop(stack).toTensor(); auto result = at::index_put_(self, indices, values, accumulate); push(stack, std::move(result)); @@ -946,7 +946,7 @@ RegisterOperators reg( [](Stack* stack) { auto accumulate = pop(stack).toBool(); auto values = pop(stack).toTensor(); - auto indices = pop(stack).toTensorVector(); + auto indices = pop(stack).to>>(); auto self = pop(stack).toTensor(); auto result = at::index_put_(self, indices, values, accumulate); push(stack, std::move(result)); diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp index 11fb5dae2d6c..5c118f513565 100644 --- a/torch/csrc/jit/runtime/static/ops.cpp +++ b/torch/csrc/jit/runtime/static/ops.cpp @@ -88,7 +88,7 @@ REGISTER_OPERATOR_FUNCTOR(aten::add, aten_add, [](Node* n) -> SROperator { auto out_t = p_node->Output(0, reg).toTensor(); static_add op{out_t}; op.meta(in0_t, in1_t, in2_s); - op.impl(out_t, in0_t, in1_t, in2_s); + op.impl(in0_t, in1_t, in2_s, out_t); }; }); diff --git a/torch/csrc/jit/runtime/vararg_functions.h b/torch/csrc/jit/runtime/vararg_functions.h index 36bef721d626..d6eba7f5d191 100644 --- a/torch/csrc/jit/runtime/vararg_functions.h +++ b/torch/csrc/jit/runtime/vararg_functions.h @@ -2,6 +2,7 @@ #include #include #include +#include #include namespace torch { diff --git a/torch/csrc/jit/serialization/python_print.cpp b/torch/csrc/jit/serialization/python_print.cpp index e203a03a2e24..c86cbc460c9c 100644 --- a/torch/csrc/jit/serialization/python_print.cpp +++ b/torch/csrc/jit/serialization/python_print.cpp @@ -1339,15 +1339,13 @@ struct PythonPrintImpl { body_ << "\"" << param << "\", "; } body_ << "]\n"; -#ifndef FBCODE_CAFFE2 - // Note: Forward compat gated. TODO: @voznesenskym to remove when ready. 
+ indent(); body_ << "__buffers__ = ["; for (const auto& buffer : buffers) { body_ << "\"" << buffer << "\", "; } body_ << "]\n"; -#endif } for (size_t i = 0; i < numAttrs; i++) { diff --git a/torch/csrc/jit/tensorexpr/eval.cpp b/torch/csrc/jit/tensorexpr/eval.cpp index e60a0bd704bf..186af3ca822f 100644 --- a/torch/csrc/jit/tensorexpr/eval.cpp +++ b/torch/csrc/jit/tensorexpr/eval.cpp @@ -834,8 +834,12 @@ class SimpleIREvaluatorImpl : public IRVisitor { return std::erfc(v); case kSqrt: return std::sqrt(v); - case kRsqrt: - return 1.0f / std::sqrt(v); + case kRsqrt: { + auto rsqrt = [](TInput v) __ubsan_ignore_float_divide_by_zero__ { + return 1.0f / std::sqrt(v); + }; + return rsqrt(v); + } case kCeil: return std::ceil(v); case kFloor: diff --git a/torch/csrc/jit/tensorexpr/kernel.cpp b/torch/csrc/jit/tensorexpr/kernel.cpp index 999186d4c4ed..e6e31ba4d96c 100644 --- a/torch/csrc/jit/tensorexpr/kernel.cpp +++ b/torch/csrc/jit/tensorexpr/kernel.cpp @@ -1282,8 +1282,9 @@ Tensor* TensorExprKernel::computeValue(const torch::jit::Value* v) { } break; case aten::rsqrt: { - return computeOneOperand( - "aten_rsqrt", v, [](const ExprHandle& a) { return rsqrt(a); }); + return computeOneOperand("aten_rsqrt", v, [](const ExprHandle& a) { + return rsqrt(promoteIntegerToDefaultType(a)); + }); } break; case aten::abs: { @@ -1531,12 +1532,12 @@ Stmt* TensorExprKernel::generateStmt(BackendType backendType) { root_stmt->accept(block_analysis.get()); } - // inlining output buffers duplicates computation. it slows down - // cpu code generation but is enabled on gpu because it avoids difficult - // synchronization logic across blocks. - bool inline_output_buffers = + // inlining output & intermediate buffers can duplicate computation. + // it slows down cpu code generation but is enabled on gpu because it avoids + // difficult synchronization logic across blocks. 
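// Illustration of the duplicated work: with b[i] = exp(a[i]) and
// c[i] = b[i] + b[i + 1], inlining b into c recomputes exp(a[i]) roughly twice
// per element; GPU codegen accepts that cost to avoid cross-block
// synchronization.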
+ bool allow_duplicated_work = (backendType == kCudaCodeGen || backendType == kBlockCodeGen); - l.inlineIntermediateBufs(inline_output_buffers); + l.inlineIntermediateBufs(allow_duplicated_work); if (backendType == kCudaCodeGen) { for (auto tensor : tensorOutputs_) { diff --git a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp index c5f94f16783d..adc3be984216 100644 --- a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp +++ b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp @@ -14,7 +14,13 @@ #include #include #include + +#if LLVM_VERSION_MAJOR >= 10 +#include +#else #include +#endif + #include #include #include @@ -533,7 +539,11 @@ void LLVMCodeGenImpl::emitKernel( PM, asmStream, nullptr, +#if LLVM_VERSION_MAJOR >= 10 + llvm::CodeGenFileType::CGFT_AssemblyFile); +#else llvm::TargetMachine::CodeGenFileType::CGFT_AssemblyFile); +#endif PM.run(*module_); } GRAPH_DEBUG( diff --git a/torch/csrc/jit/tensorexpr/loopnest.cpp b/torch/csrc/jit/tensorexpr/loopnest.cpp index 2fed242cf4c6..c2b274a3c9bb 100644 --- a/torch/csrc/jit/tensorexpr/loopnest.cpp +++ b/torch/csrc/jit/tensorexpr/loopnest.cpp @@ -9,6 +9,7 @@ #include #include +#include #include #include #include @@ -23,6 +24,28 @@ namespace torch { namespace jit { namespace tensorexpr { +class FunctionCallUseCount : public IRVisitor { + public: + std::unordered_map findUses(Stmt* s) { + s->accept(this); + return uses_; + } + + private: + void visit(const FunctionCall* v) override { + if (function_calls_[v->tensor()->buf()].insert(v).second) { + uses_[v->tensor()->buf()] = uses_[v->tensor()->buf()] + 1; + } + IRVisitor::visit(v); + } + + std::unordered_map uses_; + + // Sets of FunctionCalls in order to keep the results unique + std::unordered_map> + function_calls_; +}; + class IndexFlattener : public IRMutator { public: Stmt* flatten(Stmt* s) { @@ -751,28 +774,67 @@ bool LoopNest::computeInline(const Buf* b) { return true; } -void LoopNest::inlineIntermediateBufs(bool inline_output_buffers) { +// inlining buffers with multiple uses can create duplicated work, which can +// slow down cpu code generation but is enabled on gpu because it avoids +// difficult synchronization logic across blocks. Inlining trivial reads does +// not duplicate work +void LoopNest::inlineIntermediateBufs(bool allow_duplicated_work) { // We need to collect all intermediate buffers as the buffers to be inlined // before calling 'computeInline' since the buffers that are inlined are // erased from the set 'intermediate_bufs_' in that function. 
- std::unordered_set bufs_to_inline( - intermediate_bufs_.begin(), intermediate_bufs_.end()); + std::unordered_set bufs_to_inline; + + if (allow_duplicated_work) { + bufs_to_inline.insert(intermediate_bufs_.begin(), intermediate_bufs_.end()); + } else { + FunctionCallUseCount fcu; + auto function_call_uses = fcu.findUses(root_stmt_); + auto buf_load_store_uses = findLoadOrStoreUses(root_stmt_); + auto input_bufs = getInputBufs(); + + for (auto buf : intermediate_bufs_) { + TORCH_INTERNAL_ASSERT(buf_load_store_uses.count(buf)); + std::vector& uses = buf_load_store_uses[buf]; + auto stores = c10::filter( + uses, [](const BufLoadOrStoreUse& use) { return use.isStore; }); + + // if the intermediate is the buffer formed from reading in the input + // tensors, always inline, bc we are not duplicating any work + // and avoiding an intermediary buffer + if (stores.size() == 1) { + auto store = dynamic_cast(stores[0].s); + auto input_as_load = dynamic_cast(store->value()); + if (input_as_load && input_bufs.count(input_as_load->buf())) { + bufs_to_inline.insert(buf); + continue; + } + } - // inlining output buffers duplicates computation. it slows down - // cpu code generation but is enabled on gpu because it avoids difficult - // synchronization logic across blocks. - if (inline_output_buffers) { + // all bufs will have at least one store (if they have > 1 they cant be + // inlined anyway) + size_t reads = uses.size() - 1; + size_t function_call_reads = function_call_uses[buf]; + // if only one read, we can inline it without duplicating work + if ((reads + function_call_reads) <= 1) { + bufs_to_inline.insert(buf); + } + } + } + + if (allow_duplicated_work) { bufs_to_inline.insert(output_bufs_.begin(), output_bufs_.end()); } + for (auto b : bufs_to_inline) { computeInline(b); } } // TODO: Unify with DepTracker -class UseFinder : public IRVisitor { +class LoadOrStoreUseFinder : public IRVisitor { public: - std::unordered_map> findUses(Stmt* s) { + std::unordered_map> findUses( + Stmt* s) { uses_.clear(); s->accept(this); return uses_; @@ -794,15 +856,16 @@ class UseFinder : public IRVisitor { } Stmt* last_stmt_ = nullptr; - std::unordered_map> uses_; + std::unordered_map> uses_; // Sets of loads and stores in order to keep the results unique std::unordered_map> loads_; std::unordered_map> stores_; }; -std::unordered_map> findUses(Stmt* s) { - UseFinder uf; +std::unordered_map> +findLoadOrStoreUses(Stmt* s) { + LoadOrStoreUseFinder uf; return uf.findUses(s); } @@ -828,7 +891,7 @@ class ContainedStmtsFinder : public IRVisitor { std::unordered_set contained_; }; -bool containsAll(const std::vector& uses, Block* b) { +bool containsAll(const std::vector& uses, Block* b) { std::unordered_set not_found; for (auto use : uses) { not_found.insert(use.s); @@ -852,7 +915,7 @@ Block* findParentBlock(Stmt* s) { return nullptr; } -Block* findLowestContainingBlock(const std::vector& uses) { +Block* findLowestContainingBlock(const std::vector& uses) { // TODO: we're not using the most efficient algorithm here for simplicity. // Replace with something more performant in case it becomes a bottleneck. Block* b = findParentBlock(uses[0].s); @@ -872,7 +935,8 @@ Stmt* LoopNest::insertAllocFree(Stmt* stmt) { b = new Block({stmt}); } - std::unordered_map> uses = findUses(stmt); + std::unordered_map> uses = + findLoadOrStoreUses(stmt); // Insert allocations and frees for temporary buffers in the innermost // possible scope. 
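// (Descriptive note, based on the helpers above) findLoadOrStoreUses() yields
// the ordered Load/Store statements per buffer, and findLowestContainingBlock()
// picks the innermost block containing all of them; that block is where the
// Allocate/Free pair for each intermediate buffer is placed.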
for (const Buf* buf : intermediate_bufs_) { diff --git a/torch/csrc/jit/tensorexpr/loopnest.h b/torch/csrc/jit/tensorexpr/loopnest.h index 0a588f6a95e4..962d69f0458d 100644 --- a/torch/csrc/jit/tensorexpr/loopnest.h +++ b/torch/csrc/jit/tensorexpr/loopnest.h @@ -53,7 +53,7 @@ class TORCH_API LoopNest { bool computeInline(Stmt* s); bool computeInline(const Buf* b); - void inlineIntermediateBufs(bool inline_output_buffers); + void inlineIntermediateBufs(bool allow_duplicated_work); static void splitWithTail(For* f, int factor); static void splitWithTail( @@ -141,7 +141,7 @@ TORCH_API Stmt* FlattenIndexes(Stmt* s); // TODO: Revisit this once we decide on how dependencies analysis should look // like. Maybe we would choose to use a different API and BufUse would be // removed, or if we decide to keep it we need to properly document its API. -struct BufUse { +struct BufLoadOrStoreUse { Stmt* s; bool isStore; }; @@ -152,7 +152,8 @@ struct BufUse { * in the vectors reflects the order in which the uses appear in the given * statement. */ -std::unordered_map> findUses(Stmt* s); +std::unordered_map> +findLoadOrStoreUses(Stmt* s); } // namespace tensorexpr } // namespace jit diff --git a/torch/csrc/jit/testing/file_check.cpp b/torch/csrc/jit/testing/file_check.cpp index 83b54397b01c..f5e96a501bfd 100644 --- a/torch/csrc/jit/testing/file_check.cpp +++ b/torch/csrc/jit/testing/file_check.cpp @@ -548,7 +548,11 @@ FileCheck* FileCheck::check_count( const std::string& str, size_t count, bool exactly) { - fcImpl->addCheck(CHECK_COUNT, str, count); + TORCH_INTERNAL_ASSERT( + count != 0 || exactly, "Count == 0 && !exactly doesn't do anything"); + if (count) { + fcImpl->addCheck(CHECK_COUNT, str, count); + } if (exactly) { fcImpl->addCheck(CHECK_NOT, str); } diff --git a/torch/csrc/utils/out_types.cpp b/torch/csrc/utils/out_types.cpp new file mode 100644 index 000000000000..0ceeb43bd1f8 --- /dev/null +++ b/torch/csrc/utils/out_types.cpp @@ -0,0 +1,39 @@ +#include + +namespace torch { +namespace utils { + +// Used by python binding codegen to ensure any TensorOptions arguments are consistent +// with the out tensor's options +void check_out_type_matches(const at::Tensor& result, + at::ScalarType scalarType, bool scalarType_is_none, + c10::optional layout, + const at::Device& device, bool device_is_none) { + if (scalarType_is_none && !layout && device_is_none) { // common case + return; + } + if (!scalarType_is_none && result.scalar_type() != scalarType) { + AT_ERROR( + "dtype ", scalarType, + " does not match dtype of out parameter (", result.scalar_type(), ")"); + } + auto scalarType_arg = scalarType_is_none ? result.scalar_type() : scalarType; + auto device_type_arg = device_is_none ? 
result.device().type() : device.type(); + if (result.scalar_type() != scalarType_arg) { + AT_ERROR( + "scalar type ", scalarType_arg, + " does not match scalar type of out parameter (", result.scalar_type(), ")"); + } + if (layout && result.layout() != *layout) { + AT_ERROR( + "layout ", *layout, + " does not match layout of out parameter (", result.layout(), ")"); + } + if (result.device().type() != device_type_arg) { + AT_ERROR( + "device type ", device_type_arg, + " does not match device type of out parameter (", result.device().type(), ")"); + } +} + +}} diff --git a/torch/csrc/utils/out_types.h b/torch/csrc/utils/out_types.h new file mode 100644 index 000000000000..adc3686a6b97 --- /dev/null +++ b/torch/csrc/utils/out_types.h @@ -0,0 +1,14 @@ +#pragma once + +#include + +namespace torch { +namespace utils { + +TORCH_API void check_out_type_matches( + const at::Tensor& result, + at::ScalarType scalarType, bool scalarType_is_none, + c10::optional layout, + const at::Device& device, bool device_is_none); + +}} diff --git a/torch/csrc/utils/python_arg_parser.cpp b/torch/csrc/utils/python_arg_parser.cpp index af8dda2d767c..4208f653e05d 100644 --- a/torch/csrc/utils/python_arg_parser.cpp +++ b/torch/csrc/utils/python_arg_parser.cpp @@ -24,6 +24,7 @@ static std::unordered_map type_map = { {"double", ParameterType::DOUBLE}, {"complex", ParameterType::COMPLEX}, {"TensorList", ParameterType::TENSOR_LIST}, + {"c10::List>", ParameterType::TENSOR_LIST}, {"IntArrayRef", ParameterType::INT_LIST}, {"ArrayRef", ParameterType::FLOAT_LIST}, {"Generator", ParameterType::GENERATOR}, @@ -333,7 +334,7 @@ void append_overloaded_arg(std::vector* overloaded_args, PyObject* o bool is_tensor_and_append_overloaded(PyObject* obj, std::vector* overloaded_args) { if (THPVariable_CheckExact(obj)) { - // torch.Tensor instances (not subclasses) + // torch.Tensor instances (not subclasses, except for Parameter) return true; } @@ -861,7 +862,7 @@ bool FunctionSignature::parse(PyObject* self, PyObject* args, PyObject* kwargs, } int i = 0; - if (self != nullptr && !THPVariable_CheckExact(self) && check_has_torch_function(self)) { + if (self != nullptr && check_has_torch_function(self)) { append_overloaded_arg(&this->overloaded_args, self); } for (auto& param : params) { diff --git a/torch/csrc/utils/python_arg_parser.h b/torch/csrc/utils/python_arg_parser.h index 9b7d99014974..0f7f595f57f9 100644 --- a/torch/csrc/utils/python_arg_parser.h +++ b/torch/csrc/utils/python_arg_parser.h @@ -160,6 +160,7 @@ struct PythonArgs { inline at::Scalar scalarWithDefault(int i, at::Scalar default_scalar); inline std::vector scalarlist(int i); inline std::vector tensorlist(int i); + inline torch::List> list_of_optional_tensors(int i); template inline std::array tensorlist_n(int i); inline std::vector intlist(int i); @@ -327,6 +328,22 @@ inline std::vector PythonArgs::tensorlist(int i) { return res; } +inline torch::List> PythonArgs::list_of_optional_tensors(int i) { + if (!args[i]) return torch::List>(); + auto tuple = six::isTuple(args[i]); + THPObjectPtr arg = six::maybeAsTuple(args[i]); + auto size = tuple ? PyTuple_GET_SIZE(arg.get()) : PyList_GET_SIZE(arg.get()); + torch::List> res; + res.reserve(size); + for (int idx = 0; idx < size; idx++) { + PyObject* obj = tuple ? 
PyTuple_GET_ITEM(arg.get(), idx) : PyList_GET_ITEM(arg.get(), idx); + // This is checked by the argument parser so it's safe to cast without checking + // if this is a tensor first + res.push_back(reinterpret_cast(obj)->cdata); + } + return res; +} + template inline std::array PythonArgs::tensorlist_n(int i) { auto res = std::array(); diff --git a/torch/cuda/__init__.py b/torch/cuda/__init__.py index 8ee83fa81fe7..5535cef78395 100644 --- a/torch/cuda/__init__.py +++ b/torch/cuda/__init__.py @@ -271,6 +271,9 @@ def get_device_name(device: Optional[_device_t] = None) -> str: name. This function is a no-op if this argument is a negative integer. It uses the current device, given by :func:`~torch.cuda.current_device`, if :attr:`device` is ``None`` (default). + + Returns: + str: the name of the device """ return get_device_properties(device).name @@ -293,6 +296,15 @@ def get_device_capability(device: Optional[_device_t] = None) -> Tuple[int, int] def get_device_properties(device: _device_t) -> _CudaDeviceProperties: + r"""Gets the properties of a device. + + Args: + device (torch.device or int or str): device for which to return the + properties of the device. + + Returns: + _CudaDeviceProperties: the properties of the device + """ _lazy_init() # will define _get_device_properties device = _get_device_index(device, optional=True) if device < 0 or device >= device_count(): diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py index a8517a4bb394..5b300452f6d3 100644 --- a/torch/distributed/distributed_c10d.py +++ b/torch/distributed/distributed_c10d.py @@ -1,8 +1,8 @@ +import contextlib +import logging import pickle import torch import warnings -import contextlib -import sys import time from torch._six import string_classes from datetime import timedelta @@ -17,8 +17,8 @@ AllreduceOptions, AllreduceCoalescedOptions, AllToAllOptions, + BarrierOptions, BroadcastOptions, - FileStore, GatherOptions, PrefixStore, ProcessGroup, @@ -27,15 +27,8 @@ ReduceScatterOptions, ScatterOptions, Store, - TCPStore, ) -if sys.platform != 'win32': - from torch._C._distributed_c10d import ( - HashStore, - ) - - _MPI_AVAILABLE = True _NCCL_AVAILABLE = True _GLOO_AVAILABLE = True @@ -191,16 +184,35 @@ def _store_based_barrier(rank, store, timeout): """ store_key = "{}:{}".format(STORE_BASED_BARRIER_PREFIX, _group_count) store.add(store_key, 1) + logging.info('Added key: {} to store for rank: {}'.format(store_key, rank)) # Now wait for all workers to check in with the store. world_size = get_world_size() - worker_count = int(store.get(store_key)) + # Use 'add' instead of 'get' since for some store implementations 'add' + # doesn't work well with 'get'. Ideally the store implementations should + # be fixed, but for backward compatiblity reasons it is risky to change + # the store implementations. Once, we completely migrate away from these + # legacy stores, we can use 'get' here instead. + worker_count = store.add(store_key, 0) start = time.time() + log_time = time.time() while worker_count != world_size: time.sleep(0.01) - worker_count = int(store.get(store_key)) + worker_count = store.add(store_key, 0) + + # Print status periodically to keep track. 
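# (Illustrative) store.add(store_key, 0) acts as a read: every rank increments
# the counter exactly once via store.add(store_key, 1) above, so adding 0
# returns the current worker count without changing it.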
+ if timedelta(seconds=(time.time() - log_time)) > timedelta(seconds=10): + logging.info( + "Waiting in store based barrier to initialize process group for " + "rank: {}, key: {} (world_size={}, worker_count={}, timeout={})".format( + rank, store_key, world_size, worker_count, timeout)) + log_time = time.time() + if timedelta(seconds=(time.time() - start)) > timeout: - raise RuntimeError("Timed out initializing process group") + raise RuntimeError( + "Timed out initializing process group in store based barrier on " + "rank: {}, for key: {} (world_size={}, worker_count={}, timeout={})".format( + rank, store_key, world_size, worker_count, timeout)) def _rank_not_in_group(group: ProcessGroup): """ @@ -504,12 +516,8 @@ def init_process_group(backend, # barrier at the end to ensure that once we return from this method, all # process groups including global variables are updated correctly on all # ranks. - if backend == Backend.MPI or not ( - isinstance(store, TCPStore) or - isinstance(store, FileStore) or - (sys.platform != 'win32' and isinstance(store, HashStore)) - ): - # MPI doesn't have store. + if backend == Backend.MPI: + # MPI backend doesn't use store. barrier() else: # Use store based barrier here since barrier() used a bunch of @@ -2370,8 +2378,11 @@ def all_to_all(output_tensor_list, work.wait() + def barrier(group=GroupMember.WORLD, - async_op=False): + async_op=False, + device_ids=None): + """ Synchronizes all processes. @@ -2382,6 +2393,8 @@ def barrier(group=GroupMember.WORLD, group (ProcessGroup, optional): The process group to work on. If None, the default process group will be used. async_op (bool, optional): Whether this op should be an async op + device_ids ([int], optional): List of device/GPU ids. + Valid only for NCCL backend. Returns: Async work handle, if async_op is set to True. @@ -2390,11 +2403,22 @@ def barrier(group=GroupMember.WORLD, if _rank_not_in_group(group): return + opts = BarrierOptions() + if device_ids is not None: + if get_backend(group) != Backend.NCCL: + raise RuntimeError("Function argument device_ids not supported " + "for the selected backend {}".format(get_backend(group))) + if isinstance(device_ids, list): + opts.device_ids = device_ids + else: + raise RuntimeError("Invalid function argument: " + "device_ids type should be List[int]") + if group is None: default_pg = _get_default_group() - work = default_pg.barrier() + work = default_pg.barrier(opts=opts) else: - work = group.barrier() + work = group.barrier(opts=opts) if async_op: return work @@ -2491,16 +2515,12 @@ def new_group(ranks=None, timeout=default_pg_timeout, backend=None): # barrier at the end to ensure that once we return from this method, all # process groups including global variables are updated correctly on all # ranks. - if backend == Backend.MPI or not ( - isinstance(default_store, TCPStore) or - isinstance(default_store, FileStore) or - (sys.platform != 'win32' and isinstance(default_store, HashStore)) - ): + if backend == Backend.MPI: # MPI doesn't have store. barrier() else: # Use store based barrier here since barrier() used a bunch of # default devices and messes up NCCL internal state. 
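A minimal usage sketch for the new ``device_ids`` argument of ``barrier`` (illustrative; per the check above it is only accepted by the NCCL backend, and the setup here assumes the usual env:// rendezvous variables):

    import torch
    import torch.distributed as dist

    dist.init_process_group(backend="nccl", init_method="env://")
    torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count())
    # Tell NCCL which GPU this rank uses for the barrier.
    dist.barrier(device_ids=[torch.cuda.current_device()])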
- _store_based_barrier(group_rank, default_store, timeout) + _store_based_barrier(global_rank, default_store, timeout) return pg diff --git a/torch/distributed/rpc/server_process_global_profiler.py b/torch/distributed/rpc/server_process_global_profiler.py index 6cd7b168ec6a..d8de89bfc937 100644 --- a/torch/distributed/rpc/server_process_global_profiler.py +++ b/torch/distributed/rpc/server_process_global_profiler.py @@ -116,6 +116,7 @@ def __enter__(self): profiler_kind, self.record_shapes, self.profile_memory, + False, False) _enable_server_process_global_profiler(profiler_config) return self diff --git a/torch/distributions/cauchy.py b/torch/distributions/cauchy.py index 50be941e073a..63181a2a6733 100644 --- a/torch/distributions/cauchy.py +++ b/torch/distributions/cauchy.py @@ -69,8 +69,6 @@ def cdf(self, value): return torch.atan((value - self.loc) / self.scale) / math.pi + 0.5 def icdf(self, value): - if self._validate_args: - self._validate_sample(value) return torch.tan(math.pi * (value - 0.5)) * self.scale + self.loc def entropy(self): diff --git a/torch/distributions/constraints.py b/torch/distributions/constraints.py index 630c192ffed0..87d72d52d26b 100644 --- a/torch/distributions/constraints.py +++ b/torch/distributions/constraints.py @@ -3,13 +3,17 @@ - ``constraints.boolean`` - ``constraints.cat`` +- ``constraints.corr_cholesky`` - ``constraints.dependent`` - ``constraints.greater_than(lower_bound)`` +- ``constraints.greater_than_eq(lower_bound)`` - ``constraints.integer_interval(lower_bound, upper_bound)`` - ``constraints.interval(lower_bound, upper_bound)`` +- ``constraints.less_than(upper_bound)`` - ``constraints.lower_cholesky`` - ``constraints.lower_triangular`` - ``constraints.nonnegative_integer`` +- ``constraints.one_hot`` - ``constraints.positive`` - ``constraints.positive_definite`` - ``constraints.positive_integer`` @@ -57,6 +61,8 @@ class Constraint(object): A constraint object represents a region over which a variable is valid, e.g. within which a variable can be optimized. """ + is_discrete = False + def check(self, value): """ Returns a byte tensor of `sample_shape + batch_shape` indicating @@ -103,14 +109,30 @@ class _Boolean(Constraint): """ Constrain to the two values `{0, 1}`. """ + is_discrete = True + def check(self, value): return (value == 0) | (value == 1) +class _OneHot(Constraint): + """ + Constrain to one-hot vectors. + """ + is_discrete = True + + def check(self, value): + is_boolean = (value == 0) | (value == 1) + is_normalized = value.sum(-1).eq(1) + return is_boolean.all(-1) & is_normalized + + class _IntegerInterval(Constraint): """ Constrain to an integer interval `[lower_bound, upper_bound]`. """ + is_discrete = True + def __init__(self, lower_bound, upper_bound): self.lower_bound = lower_bound self.upper_bound = upper_bound @@ -128,6 +150,8 @@ class _IntegerLessThan(Constraint): """ Constrain to an integer interval `(-inf, upper_bound]`. """ + is_discrete = True + def __init__(self, upper_bound): self.upper_bound = upper_bound @@ -144,6 +168,8 @@ class _IntegerGreaterThan(Constraint): """ Constrain to an integer interval `[lower_bound, inf)`. 
""" + is_discrete = True + def __init__(self, lower_bound): self.lower_bound = lower_bound @@ -358,6 +384,7 @@ def check(self, value): dependent = _Dependent() dependent_property = _DependentProperty boolean = _Boolean() +one_hot = _OneHot() nonnegative_integer = _IntegerGreaterThan(0) positive_integer = _IntegerGreaterThan(1) integer_interval = _IntegerInterval diff --git a/torch/distributions/continuous_bernoulli.py b/torch/distributions/continuous_bernoulli.py index 180fbd8187ee..5d3d48840203 100644 --- a/torch/distributions/continuous_bernoulli.py +++ b/torch/distributions/continuous_bernoulli.py @@ -168,8 +168,6 @@ def cdf(self, value): torch.where(torch.ge(value, 1.0), torch.ones_like(value), unbounded_cdfs)) def icdf(self, value): - if self._validate_args: - self._validate_sample(value) cut_probs = self._cut_probs() return torch.where( self._outside_unstable_region(), diff --git a/torch/distributions/distribution.py b/torch/distributions/distribution.py index f16eb154e2dd..bc61e0b0584e 100644 --- a/torch/distributions/distribution.py +++ b/torch/distributions/distribution.py @@ -12,10 +12,21 @@ class Distribution(object): has_rsample = False has_enumerate_support = False - _validate_args = False + _validate_args = __debug__ @staticmethod def set_default_validate_args(value): + """ + Sets whether validation is enabled or disabled. + + The default behavior mimics Python's ``assert`` statement: validation + is on by default, but is disabled if Python is run in optimized mode + (via ``python -O``). Validation may be expensive, so you may want to + disable it once a model is working. + + Args: + value (bool): Whether to enable validation. + """ if value not in [True, False]: raise ValueError Distribution._validate_args = value diff --git a/torch/distributions/exponential.py b/torch/distributions/exponential.py index 41d7cd9f9787..ac18980c778b 100644 --- a/torch/distributions/exponential.py +++ b/torch/distributions/exponential.py @@ -68,8 +68,6 @@ def cdf(self, value): return 1 - torch.exp(-self.rate * value) def icdf(self, value): - if self._validate_args: - self._validate_sample(value) return -torch.log(1 - value) / self.rate def entropy(self): diff --git a/torch/distributions/laplace.py b/torch/distributions/laplace.py index d7ec01c65b35..a505d60c8f38 100644 --- a/torch/distributions/laplace.py +++ b/torch/distributions/laplace.py @@ -75,8 +75,6 @@ def cdf(self, value): return 0.5 - 0.5 * (value - self.loc).sign() * torch.expm1(-(value - self.loc).abs() / self.scale) def icdf(self, value): - if self._validate_args: - self._validate_sample(value) term = value - 0.5 return self.loc - self.scale * (term).sign() * torch.log1p(-2 * term.abs()) diff --git a/torch/distributions/negative_binomial.py b/torch/distributions/negative_binomial.py index 051725db19ca..4a8babb34a7c 100644 --- a/torch/distributions/negative_binomial.py +++ b/torch/distributions/negative_binomial.py @@ -77,8 +77,10 @@ def param_shape(self): @lazy_property def _gamma(self): + # Note we avoid validating because self.total_count can be zero. 
return torch.distributions.Gamma(concentration=self.total_count, - rate=torch.exp(-self.logits)) + rate=torch.exp(-self.logits), + validate_args=False) def sample(self, sample_shape=torch.Size()): with torch.no_grad(): diff --git a/torch/distributions/normal.py b/torch/distributions/normal.py index 2468e2f225dc..1f14f0ae015f 100644 --- a/torch/distributions/normal.py +++ b/torch/distributions/normal.py @@ -82,8 +82,6 @@ def cdf(self, value): return 0.5 * (1 + torch.erf((value - self.loc) * self.scale.reciprocal() / math.sqrt(2))) def icdf(self, value): - if self._validate_args: - self._validate_sample(value) return self.loc + self.scale * torch.erfinv(2 * value - 1) * math.sqrt(2) def entropy(self): diff --git a/torch/distributions/one_hot_categorical.py b/torch/distributions/one_hot_categorical.py index c661a245f716..64f696802d76 100644 --- a/torch/distributions/one_hot_categorical.py +++ b/torch/distributions/one_hot_categorical.py @@ -29,7 +29,7 @@ class OneHotCategorical(Distribution): """ arg_constraints = {'probs': constraints.simplex, 'logits': constraints.real} - support = constraints.simplex + support = constraints.one_hot has_enumerate_support = True def __init__(self, probs=None, logits=None, validate_args=None): diff --git a/torch/distributions/uniform.py b/torch/distributions/uniform.py index b212c52695c2..edaf5abf77a5 100644 --- a/torch/distributions/uniform.py +++ b/torch/distributions/uniform.py @@ -81,8 +81,6 @@ def cdf(self, value): return result.clamp(min=0, max=1) def icdf(self, value): - if self._validate_args: - self._validate_sample(value) result = value * (self.high - self.low) + self.low return result diff --git a/torch/fx/graph.py b/torch/fx/graph.py index 8f07f42529aa..fd0087dca398 100644 --- a/torch/fx/graph.py +++ b/torch/fx/graph.py @@ -577,7 +577,9 @@ def python_code(self, root_module: str) -> str: free_vars: List[str] = [] modules_used : Set[str] = set() body: List[str] = [] - maybe_return_annotation : str = '' + + # Wrap string in list to pass by reference + maybe_return_annotation : List[str] = [''] def register_modules_used(qualified_name : str): if '.' in qualified_name: @@ -675,7 +677,7 @@ def emit_node(node : Node): return elif node.op == 'output': if node.type is not None: - maybe_return_annotation = f" -> {type_repr(node.type)}" + maybe_return_annotation[0] = f" -> {type_repr(node.type)}" body.append(f'return {repr(node.args[0])}') return raise NotImplementedError(f'node: {node.op} {node.target}') @@ -695,7 +697,7 @@ def emit_node(node : Node): code = '\n'.join(' ' + line for line in code.split('\n')) + '\n' fn_code = f"""\ {import_block} -def forward(self, {', '.join(free_vars)}){maybe_return_annotation}: +def forward(self, {', '.join(free_vars)}){maybe_return_annotation[0]}: {code} """ diff --git a/torch/jit/__init__.py b/torch/jit/__init__.py index f2b0c5c53a99..cfd327165899 100644 --- a/torch/jit/__init__.py +++ b/torch/jit/__init__.py @@ -44,6 +44,7 @@ from torch.jit._serialization import save, load from torch.jit._fuser import optimized_execution, fuser, last_executed_optimized_graph +from torch.jit.cuda import stream from torch.jit._freeze import freeze # For backwards compatibility diff --git a/torch/jit/_async.py b/torch/jit/_async.py index 26bc6eeada67..ae9684a0e229 100644 --- a/torch/jit/_async.py +++ b/torch/jit/_async.py @@ -17,7 +17,7 @@ def fork(func, *args, **kwargs): - """ + r""" Creates an asynchronous task executing `func` and a reference to the value of the result of this execution. 
`fork` will return immediately, so the return value of `func` may not have been computed yet. To force completion @@ -42,7 +42,8 @@ def fork(func, *args, **kwargs): Example (fork a free function): - .. testcode:: + .. code-block:: python + import torch from torch import Tensor def foo(a : Tensor, b : int) -> Tensor: @@ -60,16 +61,17 @@ def bar(a): Example (fork a module method): - .. testcode:: + .. code-block:: python + import torch from torch import Tensor - class SubMod(torch.nn.Module): + class AddMod(torch.nn.Module): def forward(self, a: Tensor, b : int): return a + b class Mod(torch.nn.Module): def __init__(self): super(self).__init__() - self.mod = SubMod() + self.mod = AddMod() def forward(self, input): fut = torch.jit.fork(self.mod, a, b=2) return torch.jit.wait(fut) @@ -81,7 +83,7 @@ def forward(self, input): def wait(future): - """ + r""" Forces completion of a `torch.jit.Future[T]` asynchronous task, returning the result of the task. See :func:`~fork` for docs and examples. Args: diff --git a/torch/jit/_script.py b/torch/jit/_script.py index b391d88a88b1..bdf00e21c515 100644 --- a/torch/jit/_script.py +++ b/torch/jit/_script.py @@ -743,6 +743,43 @@ class RecursiveScriptModule(ScriptModule): # type: ignore def __init__(self, arg=None): super().__init__() +def call_prepare_scriptable_func_impl(obj, memo): + if not isinstance(obj, torch.nn.Module): + return obj + + obj_id = id(obj) + + # If obj_id is in memo, obj has already been prepared or is being + # prepared in another call up the stack. + if obj_id in memo: + return memo[id(obj)] + + obj = obj.__prepare_scriptable__() if hasattr(obj, '__prepare_scriptable__') else obj # type: ignore + # Record obj in memo to avoid infinite recursion in the case of cycles in the module + # hierarchy when recursing below. + memo[obj_id] = obj + + new_obj_dict = {} + + for name in obj.__dict__: + sub_module = obj.__dict__.get(name) + if name == '_modules': + for k, v in sub_module.items(): + sub_module[k] = call_prepare_scriptable_func_impl(v, memo) + new_obj_dict[name] = sub_module + elif isinstance(sub_module, torch.nn.Module) and not isinstance(sub_module, ScriptModule): + new_obj_dict[name] = call_prepare_scriptable_func_impl(sub_module, memo) + else: + new_obj_dict[name] = sub_module + + for k, v in new_obj_dict.items(): + obj.__dict__[name] = v + + return obj + +def call_prepare_scriptable_func(obj): + memo: Dict[int, torch.nn.Module] = {} + return call_prepare_scriptable_func_impl(obj, memo) def script(obj, optimize=None, _frames_up=0, _rcb=None): r""" @@ -896,6 +933,7 @@ def forward(self, input): return obj if isinstance(obj, torch.nn.Module): + obj = call_prepare_scriptable_func(obj) return torch.jit._recursive.create_script_module( obj, torch.jit._recursive.infer_methods_to_compile ) diff --git a/torch/jit/cuda.py b/torch/jit/cuda.py new file mode 100644 index 000000000000..16805301600b --- /dev/null +++ b/torch/jit/cuda.py @@ -0,0 +1,182 @@ +# mypy: ignore-errors + +r""" +This package adds support for JIT compilation for CUDA Streams and events, +This is similar to API's available in the eager mode +:ref:`cuda-semantics` has more details about working with CUDA. +""" + +import torch +from typing import Optional, Any +from torch import device as _device + +def get_current_device_index() -> int: + r"""Checks if there are CUDA devices available and + returns the device index of the current default CUDA device. + Returns -1 in case there are no CUDA devices available. 
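For reference, a self-contained sketch of the forked-module pattern discussed in this docstring, with the submodule call routed through `torch.jit.fork`/`torch.jit.wait` (module and argument names are illustrative only):

    import torch
    from torch import Tensor

    class AddMod(torch.nn.Module):
        def forward(self, a: Tensor, b: int) -> Tensor:
            return a + b

    class Mod(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.mod = AddMod()

        def forward(self, input: Tensor) -> Tensor:
            # Launch the submodule asynchronously, then block on the result.
            fut = torch.jit.fork(self.mod, input, b=2)
            return torch.jit.wait(fut)

    scripted = torch.jit.script(Mod())
    print(scripted(torch.ones(2)))  # tensor([3., 3.])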
+ + Arguments: ``None`` + """ + if torch.cuda.device_count() > 0: + return torch.cuda._current_device() + return -1 + +def get_device_index(device: Optional[_device] = None, optional: bool = False, allow_cpu: bool = False) -> int: + r"""Gets the device index from :attr:`device`, which can be a torch.device + object, a Python integer, or ``None``. + + If :attr:`device` is a torch.device object, returns the device index if it + is a CUDA device. Note that for a CUDA device without a specified index, + , this will return the current default CUDA device if :attr:`optional` is ``True``. + If :attr:`allow_cpu` is ``True``,CPU devices will be accepted and ``-1`` will be + returned in this case. + + If :attr:`device` is a Python integer, it is returned as is. + + If :attr:`device` is ``None``, this will return the current default CUDA + device if :attr:`optional` is ``True``. + """ + if device is None: + if optional: + return get_current_device_index() + else: + raise ValueError('Expected a torch.device with a specified index ' + f'or an integer, but got: {device}') + device_index = -1 + if isinstance(device, str): + device = torch.device(device) + + if isinstance(device, torch.device): + if not allow_cpu and device.type == 'cpu': + raise ValueError(f'Expected a non cpu device, but got: {device}') + device_index = -1 if device.type == 'cpu' else torch.cuda.device_index(device) + + if isinstance(device, int): + device_index = device + + return device_index + +class device(object): + r"""Context-manager that changes the selected device. + This is similar to device (torch.device or int), but has been + introduced for JIT compatibility. + Arguments: + device (torch.device or int): device index to select. It's a no-op if + this argument is a negative integer or ``None``. + """ + def __init__(self, device: Optional[_device]): + self.idx = -1 + self.prev_idx = -1 + self.device = device + + def __enter__(self): + self.idx = get_device_index(self.device, optional=True) + + if self.idx == -1: + return + self.prev_idx = torch.cuda._current_device() + + if self.prev_idx != self.idx: + torch.cuda._set_device(self.idx) + + def __exit__(self, type: Any, value: Any, traceback: Any): + if self.prev_idx != self.idx: + torch.cuda._set_device(self.prev_idx) + +class StreamContext(object): + r"""Context-manager that selects a given stream. + All CUDA kernels queued within its context will be enqueued on a selected + stream. + Arguments: + StreamContext (Stream): selected stream. This manager is a no-op if it's + ``None``. + .. note:: Streams are per-device. If the selected stream is not on the + current device, this function will also change the current device to + match the stream. 
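The `device` context manager above mirrors the eager-mode device guard; the eager equivalent below is shown only to illustrate the intended semantics (save the current device, switch, restore on exit) and is not the scripted form:

    import torch

    if torch.cuda.is_available():
        before = torch.cuda.current_device()
        with torch.cuda.device(0):
            x = torch.ones(4, device='cuda')           # allocated on device 0
        assert torch.cuda.current_device() == before   # previous device restored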
+ """ + cur_stream : Optional['torch.classes.cuda.Stream'] + + def __init__(self, stream: Optional['torch.classes.cuda.Stream']): + self.idx = -1 + self.stream = stream + # Initialize the below streams to default stream on the current device + self.device_index = get_current_device_index() + self.src_prev_stream = torch.cuda.default_stream(self.device_index) + self.dst_prev_stream = torch.cuda.default_stream(self.device_index) + + def __enter__(self): + self.idx = get_device_index(device=None, optional=True) + # If there is no CUDA device available, return + if self.idx == -1: + return + + # Local cur_stream variable for type refinement + cur_stream = self.stream + # Return if stream is None + if cur_stream is None: + return + self.src_prev_stream = torch.cuda.current_stream(self.idx) + # If the stream is not on the current device, then change the device + # and set the current stream on the device + if self.src_prev_stream.device_index() != cur_stream.device_index(): + with device(cur_stream.device()): + self.dst_prev_stream = torch.cuda.current_stream(cur_stream.device_index()) + torch.cuda._set_device(cur_stream.device_index()) + torch.cuda.set_stream(cur_stream) + + def __exit__(self, type: Any, value: Any, traceback: Any): + # Local cur_stream variable for type refinement + cur_stream = self.stream + # If stream is None or no CUDA device available, return + if cur_stream is None or self.idx == -1: + return + # If the stream was not on the current device, restore the previous stream on + # the destination device and also reset the current device to the previous device. + # Set the current stream on the device to the src_prev_stream + if self.src_prev_stream.device_index() != cur_stream.device_index(): + torch.cuda.set_stream(self.dst_prev_stream) + torch.cuda._set_device(self.idx) + torch.cuda.set_stream(self.src_prev_stream) + +def stream(stream: Optional['torch.classes.cuda.Stream']) -> StreamContext: + r"""Wrapper around the Context-manager that selects a given stream. + All CUDA kernels queued within its context will be enqueued on a selected + stream. + Arguments: + stream (Stream): selected stream. This manager is a no-op if it's + ``None``. + """ + return StreamContext(stream) + +def Stream(device: int = -1, priority: int = 0) -> 'torch.classes.cuda.Stream': + r"""Wrapper around a CUDA stream. + A CUDA stream is a linear sequence of execution that belongs to a specific + device, independent from other streams. See :ref:`cuda-semantics` for + details. + Arguments: + device(int, optional): a device on which to allocate + the stream. If :attr:`device` is ``None`` (default) or a negative + integer, this will use the current device. + priority(int, optional): priority of the stream. Can be either + -1 (high priority) or 0 (low priority). By default, streams have + priority 0. + .. note:: Although CUDA versions >= 11 support more than two levels of + priorities, in PyTorch, we only support two levels of priorities. + """ + return torch.classes.cuda.Stream(device, priority) + +def Event(enable_timing: bool = False, blocking: bool = False, interprocess: bool = False) -> 'torch.classes.cuda.Event': + r"""Wrapper around a CUDA event. + CUDA events are synchronization markers that can be used to monitor the + device's progress, to accurately measure timing, and to synchronize CUDA + streams. 
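Likewise, `stream`, `Stream` and `Event` wrap the CUDA stream/event API for use from TorchScript; the eager-mode counterparts below illustrate the behavior the wrappers are modeled on (an eager sketch for comparison, not the scripted usage):

    import torch

    if torch.cuda.is_available():
        s = torch.cuda.Stream()                     # eager counterpart of Stream(...)
        with torch.cuda.stream(s):                  # eager counterpart of stream(s)
            y = torch.randn(256, 256, device='cuda') @ torch.randn(256, 256, device='cuda')
        # Make later work on the default stream wait for the side stream.
        torch.cuda.current_stream().wait_stream(s)
        done = torch.cuda.Event()                   # eager counterpart of Event(...)
        done.record()
        done.synchronize()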
+ Arguments: + enable_timing (bool, optional): indicates if the event should measure time + (default: ``False``) + blocking (bool, optional): if ``True``, :meth:`wait` will be blocking (default: ``False``) + interprocess (bool): if ``True``, the event can be shared between processes + (default: ``False``) + .. _CUDA Event Documentation: + https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EVENT.html + """ + return torch.classes.cuda.Event(enable_timing, blocking, interprocess) diff --git a/torch/lib/c10d/ProcessGroupNCCL.cpp b/torch/lib/c10d/ProcessGroupNCCL.cpp index 01ce71afd388..b9ac5aa77150 100644 --- a/torch/lib/c10d/ProcessGroupNCCL.cpp +++ b/torch/lib/c10d/ProcessGroupNCCL.cpp @@ -1409,7 +1409,13 @@ c10::intrusive_ptr ProcessGroupNCCL::reduce_scatter( c10::intrusive_ptr ProcessGroupNCCL::barrier( const BarrierOptions& opts) { std::vector devices; - if (usedDeviceIdxs_.empty()) { + + // Use user defined GPU device ids if provided + if (!opts.device_ids.empty()) { + for (auto device : opts.device_ids) { + devices.push_back(at::Device(at::DeviceType::CUDA, device)); + } + } else if (usedDeviceIdxs_.empty()) { // This means there is not yet a NCCL collective being called // Here we have to use the best guesses and will use a single GPU to call // allreduce to achieve barrier. diff --git a/torch/lib/c10d/Types.hpp b/torch/lib/c10d/Types.hpp index 03b2e59e4295..a5a0d5fa20df 100644 --- a/torch/lib/c10d/Types.hpp +++ b/torch/lib/c10d/Types.hpp @@ -62,6 +62,7 @@ struct AllToAllOptions { }; struct BarrierOptions { + std::vector device_ids; std::chrono::milliseconds timeout = kUnsetTimeout; }; diff --git a/torch/nn/modules/activation.py b/torch/nn/modules/activation.py index 8a16c8c27808..073c95c28619 100644 --- a/torch/nn/modules/activation.py +++ b/torch/nn/modules/activation.py @@ -365,11 +365,11 @@ class SiLU(Module): \text{silu}(x) = x * \sigma(x), \text{where } \sigma(x) \text{ is the logistic sigmoid.} .. note:: - See `Gaussian Error Linear Units (GELUs) `_ - where the SiLU (Sigmoid Linear Unit) was originally coined, and see - `Sigmoid-Weighted Linear Units for Neural Network Function Approximation - in Reinforcement Learning `_ and `Swish: - a Self-Gated Activation Function `_ + See `Gaussian Error Linear Units (GELUs) `_ + where the SiLU (Sigmoid Linear Unit) was originally coined, and see + `Sigmoid-Weighted Linear Units for Neural Network Function Approximation + in Reinforcement Learning `_ and `Swish: + a Self-Gated Activation Function `_ where the SiLU was experimented with later. Shape: @@ -848,8 +848,9 @@ class MultiheadAttention(Module): kdim: total number of features in key. Default: None. vdim: total number of features in value. Default: None. - Note: if kdim and vdim are None, they will be set to embed_dim such that - query, key, and value have the same number of features. + Note that if :attr:`kdim` and :attr:`vdim` are None, they will be set + to :attr:`embed_dim` such that query, key, and value have the same + number of features. Examples:: @@ -937,8 +938,7 @@ def forward(self, query, key, value, key_padding_mask=None, attn_mask: 2D or 3D mask that prevents attention to certain positions. A 2D mask will be broadcasted for all the batches while a 3D mask allows to specify a different mask for the entries of each batch. - Shape: - - Inputs: + Shapes for inputs: - query: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is the embedding dimension. 
- key: :math:`(S, N, E)`, where S is the source sequence length, N is the batch size, E is @@ -949,15 +949,17 @@ def forward(self, query, key, value, key_padding_mask=None, If a ByteTensor is provided, the non-zero positions will be ignored while the position with the zero positions will be unchanged. If a BoolTensor is provided, the positions with the value of ``True`` will be ignored while the position with the value of ``False`` will be unchanged. - - attn_mask: 2D mask :math:`(L, S)` where L is the target sequence length, S is the source sequence length. - 3D mask :math:`(N*\text{num_heads}, L, S)` where N is the batch size, L is the target sequence length, - S is the source sequence length. attn_mask ensure that position i is allowed to attend the unmasked - positions. If a ByteTensor is provided, the non-zero positions are not allowed to attend + - attn_mask: if a 2D mask: :math:`(L, S)` where L is the target sequence length, S is the + source sequence length. + + If a 3D mask: :math:`(N\cdot\text{num\_heads}, L, S)` where N is the batch size, L is the target sequence + length, S is the source sequence length. ``attn_mask`` ensure that position i is allowed to attend + the unmasked positions. If a ByteTensor is provided, the non-zero positions are not allowed to attend while the zero positions will be unchanged. If a BoolTensor is provided, positions with ``True`` is not allowed to attend while ``False`` values will be unchanged. If a FloatTensor is provided, it will be added to the attention weight. - - Outputs: + Shapes for outputs: - attn_output: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is the embedding dimension. - attn_output_weights: :math:`(N, L, S)` where N is the batch size, diff --git a/torch/nn/modules/flatten.py b/torch/nn/modules/flatten.py index c06b7a5534f6..dd491ba99620 100644 --- a/torch/nn/modules/flatten.py +++ b/torch/nn/modules/flatten.py @@ -2,7 +2,7 @@ from typing import Tuple, Union from torch import Tensor -from torch import Size +from torch.types import _size class Flatten(Module): @@ -53,8 +53,8 @@ class Unflatten(Module): be either `int` or `str` when `Tensor` or `NamedTensor` is used, respectively. * :attr:`unflattened_size` is the new shape of the unflattened dimension of the tensor and it can be - a `tuple` of ints or `torch.Size` for `Tensor` input or a `NamedShape` (tuple of `(name, size)` tuples) - for `NamedTensor` input. + a `tuple` of ints or a `list` of ints or `torch.Size` for `Tensor` input; a `NamedShape` + (tuple of `(name, size)` tuples) for `NamedTensor` input. 
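A small eager-mode example of the two `attn_mask` layouts described in the MultiheadAttention docstring above (the dimensions are chosen arbitrarily for illustration):

    import torch
    import torch.nn as nn

    L, S, N, E, H = 4, 6, 2, 8, 2          # target len, source len, batch, embed dim, heads
    mha = nn.MultiheadAttention(embed_dim=E, num_heads=H)
    q, k, v = torch.randn(L, N, E), torch.randn(S, N, E), torch.randn(S, N, E)

    mask_2d = torch.zeros(L, S, dtype=torch.bool)          # shared across the batch
    mask_3d = torch.zeros(N * H, L, S, dtype=torch.bool)   # one mask per head per batch element

    out, weights = mha(q, k, v, attn_mask=mask_2d)
    out, weights = mha(q, k, v, attn_mask=mask_3d)
    print(out.shape, weights.shape)        # torch.Size([4, 2, 8]) torch.Size([2, 4, 6])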
Shape: - Input: :math:`(N, *dims)` @@ -62,7 +62,7 @@ class Unflatten(Module): Args: dim (Union[int, str]): Dimension to be unflattened - unflattened_size (Union[torch.Size, NamedShape]): New shape of the unflattened dimension + unflattened_size (Union[torch.Size, Tuple, List, NamedShape]): New shape of the unflattened dimension Examples: >>> input = torch.randn(2, 50) @@ -71,7 +71,7 @@ class Unflatten(Module): >>> nn.Linear(50, 50), >>> nn.Unflatten(1, (2, 5, 5)) >>> ) - >>> output = m(output) + >>> output = m(input) >>> output.size() torch.Size([2, 2, 5, 5]) >>> # With torch.Size @@ -79,15 +79,13 @@ class Unflatten(Module): >>> nn.Linear(50, 50), >>> nn.Unflatten(1, torch.Size([2, 5, 5])) >>> ) - >>> output = m(output) + >>> output = m(input) >>> output.size() torch.Size([2, 2, 5, 5]) >>> # With namedshape (tuple of tuples) - >>> m = nn.Sequential( - >>> nn.Linear(50, 50), - >>> nn.Unflatten('features', (('C', 2), ('H', 50), ('W',50))) - >>> ) - >>> output = m(output) + >>> input = torch.randn(2, 50, names=('N', 'features')) + >>> unflatten = nn.Unflatten('features', (('C', 2), ('H', 5), ('W', 5))) + >>> output = unflatten(input) >>> output.size() torch.Size([2, 2, 5, 5]) """ @@ -95,9 +93,9 @@ class Unflatten(Module): __constants__ = ['dim', 'unflattened_size'] dim: Union[int, str] - unflattened_size: Union[Size, NamedShape] + unflattened_size: Union[_size, NamedShape] - def __init__(self, dim: Union[int, str], unflattened_size: Union[Size, NamedShape]) -> None: + def __init__(self, dim: Union[int, str], unflattened_size: Union[_size, NamedShape]) -> None: super(Unflatten, self).__init__() if isinstance(dim, int): @@ -121,7 +119,7 @@ def _require_tuple_tuple(self, input): "but found type {}".format(type(input).__name__)) def _require_tuple_int(self, input): - if (isinstance(input, tuple)): + if (isinstance(input, (tuple, list))): for idx, elem in enumerate(input): if not isinstance(elem, int): raise TypeError("unflattened_size must be tuple of ints, " + diff --git a/torch/nn/quantizable/__init__.py b/torch/nn/quantizable/__init__.py new file mode 100644 index 000000000000..270dcebaa5f4 --- /dev/null +++ b/torch/nn/quantizable/__init__.py @@ -0,0 +1 @@ +from .modules import * diff --git a/torch/nn/quantizable/modules/__init__.py b/torch/nn/quantizable/modules/__init__.py new file mode 100644 index 000000000000..b3480b717a2d --- /dev/null +++ b/torch/nn/quantizable/modules/__init__.py @@ -0,0 +1,7 @@ +from .rnn import LSTM +from .rnn import LSTMCell + +__all__ = [ + 'LSTM', + 'LSTMCell', +] diff --git a/torch/nn/quantizable/modules/rnn.py b/torch/nn/quantizable/modules/rnn.py new file mode 100644 index 000000000000..cfe076fac16c --- /dev/null +++ b/torch/nn/quantizable/modules/rnn.py @@ -0,0 +1,403 @@ +import numbers +from typing import Optional, Tuple +import warnings + +import torch +from torch import Tensor + +""" +We will recreate all the RNN modules as we require the modules to be decomposed +into its building blocks to be able to observe. +""" + +class LSTMCell(torch.nn.Module): + r"""A quantizable long short-term memory (LSTM) cell. 
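With the `_require_tuple_int` change above, `unflattened_size` may now be a plain Python `list` of ints as well as a `tuple` or `torch.Size`; a quick check:

    import torch
    import torch.nn as nn

    m = nn.Sequential(
        nn.Linear(50, 50),
        nn.Unflatten(1, [2, 5, 5]),   # a list of ints is now accepted
    )
    output = m(torch.randn(2, 50))
    print(output.size())              # torch.Size([2, 2, 5, 5])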
+ + For the description and the argument types, please, refer to :class:`~torch.nn.LSTMCell` + + Examples:: + + >>> import torch.nn.quantizable as nnqa + >>> rnn = nnqa.LSTMCell(10, 20) + >>> input = torch.randn(3, 10) + >>> hx = torch.randn(3, 20) + >>> cx = torch.randn(3, 20) + >>> output = [] + >>> for i in range(6): + hx, cx = rnn(input[i], (hx, cx)) + output.append(hx) + """ + _FLOAT_MODULE = torch.nn.LSTMCell + + def __init__(self, input_dim: int, hidden_dim: int, bias: bool = True): + super().__init__() + self.input_size = input_dim + self.hidden_size = hidden_dim + self.bias = bias + + self.igates = torch.nn.Linear(input_dim, 4 * hidden_dim, bias=bias) + self.hgates = torch.nn.Linear(hidden_dim, 4 * hidden_dim, bias=bias) + self.gates = torch.nn.quantized.FloatFunctional() + + self.fgate_cx = torch.nn.quantized.FloatFunctional() + self.igate_cgate = torch.nn.quantized.FloatFunctional() + self.fgate_cx_igate_cgate = torch.nn.quantized.FloatFunctional() + + self.ogate_cy = torch.nn.quantized.FloatFunctional() + + def forward(self, x: Tensor, hidden: Optional[Tuple[Tensor, Tensor]] = None) -> Tuple[Tensor, Tensor]: + if hidden is None or hidden == (None, None): + hidden = self.initialize_hidden(x.shape[0], x.is_quantized) + hx, cx = hidden + + igates = self.igates(x) + hgates = self.hgates(hx) + gates = self.gates.add(igates, hgates) + + input_gate, forget_gate, cell_gate, out_gate = gates.chunk(4, 1) + + input_gate = torch.sigmoid(input_gate) + forget_gate = torch.sigmoid(forget_gate) + cell_gate = torch.tanh(cell_gate) + out_gate = torch.sigmoid(out_gate) + + fgate_cx = self.fgate_cx.mul(forget_gate, cx) + igate_cgate = self.igate_cgate.mul(input_gate, cell_gate) + fgate_cx_igate_cgate = self.fgate_cx_igate_cgate.add(fgate_cx, igate_cgate) + cy = fgate_cx_igate_cgate + + tanh_cy = torch.tanh(cy) + hy = self.ogate_cy.mul(out_gate, tanh_cy) + return hy, cy + + def initialize_hidden(self, batch_size: int, is_quantized: bool = False) -> Tuple[Tensor, Tensor]: + h, c = torch.zeros((batch_size, self.hidden_size)), torch.zeros((batch_size, self.hidden_size)) + if is_quantized: + h = torch.quantize_per_tensor(h, scale=1.0, zero_point=0, dtype=torch.quint8) + c = torch.quantize_per_tensor(c, scale=1.0, zero_point=0, dtype=torch.quint8) + return h, c + + def _get_name(self): + return 'QuantizableLSTMCell' + + @classmethod + def from_params(cls, wi, wh, bi=None, bh=None): + """Uses the weights and biases to create a new LSTM cell. + + Args: + wi, wh: Weights for the input and hidden layers + bi, bh: Biases for the input and hidden layers + """ + assert (bi is None) == (bh is None) # Either both None or both have values + input_size = wi.shape[1] + hidden_size = wh.shape[1] + cell = cls(input_dim=input_size, hidden_dim=hidden_size, + bias=(bi is not None)) + cell.igates.weight = torch.nn.Parameter(wi) + if bi is not None: + cell.igates.bias = torch.nn.Parameter(bi) + cell.hgates.weight = torch.nn.Parameter(wh) + if bh is not None: + cell.hgates.bias = torch.nn.Parameter(bh) + return cell + + @classmethod + def from_float(cls, other): + assert type(other) == cls._FLOAT_MODULE + assert hasattr(other, 'qconfig'), "The float module must have 'qconfig'" + observed = cls.from_params(other.weight_ih, other.weight_hh, + other.bias_ih, other.bias_hh) + observed.qconfig = other.qconfig + observed.igates.qconfig = other.qconfig + observed.hgates.qconfig = other.qconfig + return observed + + +class _LSTMSingleLayer(torch.nn.Module): + r"""A single one-directional LSTM layer. 
+ + The difference between a layer and a cell is that the layer can process a + sequence, while the cell only expects an instantaneous value. + """ + def __init__(self, input_dim: int, hidden_dim: int, bias: bool = True): + super().__init__() + self.cell = LSTMCell(input_dim, hidden_dim, bias=bias) + + def forward(self, x: Tensor, hidden: Optional[Tuple[Tensor, Tensor]] = None): + result = [] + for xx in x: + hidden = self.cell(xx, hidden) + result.append(hidden[0]) # type: ignore + result_tensor = torch.stack(result, 0) + return result_tensor, hidden + + @classmethod + def from_params(cls, *args, **kwargs): + cell = LSTMCell.from_params(*args, **kwargs) + layer = cls(cell.input_size, cell.hidden_size, cell.bias) + layer.cell = cell + return layer + + +class _LSTMLayer(torch.nn.Module): + r"""A single bi-directional LSTM layer.""" + def __init__(self, input_dim: int, hidden_dim: int, bias: bool = True, + batch_first: bool = False, bidirectional: bool = False): + super().__init__() + self.batch_first = batch_first + self.bidirectional = bidirectional + self.layer_fw = _LSTMSingleLayer(input_dim, hidden_dim, bias=bias) + if self.bidirectional: + self.layer_bw = _LSTMSingleLayer(input_dim, hidden_dim, bias=bias) + + def forward(self, x: Tensor, hidden: Optional[Tuple[Tensor, Tensor]] = None): + if self.batch_first: + x = x.transpose(0, 1) + if hidden is None: + hx_fw, cx_fw = (None, None) + else: + hx_fw, cx_fw = hidden + if self.bidirectional: + if hx_fw is None: + hx_bw = None + else: + hx_bw = hx_fw[1] + hx_fw = hx_fw[0] + if cx_fw is None: + cx_bw = None + else: + cx_bw = cx_fw[1] + cx_fw = cx_fw[0] + hidden_bw = hx_bw, cx_bw + hidden_fw = hx_fw, cx_fw + result_fw, hidden_fw = self.layer_fw(x, hidden_fw) + + if self.bidirectional: + x_reversed = x.flip(0) + result_bw, hidden_bw = self.layer_bw(x_reversed, hidden_bw) + result_bw = result_bw.flip(0) + + result = torch.cat([result_fw, result_bw], result_fw.dim() - 1) + h = torch.stack([hidden_fw[0], hidden_bw[0]], 0) # type: ignore + c = torch.stack([hidden_fw[1], hidden_bw[1]], 0) # type: ignore + else: + result = result_fw + h, c = hidden_fw # type: ignore + + if self.batch_first: + result.transpose_(0, 1) + + return result, (h, c) + + @classmethod + def from_float(cls, other, layer_idx=0, qconfig=None, **kwargs): + r""" + There is no FP equivalent of this class. This function is here just to + mimic the behavior of the `prepare` within the `torch.quantization` + flow. 
+ """ + assert hasattr(other, 'qconfig') or (qconfig is not None) + + input_size = kwargs.get('input_size', other.input_size) + hidden_size = kwargs.get('hidden_size', other.hidden_size) + bias = kwargs.get('bias', other.bias) + batch_first = kwargs.get('batch_first', other.batch_first) + bidirectional = kwargs.get('bidirectional', other.bidirectional) + + layer = cls(input_size, hidden_size, bias, batch_first, bidirectional) + layer.qconfig = getattr(other, 'qconfig', qconfig) + wi = getattr(other, f'weight_ih_l{layer_idx}') + wh = getattr(other, f'weight_hh_l{layer_idx}') + bi = getattr(other, f'bias_ih_l{layer_idx}', None) + bh = getattr(other, f'bias_hh_l{layer_idx}', None) + + layer.layer_fw = _LSTMSingleLayer.from_params(wi, wh, bi, bh) + + if other.bidirectional: + wi = getattr(other, f'weight_ih_l{layer_idx}_reverse') + wh = getattr(other, f'weight_hh_l{layer_idx}_reverse') + bi = getattr(other, f'bias_ih_l{layer_idx}_reverse', None) + bh = getattr(other, f'bias_hh_l{layer_idx}_reverse', None) + layer.layer_bw = _LSTMSingleLayer.from_params(wi, wh, bi, bh) + return layer + + # Getters for the weights and biases + # Note that jit currently doesn't support the `porperty`, so if you need to + # access the weights/biases you would need to navigate manually to the + # `layer_fw.cell.igates.*`: https://github.com/pytorch/pytorch/issues/37883 + @property + def weight_ih(self): + return self.layer_fw.cell.igates.weight + + @property + def weight_hh(self): + return self.layer_fw.cell.hgates.weight + + @property + def bias_ih(self): + return self.layer_fw.cell.igates.bias + + @property + def bias_hh(self): + return self.layer_fw.cell.hgates.bias + + @property + def weight_ih_reverse(self): + assert self.bidirectional, 'There is no reverse path in the non-bidirectional layer' + return self.layer_bw.cell.igates.weight + + @property + def weight_hh_reverse(self): + assert self.bidirectional, 'There is no reverse path in the non-bidirectional layer' + return self.layer_bw.cell.hgates.weight + + @property + def bias_ih_reverse(self): + assert self.bidirectional, 'There is no reverse path in the non-bidirectional layer' + return self.layer_bw.cell.igates.bias + + @property + def bias_hh_reverse(self): + assert self.bidirectional, 'There is no reverse path in the non-bidirectional layer' + return self.layer_bw.cell.hgates.bias + + +class LSTM(torch.nn.Module): + r"""A quantizable long short-term memory (LSTM). + + For the description and the argument types, please, refer to :class:`~torch.nn.LSTM` + + Attributes: + layers : instances of the `_LSTMLayer` + + .. note:: + To access the weights and biases, you need to access them per layer. + See examples below. 
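As a rough sketch of how the `from_float` conversion in this file can be exercised directly on a float `nn.LSTM` (assuming a module-level qconfig; in practice the conversion is normally driven by the `torch.quantization` custom-module machinery rather than called by hand):

    import torch
    import torch.nn.quantizable as nnqa

    float_lstm = torch.nn.LSTM(input_size=10, hidden_size=20, num_layers=1)
    float_lstm.qconfig = torch.quantization.default_qconfig

    observed = nnqa.LSTM.from_float(float_lstm)   # decomposed LSTM with observers attached
    x = torch.randn(5, 3, 10)                     # (seq_len, batch, input_size)
    out, (h, c) = observed(x)
    print(out.shape, h.shape, c.shape)            # (5, 3, 20) (1, 3, 20) (1, 3, 20)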
+ + Examples:: + + >>> import torch.nn.quantizable as nnqa + >>> rnn = nnqa.LSTM(10, 20, 2) + >>> input = torch.randn(5, 3, 10) + >>> h0 = torch.randn(2, 3, 20) + >>> c0 = torch.randn(2, 3, 20) + >>> output, (hn, cn) = rnn(input, (h0, c0)) + >>> # To get the weights: + >>> print(rnn.layers[0].weight_ih) + tensor([[...]]) + >>> print(rnn.layers[0].weight_hh) + AssertionError: There is no reverse path in the non-bidirectional layer + """ + _FLOAT_MODULE = torch.nn.LSTM + + def __init__(self, input_size: int, hidden_size: int, + num_layers: int = 1, bias: bool = True, + batch_first: bool = False, dropout: float = 0., + bidirectional: bool = False): + super().__init__() + self.input_size = input_size + self.hidden_size = hidden_size + self.num_layers = num_layers + self.bias = bias + self.batch_first = batch_first + self.dropout = float(dropout) + self.bidirectional = bidirectional + self.training = False # We don't want to train using this module + num_directions = 2 if bidirectional else 1 + + if not isinstance(dropout, numbers.Number) or not 0 <= dropout <= 1 or \ + isinstance(dropout, bool): + raise ValueError("dropout should be a number in range [0, 1] " + "representing the probability of an element being " + "zeroed") + if dropout > 0: + warnings.warn("dropout option for quantizable LSTM is ignored. " + "If you are training, please, use nn.LSTM version " + "followed by `prepare` step.") + if num_layers == 1: + warnings.warn("dropout option adds dropout after all but last " + "recurrent layer, so non-zero dropout expects " + "num_layers greater than 1, but got dropout={} " + "and num_layers={}".format(dropout, num_layers)) + + layers = [_LSTMLayer(self.input_size, self.hidden_size, + self.bias, batch_first=False, + bidirectional=self.bidirectional)] + for layer in range(1, num_layers): + layers.append(_LSTMLayer(self.hidden_size, self.hidden_size, + self.bias, batch_first=False, + bidirectional=self.bidirectional)) + self.layers = torch.nn.ModuleList(layers) + + def forward(self, x: Tensor, hidden: Optional[Tuple[Tensor, Tensor]] = None): + if self.batch_first: + x = x.transpose(0, 1) + + max_batch_size = x.size(1) + num_directions = 2 if self.bidirectional else 1 + if hidden is None: + zeros = torch.zeros(num_directions, max_batch_size, + self.hidden_size, dtype=torch.float, + device=x.device) + zeros.squeeze_(0) + if x.is_quantized: + zeros = torch.quantize_per_tensor(zeros, scale=1.0, + zero_point=0, dtype=x.dtype) + hxcx = [(zeros, zeros) for _ in range(self.num_layers)] + else: + hidden_non_opt = torch.jit._unwrap_optional(hidden) + if isinstance(hidden_non_opt[0], Tensor): + hx = hidden_non_opt[0].reshape(self.num_layers, num_directions, + max_batch_size, + self.hidden_size).unbind(0) + cx = hidden_non_opt[1].reshape(self.num_layers, num_directions, + max_batch_size, + self.hidden_size).unbind(0) + hxcx = [] + for idx in range(self.num_layers): + hxcx.append((hx[idx].squeeze_(0), cx[idx].squeeze_(0))) + else: + hxcx = hidden_non_opt + + for idx in range(self.num_layers): + x, hxcx[idx] = self.layers[idx](x, hxcx[idx]) + + hx_list = [] + cx_list = [] + for idx in range(self.num_layers): + hx_list.append(hxcx[idx][0]) + cx_list.append(hxcx[idx][1]) + hx_tensor = torch.stack(hx_list) + cx_tensor = torch.stack(cx_list) + + # We are creating another dimension for bidirectional case + # need to collapse it + hx_tensor = hx_tensor.reshape(-1, *hx_tensor.shape[-2:]) + cx_tensor = cx_tensor.reshape(-1, *cx_tensor.shape[-2:]) + + if self.batch_first: + x = x.transpose(0, 1) + + return x, 
(hx_tensor, cx_tensor) + + def _get_name(self): + return 'QuantizableLSTM' + + @classmethod + def from_float(cls, other, qconfig=None): + assert isinstance(other, cls._FLOAT_MODULE) + assert (hasattr(other, 'qconfig') or qconfig) + observed = cls(other.input_size, other.hidden_size, other.num_layers, + other.bias, other.batch_first, other.dropout, + other.bidirectional) + observed.qconfig = getattr(other, 'qconfig', qconfig) + for idx in range(other.num_layers): + observed.layers[idx] = _LSTMLayer.from_float(other, idx, qconfig, + batch_first=False) + observed.eval() + observed = torch.quantization.prepare(observed, inplace=True) + return observed + + def from_observed(self, other): + return torch.quantization.convert(self, inplace=False, + remove_qconfig=True) diff --git a/torch/onnx/symbolic_opset9.py b/torch/onnx/symbolic_opset9.py index 14ebcfcd8a6c..a9edb9ca32ed 100644 --- a/torch/onnx/symbolic_opset9.py +++ b/torch/onnx/symbolic_opset9.py @@ -2315,6 +2315,9 @@ def log2(g, self): def prim_shape(g, self): return g.op('Shape', self) +def prim_max(g, self, other): + return g.op('Max', self, other) + def prim_data(g, self): return self @@ -2365,14 +2368,16 @@ def gather(g, self, dim, index, sparse_grad=False): def _var_mean(g, input, dim, unbiased, keepdim): if dim is None: mean = g.op("ReduceMean", input, keepdims_i=0) + t_mean = mean num_elements = numel(g, input) else: mean = g.op("ReduceMean", input, axes_i=dim, keepdims_i=keepdim) + t_mean = g.op("ReduceMean", input, axes_i=dim, keepdims_i=1) redudced_dims = g.op("Shape", input) # dim could contain one or multiple dimensions redudced_dims = g.op("Gather", redudced_dims, g.op("Constant", value_t=torch.tensor(dim)), axis_i=0) num_elements = g.op("ReduceProd", redudced_dims, keepdims_i=0) - sub_v = g.op("Sub", input, mean) + sub_v = g.op("Sub", input, t_mean) sqr_sub = g.op("Mul", sub_v, sub_v) keepdim_mean = 0 if dim is None else keepdim var = g.op("ReduceMean", sqr_sub, axes_i=dim, keepdims_i=keepdim_mean) diff --git a/torch/quantization/_numeric_suite_fx.py b/torch/quantization/_numeric_suite_fx.py index eb1596832c4d..aeba95bb4e8f 100644 --- a/torch/quantization/_numeric_suite_fx.py +++ b/torch/quantization/_numeric_suite_fx.py @@ -21,7 +21,7 @@ def remove_qconfig_observer_fx(model): # remove activation post process act_post_process_removed_graph = Graph() - env = {} # type: Dict[str, Any] + env: Dict[str, Any] = {} modules = dict(model.named_modules()) diff --git a/torch/quantization/fake_quantize.py b/torch/quantization/fake_quantize.py index f0ee8453557d..460b1c277a93 100644 --- a/torch/quantization/fake_quantize.py +++ b/torch/quantization/fake_quantize.py @@ -41,8 +41,7 @@ def calculate_qparams(self, **kwargs): pass @torch.jit.export - def enable_fake_quant(self, enabled=True): - # type: (bool) -> None + def enable_fake_quant(self, enabled: bool = True) -> None: self.fake_quant_enabled[0] = 1 if enabled else 0 @torch.jit.export @@ -50,8 +49,7 @@ def disable_fake_quant(self): self.enable_fake_quant(False) @torch.jit.export - def enable_observer(self, enabled=True): - # type: (bool) -> None + def enable_observer(self, enabled: bool = True) -> None: self.observer_enabled[0] = 1 if enabled else 0 @torch.jit.export diff --git a/torch/quantization/observer.py b/torch/quantization/observer.py index 32d07c939695..2cc579f66087 100644 --- a/torch/quantization/observer.py +++ b/torch/quantization/observer.py @@ -390,6 +390,8 @@ def __init__(self, dtype=torch.quint8, qscheme=torch.per_tensor_affine, def forward(self, x_orig): r"""Records 
the running minimum and maximum of ``x``.""" + if x_orig.numel() == 0: + return x_orig x = x_orig.detach() # avoid keeping autograd tape x = x.to(self.min_val.dtype) min_val_cur, max_val_cur = torch._aminmax(x) @@ -463,6 +465,8 @@ def __init__(self, averaging_constant=0.01, dtype=torch.quint8, quant_max=quant_max) def forward(self, x_orig): + if x_orig.numel() == 0: + return x_orig x = x_orig.detach() # avoid keeping autograd tape x = x.to(self.min_val.dtype) min_val = self.min_val @@ -532,6 +536,8 @@ def forward(self, x_orig): return self._forward(x_orig) def _forward(self, x_orig): + if x_orig.numel() == 0: + return x_orig x = x_orig.detach() # avoid keeping autograd tape min_vals = self.min_vals max_vals = self.max_vals @@ -638,6 +644,8 @@ def __init__(self, averaging_constant=0.01, ch_axis=0, dtype=torch.quint8, self.averaging_constant = averaging_constant def forward(self, x_orig): + if x_orig.numel() == 0: + return x_orig x = x_orig.detach() # avoid keeping autograd tape x = x.to(self.min_vals.dtype) min_vals = self.min_vals @@ -877,8 +885,9 @@ def _combine_histograms(self, orig_hist = orig_hist + interpolated_histogram.to(torch.float) return orig_hist - def forward(self, x_orig): - # type: (torch.Tensor) -> torch.Tensor + def forward(self, x_orig: torch.Tensor) -> torch.Tensor: + if x_orig.numel() == 0: + return x_orig x = x_orig.detach() min_val = self.min_val max_val = self.max_val diff --git a/torch/quantization/qconfig.py b/torch/quantization/qconfig.py index 8da4ad6bb182..2d91d8ab6b3e 100644 --- a/torch/quantization/qconfig.py +++ b/torch/quantization/qconfig.py @@ -3,6 +3,8 @@ from .fake_quantize import * import torch.nn as nn +from typing import Union + class QConfig(namedtuple('QConfig', ['activation', 'weight'])): """ Describes how to quantize a layer or a part of the network by providing @@ -109,3 +111,18 @@ def get_default_qat_qconfig(backend='fbgemm'): else: qconfig = default_qat_qconfig return qconfig + +def assert_valid_qconfig(qconfig: Union[QConfig, QConfigDynamic], + mod: torch.nn.Module) -> None: + is_conv_transpose_mod = ( + isinstance(mod, torch.nn.ConvTranspose1d) or + isinstance(mod, torch.nn.ConvTranspose2d) or + isinstance(mod, torch.nn.ConvTranspose3d)) + if is_conv_transpose_mod: + example_observer = qconfig.weight() + is_per_channel = ( + isinstance(example_observer, torch.quantization.PerChannelMinMaxObserver) or + isinstance(example_observer, torch.quantization.MovingAveragePerChannelMinMaxObserver) + ) + assert not is_per_channel, \ + 'Per channel weight observer is not supported yet for ConvTranspose{n}d.' diff --git a/torch/quantization/quantize.py b/torch/quantization/quantize.py index a9417ecb80f3..77752a8af9c9 100644 --- a/torch/quantization/quantize.py +++ b/torch/quantization/quantize.py @@ -5,6 +5,7 @@ import torch import torch.nn as nn import torch.nn.quantized as nnq +import torch.nn.quantizable as nnqa from torch.nn.intrinsic import _FusedModule from .quantization_mappings import ( @@ -49,6 +50,8 @@ def _propagate_qconfig_helper(module, qconfig_dict, allow_list=None, module_qconfig = qconfig_dict.get(prefix, module_qconfig) module_qconfig = getattr(module, 'qconfig', module_qconfig) + torch.quantization.qconfig.assert_valid_qconfig(module_qconfig, module) + module.qconfig = module_qconfig for name, child in module.named_children(): module_prefix = prefix + '.' 
+ name if prefix else name @@ -152,7 +155,10 @@ def insert_activation_post_process(m, special_act_post_process=None): elif needs_observation(child) and type(child) in custom_module_class_mapping: observed_child = custom_module_class_mapping[type(child)].from_float(child) setattr(module, name, observed_child) - insert_activation_post_process(observed_child) + # TODO: These are the modules that cannot be observed + # Once there are more, we should move them to a separate list + if custom_module_class_mapping[type(child)] != nnqa.LSTM: + insert_activation_post_process(observed_child) else: add_observer_(child, qconfig_propagation_list, non_leaf_module_list, device, custom_module_class_mapping) @@ -252,9 +258,12 @@ def _remove_activation_post_process(module): delattr(module, 'activation_post_process') # remove activation_post_proceess hook + handle_ids_to_remove = set() for handle_id, hook_fn in module._forward_hooks.items(): if hook_fn is _observer_forward_hook: - module._forward_hooks.pop(handle_id) + handle_ids_to_remove.add(handle_id) + for handle_id in handle_ids_to_remove: + module._forward_hooks.pop(handle_id) # TODO: rename to something more general def _remove_qconfig(module): diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 87d0baa895e8..119750396f1e 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -340,6 +340,77 @@ def sample_inputs_broadcast_to(op_info, device, dtype, requires_grad): requires_grad=requires_grad), shape)) for size, shape in test_cases) +def sample_inputs_stack(op_info, device, dtype, requires_grad): + return (SampleInput((make_tensor((S, S), device, dtype, + low=None, high=None, + requires_grad=requires_grad), + make_tensor((S, S), device, dtype, + low=None, high=None, + requires_grad=requires_grad), + make_tensor((S, S), device, dtype, + low=None, high=None, + requires_grad=requires_grad)), kwargs=dict(idx=0)),) + +def sample_inputs_hstack_dstack_vstack(op_info, device, dtype, requires_grad): + return (SampleInput((make_tensor((S, S), device, dtype, + low=None, high=None, + requires_grad=requires_grad), + make_tensor((S, S), device, dtype, + low=None, high=None, + requires_grad=requires_grad), + make_tensor((S, S), device, dtype, + low=None, high=None, + requires_grad=requires_grad))),) + +def sample_inputs_gather(op_info, device, dtype, requires_grad): + return (SampleInput((make_tensor((M, S), device, dtype, + low=None, high=None, + requires_grad=requires_grad), + 0, gather_variable((S, S), 1, M, True, device=device))), + SampleInput((make_tensor((M, S), device, dtype, + low=None, high=None, + requires_grad=requires_grad), + 1, gather_variable((M, S // 2), 0, S, True, device=device))), + SampleInput((make_tensor((), device, dtype, + low=None, high=None, + requires_grad=requires_grad), + 0, torch.tensor([0], dtype=torch.int64, device=device))), + SampleInput((make_tensor((S,), device, dtype, + low=None, high=None, + requires_grad=requires_grad), + 0, torch.tensor(0, dtype=torch.int64, device=device))), + SampleInput((make_tensor((), device, dtype, + low=None, high=None, + requires_grad=requires_grad), + 0, torch.tensor(0, dtype=torch.int64, device=device))), + ) + + +def sample_inputs_index_select(op_info, device, dtype, requires_grad): + return (SampleInput((make_tensor((S, S, S), device, dtype, + low=None, high=None, + requires_grad=requires_grad), + 0, index_variable(2, S, device=device))), + 
SampleInput((make_tensor((), device, dtype, + low=None, high=None, + requires_grad=requires_grad), + 0, torch.tensor([0], dtype=torch.int64, device=device))), + SampleInput((make_tensor((), device, dtype, + low=None, high=None, + requires_grad=requires_grad), + 0, torch.tensor(0, dtype=torch.int64, device=device))), + ) + +def sample_movedim_moveaxis(op_info, device, dtype, requires_grad): + return (SampleInput((make_tensor((4, 3, 2, 1), device, dtype, + low=None, high=None, + requires_grad=requires_grad), + (0, 1, 2, 3), (3, 2, 1, 0))), + SampleInput((make_tensor((4, 3, 2, 1), device, dtype, + low=None, high=None, + requires_grad=requires_grad), + (0, -1, -2, -3), (-3, -2, -1, -0)))) + def np_unary_ufunc_integer_promotion_wrapper(fn): # Wrapper that passes PyTorch's default scalar # type as an argument to the wrapped NumPy @@ -546,6 +617,30 @@ def sample_inputs_pinverse(op_info, device, dtype, requires_grad=False): return out +def sample_inputs_flip(op_info, device, dtype, requires_grad): + tensors = ( + make_tensor((S, M, S), device, dtype, low=None, high=None, requires_grad=requires_grad), + make_tensor((S, 0, M), device, dtype, low=None, high=None, requires_grad=requires_grad) + ) + + dims = ((0, 1, 2), (0,), (0, 2), (-1,)) + + # On CUDA, `dims=()` errors out with IndexError + # Reference: https://github.com/pytorch/pytorch/issues/49982 + if device == 'cpu': + dims = dims + ((),) # type: ignore + + samples = [SampleInput(tensor, kwargs={'dims': dim}) for tensor, dim in product(tensors, dims)] + + return samples + +def sample_inputs_fliplr_flipud(op_info, device, dtype, requires_grad): + tensors = ( + make_tensor((S, M, S), device, dtype, low=None, high=None, requires_grad=requires_grad), + make_tensor((S, 0, M), device, dtype, low=None, high=None, requires_grad=requires_grad) + ) + return [SampleInput(tensor) for tensor in tensors] + # Operator database (sorted alphabetically) op_db: List[OpInfo] = [ # NOTE: CPU complex acos produces incorrect outputs (https://github.com/pytorch/pytorch/issues/42952) @@ -717,7 +812,7 @@ def sample_inputs_pinverse(op_info, device, dtype, requires_grad=False): ndimensional=False, dtypes=all_types_and_complex_and(torch.bool), default_test_dtypes=floating_and_complex_types(), - supports_tensor_out=False, + supports_tensor_out=True, test_inplace_grad=False,), SpectralFuncInfo('fft.fftn', aten_name='fft_fftn', @@ -725,7 +820,7 @@ def sample_inputs_pinverse(op_info, device, dtype, requires_grad=False): ndimensional=True, dtypes=all_types_and_complex_and(torch.bool), default_test_dtypes=floating_and_complex_types(), - supports_tensor_out=False, + supports_tensor_out=True, test_inplace_grad=False, decorators=[precisionOverride( {torch.float: 1e-4, torch.cfloat: 1e-4})],), @@ -735,7 +830,7 @@ def sample_inputs_pinverse(op_info, device, dtype, requires_grad=False): ndimensional=False, dtypes=all_types_and_complex_and(torch.bool), default_test_dtypes=floating_and_complex_types(), - supports_tensor_out=False, + supports_tensor_out=True, test_inplace_grad=False,), SpectralFuncInfo('fft.rfft', aten_name='fft_rfft', @@ -743,7 +838,7 @@ def sample_inputs_pinverse(op_info, device, dtype, requires_grad=False): ndimensional=False, dtypes=all_types_and(torch.bool), default_test_dtypes=floating_and_complex_types(), - supports_tensor_out=False, + supports_tensor_out=True, test_inplace_grad=False,), SpectralFuncInfo('fft.rfftn', aten_name='fft_rfftn', @@ -751,7 +846,7 @@ def sample_inputs_pinverse(op_info, device, dtype, requires_grad=False): ndimensional=True, 
dtypes=all_types_and(torch.bool), default_test_dtypes=floating_and_complex_types(), - supports_tensor_out=False, + supports_tensor_out=True, test_inplace_grad=False, decorators=[precisionOverride({torch.float: 1e-4})],), SpectralFuncInfo('fft.ifft', @@ -760,7 +855,7 @@ def sample_inputs_pinverse(op_info, device, dtype, requires_grad=False): ndimensional=False, dtypes=all_types_and_complex_and(torch.bool), default_test_dtypes=floating_and_complex_types(), - supports_tensor_out=False, + supports_tensor_out=True, test_inplace_grad=False,), SpectralFuncInfo('fft.ifftn', aten_name='fft_ifftn', @@ -768,7 +863,7 @@ def sample_inputs_pinverse(op_info, device, dtype, requires_grad=False): ndimensional=True, dtypes=all_types_and_complex_and(torch.bool), default_test_dtypes=floating_and_complex_types(), - supports_tensor_out=False, + supports_tensor_out=True, test_inplace_grad=False,), SpectralFuncInfo('fft.ihfft', aten_name='fft_ihfft', @@ -776,7 +871,7 @@ def sample_inputs_pinverse(op_info, device, dtype, requires_grad=False): ndimensional=False, dtypes=all_types_and(torch.bool), default_test_dtypes=floating_types(), - supports_tensor_out=False, + supports_tensor_out=True, test_inplace_grad=False,), SpectralFuncInfo('fft.irfft', aten_name='fft_irfft', @@ -784,7 +879,7 @@ def sample_inputs_pinverse(op_info, device, dtype, requires_grad=False): ndimensional=False, dtypes=all_types_and_complex_and(torch.bool), default_test_dtypes=floating_and_complex_types(), - supports_tensor_out=False, + supports_tensor_out=True, test_inplace_grad=False,), SpectralFuncInfo('fft.irfftn', aten_name='fft_irfftn', @@ -792,8 +887,26 @@ def sample_inputs_pinverse(op_info, device, dtype, requires_grad=False): ndimensional=True, dtypes=all_types_and_complex_and(torch.bool), default_test_dtypes=floating_and_complex_types(), - supports_tensor_out=False, + supports_tensor_out=True, test_inplace_grad=False,), + OpInfo('flip', + op=torch.flip, + dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16), + sample_inputs_func=sample_inputs_flip, + test_inplace_grad=False, + supports_tensor_out=False), + OpInfo('fliplr', + op=torch.fliplr, + dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16), + sample_inputs_func=sample_inputs_fliplr_flipud, + test_inplace_grad=False, + supports_tensor_out=False), + OpInfo('flipud', + op=torch.flipud, + dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16), + sample_inputs_func=sample_inputs_fliplr_flipud, + test_inplace_grad=False, + supports_tensor_out=False), UnaryUfuncInfo('log', ref=np.log, domain=(0, float('inf')), @@ -1001,6 +1114,16 @@ def sample_inputs_pinverse(op_info, device, dtype, requires_grad=False): SkipInfo('TestUnaryUfuncs', 'test_reference_numerics', dtypes=[torch.bfloat16]), )), + UnaryUfuncInfo('rsqrt', + ref=lambda x: np.reciprocal(np.sqrt(x)), + domain=(0, float('inf')), + dtypes=all_types_and_complex_and(torch.bool), + dtypesIfCPU=all_types_and_complex_and(torch.bool), + dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.half), + decorators=(precisionOverride({torch.half: 5e-2}),), + promotes_integers_to_float=True, + assert_autodiffed=True, + handles_complex_extremals=False), UnaryUfuncInfo('sqrt', ref=np.sqrt, domain=(0, float('inf')), @@ -1059,6 +1182,75 @@ def sample_inputs_pinverse(op_info, device, dtype, requires_grad=False): supports_tensor_out=False, sample_inputs_func=sample_inputs_pinverse, decorators=[skipCUDAIfNoMagma, skipCPUIfNoLapack]), + OpInfo('gather', + 
dtypes=all_types_and_complex_and(torch.bool, torch.float16), + dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), + test_inplace_grad=False, + sample_inputs_func=sample_inputs_gather), + OpInfo('index_select', + dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), + test_inplace_grad=False, + skips=( + # https://github.com/pytorch/pytorch/issues/49707 + SkipInfo('TestCommon', 'test_variant_consistency_eager', + dtypes=[torch.float16, torch.bfloat16]), + SkipInfo('TestCommon', 'test_variant_consistency_jit', dtypes=[torch.float16, torch.bfloat16]), + ), + sample_inputs_func=sample_inputs_index_select), + OpInfo('stack', + # gradcheck expects the input arguments as a flat list + op=lambda *args, idx: torch.stack([*args], idx), + dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), + test_inplace_grad=False, + supports_tensor_out=False, + skips=( + SkipInfo('TestCommon', 'test_variant_consistency_jit', + dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16)), + ), + sample_inputs_func=sample_inputs_stack), + OpInfo('hstack', + # gradcheck expects the input arguments as a flat list + op=lambda *args: torch.hstack([*args]), + dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), + test_inplace_grad=False, + supports_tensor_out=False, + skips=( + SkipInfo('TestCommon', 'test_variant_consistency_jit', + dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16)), + ), + sample_inputs_func=sample_inputs_hstack_dstack_vstack), + OpInfo('vstack', + # gradcheck expects the input arguments as a flat list + op=lambda *args: torch.vstack([*args]), + dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), + test_inplace_grad=False, + supports_tensor_out=False, + skips=( + SkipInfo('TestCommon', 'test_variant_consistency_jit', + dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16)), + ), + sample_inputs_func=sample_inputs_hstack_dstack_vstack), + OpInfo('dstack', + # gradcheck expects the input arguments as a flat list + op=lambda *args: torch.dstack([*args]), + dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), + test_inplace_grad=False, + supports_tensor_out=False, + skips=( + SkipInfo('TestCommon', 'test_variant_consistency_jit', + dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16)), + ), + sample_inputs_func=sample_inputs_hstack_dstack_vstack), + OpInfo('movedim', + dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), + test_inplace_grad=False, + supports_tensor_out=False, + sample_inputs_func=sample_movedim_moveaxis), + OpInfo('moveaxis', + dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), + test_inplace_grad=False, + supports_tensor_out=False, + sample_inputs_func=sample_movedim_moveaxis), ] if TEST_SCIPY: @@ -1161,10 +1353,10 @@ def reference_sigmoid(x): spectral_funcs = [op for op in op_db if isinstance(op, SpectralFuncInfo)] sparse_unary_ufuncs = [op for op in op_db if isinstance(op, UnaryUfuncInfo) and op.supports_sparse is True] -def index_variable(shape, max_indices): +def index_variable(shape, max_indices, device=torch.device('cpu')): if not isinstance(shape, tuple): shape = (shape,) - index = torch.rand(*shape).mul_(max_indices).floor_().long() + index = torch.rand(*shape, device=device).mul_(max_indices).floor_().long() return index @@ -1176,14 +1368,14 @@ def index_perm_variable(shape, max_indices): return 
index -def gather_variable(shape, index_dim, max_indices, duplicate=False): +def gather_variable(shape, index_dim, max_indices, duplicate=False, device=torch.device('cpu')): assert len(shape) == 2 assert index_dim < 2 batch_dim = 1 - index_dim - index = torch.LongTensor(*shape) + index = torch.zeros(*shape, dtype=torch.long, device=device) for i in range(shape[index_dim]): index.select(index_dim, i).copy_( - torch.randperm(max_indices)[:shape[batch_dim]]) + torch.randperm(max_indices, device=device)[:shape[batch_dim]]) if duplicate: index.select(batch_dim, 0).copy_(index.select(batch_dim, 1)) return index @@ -1387,13 +1579,6 @@ def method_tests(): ('reshape_as', (S, S, S), (non_differentiable(torch.rand(S * S, S)),)), ('reshape_as', (), (non_differentiable(torch.tensor(42.)),), 'scalar'), ('reshape_as', (), (non_differentiable(torch.rand(1, 1)),), 'scalar_to_dims'), - ('flip', (S, S, S), ([0],), 'd0'), - ('flip', (S, S, S), ([0, 1, 2],), 'd012'), - ('flip', (S, S, S), ([0, 2],), 'd02'), - ('flip', (S, S, S), ([2, 0],), 'd20'), - ('flip', (S, S, S), ([-1],), 'neg_d'), - ('fliplr', (S, S, S), ()), - ('flipud', (S, S, S), ()), ('roll', (S, S, S), (0, 0), 'd0'), ('roll', (S, S, S), (1, 2), 'd12'), ('roll', (S, S, S), (0, 2,), 'd02'), @@ -1466,6 +1651,10 @@ def method_tests(): ('ceil', (), NO_ARGS, 'scalar', (True,)), ('rad2deg', (S, S, S), NO_ARGS), ('deg2rad', (S, S, S), NO_ARGS), + # Removing the 'rsqrt' entries leads to failure in + # test_index_fill_variable_dim_* + # TODO: Remove when fixed. + # Reference: https://github.com/pytorch/pytorch/issues/48230 ('rsqrt', torch.rand(S, S, S) + 1e-2, NO_ARGS, '', (True,)), ('rsqrt', uniform_scalar(1e-2, requires_grad=True), NO_ARGS, 'scalar', (True,)), ('rsqrt', torch.rand(S, S, S, dtype=torch.cfloat) + 1e-2, NO_ARGS, 'complex', (True,)), @@ -1865,10 +2054,10 @@ def method_tests(): ('diagonal', (M, M, M), (1, 1, 2), '3d_1'), ('diagonal', (M, M, M), (2, 0, 1), '3d_2'), ('diagonal', (M, M, M), (-2, 0, 1), '3d_3'), - ('tile', (S, S, S), ([S, S, S, S],), 'more_reps_dims', (False,)), - ('tile', (S, S, S), ([S, S, S],), 'same_reps_dims', (False,)), - ('tile', (S, S, S), ([S, M],), 'less_reps_dims', (False,)), - ('tile', (S, S, S), ([S, S, 0],), 'zero_rep_dim', (False,)), + ('tile', (2, 2), ([2, 2, 2],), 'more_reps_dims', (False,)), + ('tile', (2, 2), ([2, 2],), 'same_reps_dims', (False,)), + ('tile', (2, 2), ([2, 3],), 'less_reps_dims', (False,)), + ('tile', (2, 2, 2), ([2, 2, 0],), 'zero_rep_dim', (False,)), ('tile', (), ([S, S, S],), 'empty_tensor', (False,)), ('tril', (M, M), NO_ARGS), ('tril', (M, M), (2,), 'idx'), @@ -1883,9 +2072,6 @@ def method_tests(): ('trace', (M, M), NO_ARGS), ('cross', (S, 3), ((S, 3),)), ('cross', (S, 3, S), ((S, 3, S), 1), 'dim'), - ('index_select', (S, S, S), (0, index_variable(2, S)), 'dim', (), [0]), - ('index_select', (), (0, torch.tensor([0], dtype=torch.int64)), 'scalar_mixed_dim', (), [0]), - ('index_select', (), (0, torch.tensor(0, dtype=torch.int64)), 'scalar_dim', (), [0]), ('index_add', (S, S), (0, index_variable(2, S), (2, S)), 'dim', (), [0]), ('index_add', (), (0, torch.tensor([0], dtype=torch.int64), (1,)), 'scalar_input_dim', (), [0]), ('index_add', (), (0, torch.tensor(0, dtype=torch.int64), ()), 'scalar_all_dim', (), [0]), @@ -2084,11 +2270,6 @@ def method_tests(): ('tensor_split', (S, S, S), (3, 1), 'sections_dim', (False,), [1]), ('tensor_split', (S, S, S), ([2, 4],), 'indices', (False,)), ('tensor_split', (S, S, S), ([2, 4], 1), 'indices_dim', (False,), [1]), - ('gather', (M, S), (0, gather_variable((S, 
S), 1, M, True)), 'dim0', (), [0]), - ('gather', (M, S), (1, gather_variable((M, S // 2), 0, S, True)), 'dim1', (), [0]), - ('gather', (), (0, torch.tensor([0], dtype=torch.int64)), 'scalar_input', (), [0]), - ('gather', (S,), (0, torch.tensor(0, dtype=torch.int64)), 'scalar_index', (), [0]), - ('gather', (), (0, torch.tensor(0, dtype=torch.int64)), 'scalar_both', (), [0]), ('scatter', (M, S), (0, gather_variable((S, S), 1, M), (S, S)), 'dim0', (), [0]), ('scatter', (M, S), (1, gather_variable((M, S // 2), 0, S), (M, S // 2)), 'dim1', (), [0]), ('scatter', (), (0, torch.tensor(0, dtype=torch.int64), ()), 'scalartensor_all_dim0', (), [0]), diff --git a/torch/testing/_internal/common_nn.py b/torch/testing/_internal/common_nn.py index c588f69c2875..714361497d94 100644 --- a/torch/testing/_internal/common_nn.py +++ b/torch/testing/_internal/common_nn.py @@ -2988,7 +2988,7 @@ def fractional_max_pool3d_test(test_case): .scale_factor(std::vector({3., 3., 3.})) .mode(torch::kTrilinear) .align_corners(false)''', - input_size=(1, 2, 3, 4, 4), + input_size=(1, 2, 3, 4, 5), fullname='interpolate_trilinear_scale_3d', # See https://github.com/pytorch/pytorch/issues/5006 precision=3e-4, diff --git a/torch/testing/_internal/common_quantized.py b/torch/testing/_internal/common_quantized.py index 243cd964b96d..f14556597128 100644 --- a/torch/testing/_internal/common_quantized.py +++ b/torch/testing/_internal/common_quantized.py @@ -102,6 +102,35 @@ def _calculate_dynamic_per_channel_qparams(X, dtype): return scale, zero_point +def _snr(x, x_hat): + """Calculates the signal to noise ratio and returns the signal and noise + power, as well as the SNR in dB. + If the input is a list/tuple this function is called recursively on each + element. The result will have the same nested structure as the inputs. + + Args: + x, x_hat: Either a tensor or a nested list/tuple of tensors. + Returns: + signal, noise, SNR(in dB): Either floats or a nested list of floats + """ + if isinstance(x, (list, tuple)): + assert(len(x) == len(x_hat)) + res = [] + for idx in range(len(x)): + res.append(_snr(x[idx], x_hat[idx])) + return res + if x_hat.is_quantized: + x_hat = x_hat.dequantize() + if x.is_quantized: + x = x.dequantize() + noise = (x - x_hat).norm() + if noise == 0: + return 0.0, float('inf'), float('inf') + signal = x.norm() + snr = signal / noise + snr_db = 20 * snr.log10() + return signal, noise, snr_db + @contextmanager def override_quantized_engine(qengine): previous = torch.backends.quantized.engine diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py index bea572722ae6..9f70551eb3b2 100644 --- a/torch/testing/_internal/common_utils.py +++ b/torch/testing/_internal/common_utils.py @@ -1386,19 +1386,26 @@ def assertExpectedStripMangled(self, s, subname=None): s = re.sub(r'__torch__[^ ]+', '', s) self.assertExpected(s, subname) - # returns captured stderr + # run code in subprocess and capture exceptions. 
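The `_snr` helper above returns `(signal, noise, snr_in_dB)` and recurses over nested lists/tuples; a quick sketch of comparing a tensor against its quantized round-trip, assuming the internal test helper is importable in your environment:

    import torch
    from torch.testing._internal.common_quantized import _snr

    x = torch.randn(1000)
    x_q = torch.quantize_per_tensor(x, scale=0.05, zero_point=0, dtype=torch.qint8)
    signal, noise, snr_db = _snr(x, x_q)   # x_q is dequantized internally
    print(float(snr_db))                   # higher dB means a closer match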
     @staticmethod
-    def runWithPytorchAPIUsageStderr(code):
+    def run_process_no_exception(code, env=None):
         import subprocess
 
-        env = os.environ.copy()
-        env["PYTORCH_API_USAGE_STDERR"] = "1"
-        pipes = subprocess.Popen(
+        popen = subprocess.Popen(
             [sys.executable, '-c', code],
             stdout=subprocess.PIPE,
             stderr=subprocess.PIPE,
             env=env)
-        return pipes.communicate()[1].decode('ascii')
+        (stdout, stderr) = popen.communicate()
+        return (stdout, stderr)
+
+    # returns captured stderr
+    @staticmethod
+    def runWithPytorchAPIUsageStderr(code):
+        env = os.environ.copy()
+        env["PYTORCH_API_USAGE_STDERR"] = "1"
+        (stdout, stderr) = TestCase.run_process_no_exception(code, env=env)
+        return stderr.decode('ascii')
 
 
 def download_file(url, binary=True):
diff --git a/torch/testing/_internal/distributed/rpc/rpc_test.py b/torch/testing/_internal/distributed/rpc/rpc_test.py
index 8eec8100270b..ede2471aa3a2 100644
--- a/torch/testing/_internal/distributed/rpc/rpc_test.py
+++ b/torch/testing/_internal/distributed/rpc/rpc_test.py
@@ -1335,7 +1335,11 @@ def convert_remote_to_local(event_name):
             for event in events
             if convert_remote_to_local(event.name) in EXPECTED_REMOTE_EVENTS
         ]
-        self.assertEqual(remote_events_list, EXPECTED_REMOTE_EVENTS)
+        self.assertEqual(
+            set(remote_events_list),
+            set(EXPECTED_REMOTE_EVENTS),
+            f"Mismatch between profiled events: {set(remote_events_list)} and expected events: {set(EXPECTED_REMOTE_EVENTS)}",
+        )
 
     @dist_init
     def test_profiler_remote_events_profiled(self):
@@ -1579,8 +1583,8 @@ def _profiler_test_with_rpc(self, rpc_exec_mode, func, args, use_record_function
             scope_event = get_function_event(events, "foo")
             # Since RPC call is within the scope, its CPU interval should be
             # contained within foo's interval.
-            self.assertTrue(scope_event.time_range.start < rpc_event.time_range.start)
-            self.assertTrue(scope_event.time_range.end > rpc_event.time_range.end)
+            self.assertLessEqual(scope_event.time_range.start, rpc_event.time_range.start)
+            self.assertGreaterEqual(scope_event.time_range.end, rpc_event.time_range.end)
         # the sender, dest worker, function run, and type of RPC should all
         # be recorded.
         self_worker_name = worker_name(self.rank)
@@ -1776,7 +1780,13 @@ def _assert_top_level_events(self, process_global_events, expected_top_level_eve
                 if time_range.start > last_end_time:
                     top_level_event_names.append(event_name)
                     last_end_time = time_range.end
-        self.assertEqual(sorted(top_level_event_names), sorted(expected_top_level_event_names))
+        top_level_event_names = sorted(top_level_event_names)
+        expected_top_level_event_names = sorted(expected_top_level_event_names)
+        self.assertEqual(
+            top_level_event_names,
+            expected_top_level_event_names,
+            f"Expected events {expected_top_level_event_names}, but got {top_level_event_names}",
+        )
 
     @dist_init
     def test_server_process_global_profiler(self):
@@ -1799,9 +1809,12 @@ def test_server_process_global_profiler(self):
         outer_profile_rref.rpc_sync().__exit__(None, None, None)
 
         inner_events = rpc.rpc_sync(dst_worker_name, get_events_from_profile, (inner_profile_rref,))
-        self._assert_top_level_events(inner_events, ['aten::sub'])
+        expected_inner_events = ['aten::sub']
+        expected_outer_events = expected_inner_events + ['aten::add']
+
+        self._assert_top_level_events(inner_events, expected_inner_events)
         outer_events = rpc.rpc_sync(dst_worker_name, get_events_from_profile, (outer_profile_rref,))
-        self._assert_top_level_events(outer_events, ['aten::add', 'aten::sub'])
+        self._assert_top_level_events(outer_events, expected_outer_events)
 
         inner_profile_rref.rpc_sync().key_averages()
         outer_profile_rref.rpc_sync().key_averages()
diff --git a/torch/utils/hipify/hipify_python.py b/torch/utils/hipify/hipify_python.py
index d1639d20adba..adc480793d82 100755
--- a/torch/utils/hipify/hipify_python.py
+++ b/torch/utils/hipify/hipify_python.py
@@ -782,7 +782,9 @@ def repl(m):
                                                 os.path.relpath(header_filepath, output_directory),
                                                 all_files, includes, stats, hip_clang_launch, is_pytorch_extension,
                                                 clean_ctx, show_progress)
-                return templ.format(os.path.relpath(HIPIFY_FINAL_RESULT[header_filepath]["hipified_path"], header_dir))
+                value = HIPIFY_FINAL_RESULT[header_filepath]["hipified_path"]
+                assert value is not None
+                return templ.format(os.path.relpath(value, header_dir))
 
             return m.group(0)
         return repl
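
Note (not part of the patch): the _snr helper added to torch/testing/_internal/common_quantized.py above dequantizes any quantized input, takes the norm of the reference tensor and of the error, and reports 20*log10(signal/noise) in dB. The lines below are a minimal usage sketch; the tensor shape, scale and zero_point are illustrative assumptions, not values taken from the test suite.

import torch
from torch.testing._internal.common_quantized import _snr

x = torch.randn(16, 16)
# Quantize-and-compare round trip; scale/zero_point are arbitrary example values.
x_hat = torch.quantize_per_tensor(x, scale=0.1, zero_point=0, dtype=torch.qint8)

# _snr dequantizes x_hat internally and returns (signal, noise, snr_db).
signal, noise, snr_db = _snr(x, x_hat)
print(float(signal), float(noise), float(snr_db))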