From e429d05015bfa878f2ae660a1e0dd96b51d743d5 Mon Sep 17 00:00:00 2001
From: Vasilis Vryniotis
Date: Sun, 6 Dec 2020 10:21:08 -0800
Subject: [PATCH 001/250] Fixing error: "member may not be initialized" due to constexpr on Windows (#48836)

Summary:
Fixes https://github.com/pytorch/pytorch/issues/48835
Fixes https://github.com/pytorch/pytorch/issues/48716

Pull Request resolved: https://github.com/pytorch/pytorch/pull/48836

Reviewed By: malfet

Differential Revision: D25335829

Pulled By: datumbox

fbshipit-source-id: 807182e9afa3bb314dbb85bfcd9589a2c319a7db
---
 torch/csrc/jit/ir/ir.cpp               | 4 ++--
 torch/csrc/jit/ir/ir.h                 | 4 ++--
 torch/csrc/jit/serialization/pickler.h | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/torch/csrc/jit/ir/ir.cpp b/torch/csrc/jit/ir/ir.cpp
index fe79091c946f..ceb0fd1dbfcf 100644
--- a/torch/csrc/jit/ir/ir.cpp
+++ b/torch/csrc/jit/ir/ir.cpp
@@ -2061,8 +2061,8 @@ TypePtr NamedValue::type() const {
   }
 }
 
-constexpr Symbol ProfileOp::Kind;
-constexpr Symbol ProfileOptionalOp::Kind;
+const Symbol ProfileOp::Kind = ::c10::prim::profile;
+const Symbol ProfileOptionalOp::Kind = ::c10::prim::profile_optional;
 
 OperatorSet::OperatorSet(std::initializer_list<const char*> sig_literals) {
   for (const char* sig : sig_literals) {
diff --git a/torch/csrc/jit/ir/ir.h b/torch/csrc/jit/ir/ir.h
index 9db2dbdf2516..64c8031bd601 100644
--- a/torch/csrc/jit/ir/ir.h
+++ b/torch/csrc/jit/ir/ir.h
@@ -1326,7 +1326,7 @@ inline const Graph* Value::owningGraph() const {
 /************* All nodes not required to be defined before Graph **************/
 
 struct ProfileOp : public Node {
-  static constexpr Symbol Kind = ::c10::prim::profile;
+  static const Symbol Kind;
   ProfileOp(Graph* graph, std::function<void(std::vector<IValue>&)> callback)
       : Node(graph, ::c10::prim::profile), callback_(std::move(callback)) {}
 
@@ -1346,7 +1346,7 @@ struct ProfileOp : public Node {
 };
 
 struct TORCH_API ProfileOptionalOp : public Node {
-  static constexpr Symbol Kind = ::c10::prim::profile_optional;
+  static const Symbol Kind;
   ProfileOptionalOp(
       Graph* graph,
       std::function<void(std::vector<IValue>&)> callback)
diff --git a/torch/csrc/jit/serialization/pickler.h b/torch/csrc/jit/serialization/pickler.h
index 4473b0cb50dd..6a557e6e53f3 100644
--- a/torch/csrc/jit/serialization/pickler.h
+++ b/torch/csrc/jit/serialization/pickler.h
@@ -209,7 +209,7 @@ class TORCH_API Pickler {
   // the left of a '::', its type cannot be deduced by the compiler so one must
   // explicitly instantiate the template, i.e. push<int>(int) works, push(int)
   // does not)
-  static constexpr size_t kBufferSize = 256;
+  static CONSTEXPR_EXCEPT_WIN_CUDA size_t kBufferSize = 256;
   template <typename T>
   void push(typename std::common_type<T>::type value) {
     const char* begin = reinterpret_cast<const char*>(&value);

From 19f4c5110e8bcad5e7e75375194262fca0a6293a Mon Sep 17 00:00:00 2001
From: Liang Liu
Date: Sun, 6 Dec 2020 18:08:07 -0800
Subject: [PATCH 002/250] Add another torch::jit::load API to load PyTorch model with shared_ptr PyTorchStreamReader input (#48802)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/48802

The current torch::jit::load API only supports a unique_ptr ReadAdapterInterface input, but in some cases torch::jit::load may not be the only consumer of the reader adapter. This diff adds an overload of torch::jit::load that accepts a shared_ptr PyTorchStreamReader.
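A minimal caller-side sketch of the new overload (not part of this patch): it assumes the caller already holds some ReadAdapterInterface implementation in a shared_ptr, and only illustrates that ownership is now shared rather than transferred to the loader.

```cpp
#include <torch/script.h>
#include <caffe2/serialize/read_adapter_interface.h>

// `rai` can be any ReadAdapterInterface subclass (e.g. one wrapping a custom stream).
torch::jit::Module load_from_shared_adapter(
    std::shared_ptr<caffe2::serialize::ReadAdapterInterface> rai) {
  // Previously load() required a unique_ptr and took sole ownership of the adapter;
  // with this overload the same adapter can keep being used by other consumers.
  torch::jit::Module module = torch::jit::load(rai);
  // `rai` is still valid here and may be handed to another reader.
  return module;
}
```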
Reviewed By: malfet, houseroad Differential Revision: D25241904 fbshipit-source-id: aa403bac9ed820cc0e94342aebfe524a1d5bf913 --- caffe2/serialize/inline_container.cc | 2 +- caffe2/serialize/inline_container.h | 4 ++-- torch/csrc/jit/serialization/import.cpp | 13 +++++-------- torch/csrc/jit/serialization/import.h | 4 ++-- torch/csrc/jit/serialization/import_legacy.cpp | 6 +++--- torch/csrc/jit/serialization/import_legacy.h | 2 +- 6 files changed, 14 insertions(+), 17 deletions(-) diff --git a/caffe2/serialize/inline_container.cc b/caffe2/serialize/inline_container.cc index 7928d5e3de86..3d9701274ba3 100644 --- a/caffe2/serialize/inline_container.cc +++ b/caffe2/serialize/inline_container.cc @@ -65,7 +65,7 @@ PyTorchStreamReader::PyTorchStreamReader(std::istream* in) } PyTorchStreamReader::PyTorchStreamReader( - std::unique_ptr in) + std::shared_ptr in) : ar_(std::make_unique()), in_(std::move(in)) { init(); } diff --git a/caffe2/serialize/inline_container.h b/caffe2/serialize/inline_container.h index 2e841d0ad824..ee7e971344ea 100644 --- a/caffe2/serialize/inline_container.h +++ b/caffe2/serialize/inline_container.h @@ -156,7 +156,7 @@ class CAFFE2_API PyTorchStreamReader final { public: explicit PyTorchStreamReader(const std::string& file_name); explicit PyTorchStreamReader(std::istream* in); - explicit PyTorchStreamReader(std::unique_ptr in); + explicit PyTorchStreamReader(std::shared_ptr in); // return dataptr, size std::tuple getRecord(const std::string& name); @@ -180,7 +180,7 @@ class CAFFE2_API PyTorchStreamReader final { std::unique_ptr ar_; std::string archive_name_; std::string archive_name_plus_slash_; - std::unique_ptr in_; + std::shared_ptr in_; int64_t version_; }; diff --git a/torch/csrc/jit/serialization/import.cpp b/torch/csrc/jit/serialization/import.cpp index 3956d0283487..ed5dde0b08b0 100644 --- a/torch/csrc/jit/serialization/import.cpp +++ b/torch/csrc/jit/serialization/import.cpp @@ -108,7 +108,7 @@ class ScriptModuleDeserializer final { public: ScriptModuleDeserializer( std::shared_ptr cu, - std::unique_ptr reader) + std::shared_ptr reader) : compilation_unit_(std::move(cu)), reader_(std::move(reader)), source_importer_( @@ -128,7 +128,7 @@ class ScriptModuleDeserializer final { IValue readArchive(const std::string& archive_name); std::shared_ptr compilation_unit_; - std::unique_ptr reader_; + std::shared_ptr reader_; c10::optional device_; std::vector constants_table_; SourceImporter source_importer_; @@ -175,7 +175,6 @@ IValue ScriptModuleDeserializer::readArchive(const std::string& archive_name) { return obj; } }; - return readArchiveAndTensors( archive_name, type_resolver, obj_loader, device_, *reader_.get()); } @@ -257,8 +256,7 @@ Module ScriptModuleDeserializer::deserialize( } if (reader_->hasRecord("model.json")) { #if !defined(C10_MOBILE) && !defined(C10_DISABLE_LEGACY_IMPORT) - return torch::jit::LEGACY_deserialize( - compilation_unit_, std::move(reader_), device_); + return torch::jit::LEGACY_deserialize(compilation_unit_, reader_, device_); #else AT_ERROR("Legacy model format is not supported on mobile."); #endif @@ -271,7 +269,6 @@ Module ScriptModuleDeserializer::deserialize( rewriteQuantizedConvForBC(m); return m; } - } // namespace Module import_ir_module( @@ -323,7 +320,7 @@ Module load( } Module load( - std::unique_ptr rai, + std::shared_ptr rai, c10::optional device, ExtraFilesMap& extra_files) { // Verify that we're loading a zip archive and not a torch.save pickle archive @@ -347,7 +344,7 @@ Module load( " produced by `torch.jit.save()`"); } - 
auto reader = torch::make_unique(std::move(rai)); + auto reader = std::make_shared(std::move(rai)); auto cu = std::make_shared(); ScriptModuleDeserializer deserializer(std::move(cu), std::move(reader)); diff --git a/torch/csrc/jit/serialization/import.h b/torch/csrc/jit/serialization/import.h index 543a1ca32aaf..cbfb765a6350 100644 --- a/torch/csrc/jit/serialization/import.h +++ b/torch/csrc/jit/serialization/import.h @@ -55,13 +55,13 @@ TORCH_API Module load( c10::optional device = c10::nullopt, ExtraFilesMap& extra_files = default_extra_files); -/// Loads a serialized `Module` from the given `rai`. +/// Loads a serialized `Module` from the given shared_ptr `rai`. /// /// The reader adapter, which is for customized input stream, must contain a /// serialized `Module`, exported either via `ScriptModule.save()` in /// Python or `torch::jit::ExportModule` in C++. TORCH_API Module load( - std::unique_ptr rai, + std::shared_ptr rai, c10::optional device = c10::nullopt, ExtraFilesMap& extra_files = default_extra_files); diff --git a/torch/csrc/jit/serialization/import_legacy.cpp b/torch/csrc/jit/serialization/import_legacy.cpp index 7a8279e0199c..40e035b82090 100644 --- a/torch/csrc/jit/serialization/import_legacy.cpp +++ b/torch/csrc/jit/serialization/import_legacy.cpp @@ -40,7 +40,7 @@ class ScriptModuleDeserializer final { public: ScriptModuleDeserializer( std::shared_ptr cu, - std::unique_ptr reader, + std::shared_ptr reader, const c10::optional& device) : compilation_unit_(std::move(cu)), reader_(std::move(reader)), @@ -76,7 +76,7 @@ class ScriptModuleDeserializer final { std::shared_ptr sourceLoader(const std::string& qualifier); std::shared_ptr compilation_unit_; - std::unique_ptr reader_; + std::shared_ptr reader_; c10::optional device_; // Legacy only tensor can be a constant. std::vector constant_table_; @@ -383,7 +383,7 @@ Module ScriptModuleDeserializer::LEGACY_convertModule( Module LEGACY_deserialize( std::shared_ptr cu, - std::unique_ptr reader, + std::shared_ptr reader, const c10::optional& device) { ScriptModuleDeserializer deserializer( std::move(cu), std::move(reader), device); diff --git a/torch/csrc/jit/serialization/import_legacy.h b/torch/csrc/jit/serialization/import_legacy.h index 64f8a7da1968..a26182810959 100644 --- a/torch/csrc/jit/serialization/import_legacy.h +++ b/torch/csrc/jit/serialization/import_legacy.h @@ -16,7 +16,7 @@ struct CompilationUnit; // Deserializes a model in legacy format. 
Module LEGACY_deserialize( std::shared_ptr cu, - std::unique_ptr reader, + std::shared_ptr reader, const c10::optional& device); } // namespace jit From a39398b9e5d528e4a6ca293f1703833932f0d9b2 Mon Sep 17 00:00:00 2001 From: "Gao, Xiang" Date: Sun, 6 Dec 2020 23:38:15 -0800 Subject: [PATCH 003/250] CUDA BF16 norm (#48806) Summary: Fixes #{issue number} Pull Request resolved: https://github.com/pytorch/pytorch/pull/48806 Reviewed By: mruberry Differential Revision: D25358465 Pulled By: ngimel fbshipit-source-id: 1a2afd86f39e96db0754d04bf81de045b1e1235c --- aten/src/ATen/native/cuda/ReduceNormKernel.cu | 2 -- test/test_torch.py | 14 ++++++-------- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/aten/src/ATen/native/cuda/ReduceNormKernel.cu b/aten/src/ATen/native/cuda/ReduceNormKernel.cu index 39a355a96756..a857dbc52b8a 100644 --- a/aten/src/ATen/native/cuda/ReduceNormKernel.cu +++ b/aten/src/ATen/native/cuda/ReduceNormKernel.cu @@ -40,14 +40,12 @@ static void norm_kernel_cuda(TensorIterator& iter, Scalar p) { // type promotion that does cast and reduction in a single kernel return norm_kernel_cuda_impl(iter, p); } - #ifdef __HIP_PLATFORM_HCC__ else if(iter.dtype() == kBFloat16) { return norm_kernel_cuda_impl(iter, p); } else if (iter.dtype(1) == kBFloat16 && iter.dtype() == kFloat) { // type promotion that does cast and reduction in a single kernel return norm_kernel_cuda_impl(iter, p); } - #endif AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "norm_cuda", [&]() { norm_kernel_cuda_impl(iter, p); }); diff --git a/test/test_torch.py b/test/test_torch.py index fde60ca4174f..2d181c3b9400 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -6328,10 +6328,6 @@ def test_copy_broadcast(self, device) -> None: _float_types_no_half = [torch.float, torch.double] -# _float_types2 adds bfloat16 type to _float_types only on ROCm. 
Should eventually be unified -# with _float_types when bfloat16 bringup is complete on all platforms -_float_types2 = _float_types + [torch.bfloat16] if TEST_WITH_ROCM else _float_types - _signed_types = [ torch.half, torch.bfloat16, torch.float, torch.double, torch.int8, torch.short, torch.int, torch.long @@ -6689,10 +6685,12 @@ def inner(self, device, dtype): ('narrow', '', _small_3d, lambda t, d: [1, 3, 2], 1e-5, 1e-5, 1e-5, _types, _cpu_types, False), ('narrow', 'neg_dim', _small_3d, lambda t, d: [-1, 3, 2], 1e-5, 1e-5, 1e-5, _types, _cpu_types, False), ('nonzero', '', _small_3d, lambda t, d: [], 1e-5, 1e-5, 1e-5, _types, _cpu_types, False), - ('norm', '', _small_3d, lambda t, d: [], 1e-1, 1e-1, 1e-5, _float_types2, _cpu_types, False), - ('norm', '3_norm', _small_3d, lambda t, d: [3], 1e-1, 1e-1, 1e-5, _float_types2, _cpu_types, False), - ('norm', '3_norm_dim', _small_3d, lambda t, d: [3, 0], 1e-1, 1e-1, 1e-5, _float_types2, _cpu_types, False), - ('norm', '3_norm_neg_dim', _small_3d, lambda t, d: [3, -2], 1e-1, 1e-1, 1e-5, _float_types2, _cpu_types, False), + ('norm', '', _small_3d, lambda t, d: [], 1e-1, 1e-1, 1e-5, torch.testing.get_all_fp_dtypes(), _cpu_types, False), + ('norm', '3_norm', _small_3d, lambda t, d: [3], 1e-1, 1e-1, 1e-5, torch.testing.get_all_fp_dtypes(), _cpu_types, False), + ('norm', '3_norm_dim', _small_3d, lambda t, d: [3, 0], 1e-1, 1e-1, 1e-5, + torch.testing.get_all_fp_dtypes(), _cpu_types, False), + ('norm', '3_norm_neg_dim', _small_3d, lambda t, d: [3, -2], 1e-1, 1e-1, 1e-5, + torch.testing.get_all_fp_dtypes(), _cpu_types, False), ('new_ones', '', _small_3d, lambda t, d: [1, 2, 3, 4, 5], 1e-5, 1e-5, 1e-5, _types, _cpu_types, False), ('permute', '', _new_t((1, 2, 3, 4)), lambda t, d: [2, 1, 3, 0], 1e-5, 1e-5, 1e-5, _types, _cpu_types, False), ('put_', '', _new_t((2, 5, 3)), From 00f01791a37bb88e4d2140ffb3eb3eef1754786f Mon Sep 17 00:00:00 2001 From: Meng Zhang Date: Mon, 7 Dec 2020 07:41:17 -0800 Subject: [PATCH 004/250] [Caffe2]Add more error message in ComputeBinaryBroadcastForwardDims Summary: Add more error message in ComputeBinaryBroadcastForwardDims Test Plan: buck test mode/opt caffe2/caffe2/python/operator_test:gather_ranges_op_test buck test mode/opt caffe2/caffe2/python/operator_test:reduce_ops_test buck test mode/opt caffe2/caffe2/python/operator_test:elementwise_ops_test Reviewed By: BIT-silence Differential Revision: D24949525 fbshipit-source-id: 762d913a6615a6394072f5bebbcb5cc36f0b8603 --- caffe2/operators/elementwise_ops_utils.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/caffe2/operators/elementwise_ops_utils.cc b/caffe2/operators/elementwise_ops_utils.cc index 5bb6c768ea3e..0f76a1b35aa4 100644 --- a/caffe2/operators/elementwise_ops_utils.cc +++ b/caffe2/operators/elementwise_ops_utils.cc @@ -53,7 +53,10 @@ std::vector ComputeBinaryBroadcastForwardDims( for (; i >= 0 && j >= 0; --k) { const int A_dim = A_dims[i]; const int B_dim = B_dims[j]; - CAFFE_ENFORCE(A_dim == B_dim || A_dim == 1 || B_dim == 1); + CAFFE_ENFORCE( + A_dim == B_dim || A_dim == 1 || B_dim == 1, + "A_dim: ", A_dim , ",B_dim: ", B_dim + ); if (A_dim == 0 || B_dim == 0) { C_dims[k] = 0; } else { From 1febd2225b03516340a4c799c4920e0d3dc82417 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Mon, 7 Dec 2020 08:05:25 -0800 Subject: [PATCH 005/250] Add explicit cast to cuda_atomic_ops_test.cu (#48886) Summary: Should fix linking error reported in https://github.com/pytorch/pytorch/issues/48870 Pull Request resolved: 
https://github.com/pytorch/pytorch/pull/48886 Reviewed By: walterddr Differential Revision: D25356601 Pulled By: malfet fbshipit-source-id: 25282d4606251b27d047917f096868ddb662a723 --- aten/src/ATen/test/cuda_atomic_ops_test.cu | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/aten/src/ATen/test/cuda_atomic_ops_test.cu b/aten/src/ATen/test/cuda_atomic_ops_test.cu index 285623349e52..920a72452916 100644 --- a/aten/src/ATen/test/cuda_atomic_ops_test.cu +++ b/aten/src/ATen/test/cuda_atomic_ops_test.cu @@ -11,7 +11,7 @@ template __global__ void addition_test_kernel(T * a, T * sum) { int tid = blockIdx.x * blockDim.x + threadIdx.x; int idx = (tid) % arraysize; - + gpuAtomicAdd(&sum[idx], a[idx]); } @@ -19,7 +19,7 @@ template __global__ void mul_test_kernel(T * a, T * sum) { int tid = blockIdx.x * blockDim.x + threadIdx.x; int idx = (tid) % arraysize; - + gpuAtomicMul(&sum[idx], a[idx]); } @@ -29,7 +29,7 @@ void test_atomic_add() { dim3 dimGrid(1, 1); T *a, *sum, *answer, *ad, *sumd; - + a = (T*)malloc(arraysize * sizeof(T)); sum = (T*)malloc(arraysize * sizeof(T)); answer = (T*)malloc(arraysize * sizeof(T)); @@ -42,7 +42,7 @@ void test_atomic_add() { cudaMalloc((void**)&ad, arraysize * sizeof(T)); cudaMalloc((void**)&sumd, arraysize * sizeof(T)); - + cudaMemcpy(ad, a, arraysize * sizeof(T), cudaMemcpyHostToDevice); cudaMemcpy(sumd, sum, arraysize * sizeof(T), cudaMemcpyHostToDevice); @@ -67,7 +67,7 @@ void test_atomic_mul() { dim3 dimGrid(1, 1); T *a, *sum, *answer, *ad, *sumd; - + a = (T*)malloc(arraysize * sizeof(T)); sum = (T*)malloc(arraysize * sizeof(T)); answer = (T*)malloc(arraysize * sizeof(T)); @@ -75,12 +75,12 @@ void test_atomic_mul() { for (int i = 0; i < arraysize; ++i) { a[i] = 2; sum[i] = 2; - answer[i] = pow(sum[i], factor); + answer[i] = pow(sum[i], static_cast(factor)); } cudaMalloc((void**)&ad, arraysize * sizeof(T)); cudaMalloc((void**)&sumd, arraysize * sizeof(T)); - + cudaMemcpy(ad, a, arraysize * sizeof(T), cudaMemcpyHostToDevice); cudaMemcpy(sumd, sum, arraysize * sizeof(T), cudaMemcpyHostToDevice); @@ -105,7 +105,7 @@ TEST(TestAtomicOps, TestAtomicAdd) { test_atomic_add(); test_atomic_add(); test_atomic_add(); - + test_atomic_add(); test_atomic_add(); test_atomic_add(); From 8bc6023d7a822ea6936b7460027f29558149008d Mon Sep 17 00:00:00 2001 From: Guilherme Leobas Date: Mon, 7 Dec 2020 08:21:42 -0800 Subject: [PATCH 006/250] Add type annotations to torch.onnx.* modules (#48782) Summary: Fixes https://github.com/pytorch/pytorch/issues/45215 This is a follow up PR of https://github.com/pytorch/pytorch/issues/45258 Pull Request resolved: https://github.com/pytorch/pytorch/pull/48782 Reviewed By: heitorschueroff Differential Revision: D25304229 Pulled By: ezyang fbshipit-source-id: b01b21ddbf86f908ca08173e68b81fb25851bc81 --- mypy.ini | 24 ------------ torch/_C/__init__.pyi.in | 68 ++++++++++++++++++++++++++++++++- torch/_C/_onnx.pyi | 1 + torch/onnx/symbolic_helper.py | 23 ++++++----- torch/onnx/symbolic_opset8.py | 2 +- torch/onnx/symbolic_opset9.py | 9 +++-- torch/onnx/symbolic_registry.py | 5 ++- torch/onnx/utils.py | 24 +++++++----- 8 files changed, 105 insertions(+), 51 deletions(-) diff --git a/mypy.ini b/mypy.ini index f4b37f15a820..0b9f5497162c 100644 --- a/mypy.ini +++ b/mypy.ini @@ -143,30 +143,6 @@ ignore_errors = True [mypy-torch.nn.intrinsic.qat.modules.conv_fused] ignore_errors = True -[mypy-torch.onnx.operators] -ignore_errors = True - -[mypy-torch.onnx.symbolic_opset8] -ignore_errors = True - -[mypy-torch.onnx.symbolic_opset9] 
-ignore_errors = True - -[mypy-torch.onnx.symbolic_opset11] -ignore_errors = True - -[mypy-torch.onnx.symbolic_caffe2] -ignore_errors = True - -[mypy-torch.onnx.symbolic_helper] -ignore_errors = True - -[mypy-torch.onnx.symbolic_registry] -ignore_errors = True - -[mypy-torch.onnx.utils] -ignore_errors = True - [mypy-torch.multiprocessing.pool] ignore_errors = True diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in index cbb5b2452e21..1452718ed793 100644 --- a/torch/_C/__init__.pyi.in +++ b/torch/_C/__init__.pyi.in @@ -165,7 +165,7 @@ def wait(fut: Future) -> Any: ... def _collect_all(futures: List[Future]) -> Future: ... def unify_type_list(types: List[JitType]) -> JitType: ... -def _freeze_module(module: ScriptModule, preserved_attrs: List[str], freeze_interfaces: _bool = True) -> ScriptModule: ... +def _freeze_module(module: ScriptModule, preserved_attrs: List[str] = [], freeze_interfaces: _bool = True) -> ScriptModule: ... def _is_tracing() -> _bool: ... def _jit_init() -> _bool: ... def _jit_flatten(arg: Any) -> Tuple[List[Tensor], IODescriptor]: ... @@ -217,6 +217,8 @@ def _jit_get_trigger_value(trigger_name: str) -> _int: ... # Defined in torch/csrc/jit/python/script_init.cpp ResolutionCallback = Callable[[str], Callable[..., Any]] +# Defined in torch/csrc/jit/python/script_init.cpp +# and torch/csrc/jit/python/init.cpp def _create_function_from_graph(qualname: str, graph: Graph) -> Graph: ... def _debug_set_autodiff_subgraph_inlining(disabled: _bool) -> None: ... def _ivalue_tags_match(lhs: ScriptModule, rhs: ScriptModule) -> _bool: ... @@ -246,6 +248,54 @@ def _resolve_type_from_object(obj: Any, range: SourceRange, rcb: ResolutionCallb def _create_module_with_type(ty: JitType) -> ScriptModule: ... def _run_emit_module_hook(m: ScriptModule): ... def _replace_overloaded_method_decl(overload_decl: Decl, implementation_def: Def, new_name: str) -> Def: ... + +def _jit_pass_lower_all_tuples(graph: Graph) -> None: ... +def _jit_pass_onnx_set_dynamic_input_shape(graph: Graph, dynamic_axes: Dict[str, Dict[_int, str]], input_names: List[str]) -> None: ... +def _jit_pass_onnx_graph_shape_type_inference(graph: Graph, opset_version: _int) -> None: ... +def _jit_pass_onnx_assign_output_shape(graph: Graph, tensors: List[Tensor], onnx_shape_inference: _bool = False) -> None: ... +def _jit_pass_fixup_onnx_loop_node_inputs(n: Node) -> None: ... +def _jit_pass_onnx_remove_inplace_ops_for_onnx(graph: Graph) -> None: ... +def _jit_pass_remove_inplace_ops(graph: Graph) -> None: ... +def _jit_pass_canonicalize_graph_fuser_ops(graph: Graph) -> None: ... +def _jit_pass_peephole(graph: Graph, addmm_fusion_enabled: _bool) -> None: ... +def _jit_pass_fuse_addmm(graph: Graph) -> None: ... +def _jit_pass_onnx_preprocess(graph: Graph) -> None: ... +def _jit_pass_onnx_prepare_inplace_ops_for_onnx(graph: Graph) -> None: ... +def _jit_pass_prepare_division_for_onnx(graph: Graph) -> None: ... +def _jit_pass_onnx_remove_print(graph: Graph) -> None: ... +def _jit_pass_onnx_preprocess_caffe2(graph: Graph) -> None: ... +def _jit_pass_onnx_unpack_quantized_weights( + graph: Graph, + paramsDict: Dict[str, IValue] +) -> Dict[str, IValue]: ... +def _jit_pass_onnx_quantization_insert_permutes( + graph: Graph, + paramsDict: Dict[str, IValue] +) -> Dict[str, IValue]: ... +def _jit_pass_custom_pattern_based_rewrite_graph(pattern: str, fused_node_name: str, graph: Graph) -> None: ... +def _jit_pass_erase_number_types(graph: Graph) -> None: ... 
+def _jit_pass_onnx(graph: Graph, _jit_pass_onnx: _onnx.OperatorExportTypes) -> Graph: ... +def _jit_pass_onnx_scalar_type_analysis(graph: Graph) -> None: ... +def _jit_pass_onnx_peephole(graph: Graph, opset_version: _int, fixed_batch_size: _bool) -> None: ... +def _jit_pass_dce_allow_deleting_nodes_with_side_effects(graph: Graph) -> None: ... +def _jit_pass_onnx_function_substitution(graph: Graph) -> None: ... +def _jit_pass_lower_graph(graph: Graph, m: Module) -> Tuple[Graph, List[IValue]]: ... +def _jit_pass_inline_fork_wait(graph: Graph) -> None: ... +def _jit_pass_onnx_eval_peephole(graph: Graph, paramsDict: Dict[str, IValue]) -> Dict[str, IValue]: ... +def _jit_pass_onnx_constant_fold(graph: Graph, paramsDict: Dict[str, IValue], opset_version: _int) -> Dict[str, IValue]: ... +def _jit_pass_onnx_eliminate_unused_items(graph: Graph, paramsDict: Dict[str, IValue]) -> Dict[str, IValue]: ... +def _jit_pass_onnx_cast_all_constant_to_floating(graph: Graph) -> None: ... +def _jit_pass_filter_non_tensor_arguments(params: Dict[str, IValue]) -> Dict[str, Tensor]: ... +def _jit_decay_packed_param_input_types(graph: Graph) -> None: ... +def _jit_pass_onnx_node_shape_type_inference(n: Node, opset_version: _int) -> None: ... +def _jit_pass_onnx_block( + old_block: Block, + new_block: Block, + operator_export_type: _onnx.OperatorExportTypes, + env: Dict[Value, Value] +) -> None: ... +def _jit_pass_fixup_onnx_controlflow_node(n: Node, opset_version: _int) -> Node: ... + def _jit_script_interface_compile(name: str, class_def: ClassDef, rcb: ResolutionCallback, is_module: _bool): ... def _jit_script_compile_overload( qualname: str, @@ -281,8 +331,18 @@ def import_ir_module_from_buffer( extra_files: Dict[str, Any] ) -> ScriptModule: ... +def _assign_output_shapes(graph: Graph, inputs: List[Tensor]) -> Graph: ... +def _check_onnx_proto(proto: str) -> None: ... +def _propagate_and_assign_input_shapes( + graph: Graph, + inputs: Tuple[Tensor, ...], + with_grad: _bool, + propagate: _bool +) -> Graph: ... + # Defined in torch/torch/csrc/jit/ir/ir.h class Graph: + def eraseInput(self, i: _int) -> None: ... ... # Defined in torch/csrc/jit/ir/ir.h @@ -366,8 +426,8 @@ class ScriptFunction: def qualified_name(self) -> str: ... class ScriptMethod: + graph: Graph ... - class ModuleDict: def __init__(self, mod: ScriptModule) -> None: ... def items(self) -> List[Tuple[str, Any]]: ... @@ -378,6 +438,10 @@ class ParameterDict: class BufferDict: def __init__(self, mod: ScriptModule) -> None: ... +# Defined in torch/csrc/jit/api/module.h +class Module: + ... + # Defined in torch/csrc/Module.cpp def _initExtension(shm_manager_path: str) -> None: ... # THPModule_initExtension def _autograd_init() -> _bool: ... # THPAutograd_initExtension diff --git a/torch/_C/_onnx.pyi b/torch/_C/_onnx.pyi index 51f16566ce6c..7ab3cd9c567d 100644 --- a/torch/_C/_onnx.pyi +++ b/torch/_C/_onnx.pyi @@ -29,6 +29,7 @@ class OperatorExportTypes(Enum): ONNX_ATEN = ... ONNX_ATEN_FALLBACK = ... RAW = ... + ONNX_FALLTHROUGH = ... class TrainingMode(Enum): EVAL = ... 
diff --git a/torch/onnx/symbolic_helper.py b/torch/onnx/symbolic_helper.py index 5e9430f995f8..8fd8ce3ea760 100644 --- a/torch/onnx/symbolic_helper.py +++ b/torch/onnx/symbolic_helper.py @@ -2,6 +2,7 @@ import torch import warnings from sys import maxsize as maxsize +from typing import Set import torch.onnx # This import monkey-patches graph manipulation methods on Graph, used for the @@ -125,7 +126,7 @@ def decorator(fn): def wrapper(g, *args, **kwargs): # some args may be optional, so the length may be smaller assert len(arg_descriptors) >= len(args) - args = [_parse_arg(arg, arg_desc) for arg, arg_desc in zip(args, arg_descriptors)] + args = [_parse_arg(arg, arg_desc) for arg, arg_desc in zip(args, arg_descriptors)] # type: ignore # only support _outputs in kwargs assert len(kwargs) <= 1 if len(kwargs) == 1: @@ -232,11 +233,11 @@ def _select_helper(g, self, dim, index, apply_reshape=True): def _slice_helper(g, input, axes, starts, ends, steps=None, dynamic_slice=False): if _export_onnx_opset_version <= 9: - from torch.onnx.symbolic_opset9 import _slice - return _slice(g, input, axes, starts, ends) + from torch.onnx.symbolic_opset9 import _slice as _slice9 + return _slice9(g, input, axes, starts, ends) else: - from torch.onnx.symbolic_opset10 import _slice - return _slice(g, input, axes, starts, ends, steps, dynamic_slice) + from torch.onnx.symbolic_opset10 import _slice as _slice10 + return _slice10(g, input, axes, starts, ends, steps, dynamic_slice) def _hardtanh_helper(g, input, min_val, max_val): if _export_onnx_opset_version <= 10: @@ -380,7 +381,7 @@ def _interpolate_get_scales_and_mode(g, input, size, scale_factor, mode , align_ size = g.op("Concat", *size, axis_i=0) scale_factor = _interpolate_size_to_scales(g, input, size, dim) else: - return _unimplemented("Both size and scales are None in __interpolate") + return _unimplemented("interpolate", "Both size and scales are None in __interpolate") return scale_factor, mode @@ -388,7 +389,7 @@ def _unbind_helper(g, self, dim, _outputs): if _export_onnx_opset_version <= 9: from torch.onnx.symbolic_opset9 import unbind else: - from torch.onnx.symbolic_opset11 import unbind + from torch.onnx.symbolic_opset11 import unbind # type: ignore[no-redef] return unbind(g, self, dim, _outputs) @@ -396,7 +397,8 @@ def _scatter_helper(g, self, dim, index, src): if _export_onnx_opset_version <= 10: from torch.onnx.symbolic_opset9 import scatter else: - from torch.onnx.symbolic_opset11 import scatter + # for mypy, scatter was imported two lines above + from torch.onnx.symbolic_opset11 import scatter # type: ignore return scatter(g, self, dim, index, src) @@ -444,7 +446,8 @@ def _index_fill_reshape_helper(g, self, dim, index): if _export_onnx_opset_version <= 10: from torch.onnx.symbolic_opset9 import scatter else: - from torch.onnx.symbolic_opset11 import scatter + # for mypy, scatter was imported two lines above + from torch.onnx.symbolic_opset11 import scatter # type: ignore if self.type().dim() is None: return _unimplemented("index_fill", "input rank not accesible") @@ -632,4 +635,4 @@ def _cast_func_template(to_i, g, input, non_blocking): # Global set to store the list of quantized operators in the network. # This is currently only used in the conversion of quantized ops from PT -> C2 via ONNX. 
-_quantized_ops = set() +_quantized_ops: Set[int] = set() diff --git a/torch/onnx/symbolic_opset8.py b/torch/onnx/symbolic_opset8.py index c0c1d48ebec0..e4023dab2320 100644 --- a/torch/onnx/symbolic_opset8.py +++ b/torch/onnx/symbolic_opset8.py @@ -4,7 +4,7 @@ import torch.onnx.symbolic_opset9 as sym_opset9 from torch.onnx.symbolic_helper import parse_args, _unimplemented, _block_list_in_opset, _try_get_scalar_type -from torch.onnx.symbolic_opset9 import _cast_Float +from torch.onnx.symbolic_opset9 import _cast_Float # type: ignore import warnings diff --git a/torch/onnx/symbolic_opset9.py b/torch/onnx/symbolic_opset9.py index e395ce5c703f..8630f48a62ad 100644 --- a/torch/onnx/symbolic_opset9.py +++ b/torch/onnx/symbolic_opset9.py @@ -13,6 +13,8 @@ import torch.onnx.symbolic_helper as sym_help from torch.onnx.symbolic_helper import parse_args, _parse_arg, _unimplemented +from typing import Optional + import numpy import math import warnings @@ -311,7 +313,7 @@ def _maybe_cast_reduce_op_input(g, self): if dtype is not None: # pytorch reduce-ops cast all other integral types to int64 if not sym_help._is_fp(self) and not (dtype == 'Long'): - self = _cast_Long(g, self, False) + self = _cast_Long(g, self, False) # type: ignore return self @@ -2092,7 +2094,7 @@ def _pack_padded_sequence(g, input, lengths, batch_first): # It's really only necessary because those operators expand to something that # only works with int32 types in Caffe2... if lengths.type().scalarType() != 'Int': - lengths = _cast_Int(g, lengths, False) + lengths = _cast_Int(g, lengths, False) # type: ignore return g.op("prim::PackPadded", input, lengths, outputs=2) @@ -2436,7 +2438,7 @@ def _get_arange_dtype(dtype): def masked_fill(g, self, mask, value): - mask = _cast_Bool(g, mask, False) + mask = _cast_Bool(g, mask, False) # type: ignore value = sym_help._maybe_get_scalar(value) return g.op('Where', mask, sym_help._if_scalar_type_as(g, value, self), self) @@ -2734,6 +2736,7 @@ def as_strided(g, self, sizes, strides, offset=None): sizes = sym_help._maybe_get_const(sizes, 'is') rank = len(strides) self_1d = g.op("Reshape", self, g.op("Constant", value_t=torch.tensor([-1], dtype=torch.int64))) + ind: Optional[torch.Tensor] if not sym_help._is_value(sizes): ind = torch.tensor([0], dtype=torch.long) for i, (size, stride) in enumerate(zip(sizes, strides)): diff --git a/torch/onnx/symbolic_registry.py b/torch/onnx/symbolic_registry.py index 48114d6c472b..c059e8f2eb31 100644 --- a/torch/onnx/symbolic_registry.py +++ b/torch/onnx/symbolic_registry.py @@ -1,6 +1,7 @@ import warnings import importlib from inspect import getmembers, isfunction +from typing import Dict, Tuple, Any, Union # The symbolic registry "_registry" is a dictionary that maps operators # (for a specific domain and opset version) to their symbolic functions. @@ -8,9 +9,9 @@ # The keys are tuples (domain, version), (where domain is a string, and version is an int), # and the operator's name (string). 
# The map's entries are as follows : _registry[(domain, version)][op_name] = op_symbolic -_registry = {} +_registry: Dict[Tuple[str, int], Dict] = {} -_symbolic_versions = {} +_symbolic_versions: Dict[Union[int, str], Any] = {} from torch.onnx.symbolic_helper import _onnx_stable_opsets for opset_version in _onnx_stable_opsets: module = importlib.import_module('torch.onnx.symbolic_opset{}'.format(opset_version)) diff --git a/torch/onnx/utils.py b/torch/onnx/utils.py index 5c41306b9ee2..3fe19a56c124 100644 --- a/torch/onnx/utils.py +++ b/torch/onnx/utils.py @@ -18,6 +18,7 @@ from torch.jit import _unique_state_dict from torch.onnx import ONNX_ARCHIVE_MODEL_PROTO_NAME, ExportTypes, OperatorExportTypes, TrainingMode from torch._C import ListType, OptionalType, _propagate_and_assign_input_shapes, _check_onnx_proto +from typing import Union, Tuple, List # the flag to tell the user whether it's in the middle of ONNX export or not @@ -76,7 +77,7 @@ def export(model, args, f, export_params=True, verbose=False, training=None, if aten or export_raw_ir: assert operator_export_type is None assert aten ^ export_raw_ir - operator_export_type = OperatorExportTypes.ATEN if aten else OperatorExportTypes.RAW + operator_export_type = OperatorExportTypes.ONNX_ATEN if aten else OperatorExportTypes.RAW elif operator_export_type is None: if torch.onnx.PYTORCH_ONNX_CAFFE2_BUNDLE: operator_export_type = OperatorExportTypes.ONNX_ATEN_FALLBACK @@ -351,6 +352,7 @@ def _trace_and_get_graph_from_model(model, args): def _create_jit_graph(model, args, _retain_param_name, use_new_jit_passes): torch_out = None + params: Union[List, Tuple] if isinstance(model, torch.jit.ScriptModule): try: graph = model.forward.graph @@ -442,7 +444,7 @@ def _model_to_graph(model, args, verbose=False, param_names = input_and_param_names[len(input_and_param_names) - len(params):] params_dict = dict(zip(param_names, params)) - if training is None or training == TrainingMode.EVAL or (training == TrainingMode.PRESERVE and not is_originally_training): + if training is None or training == TrainingMode.EVAL: params_dict = torch._C._jit_pass_onnx_eval_peephole(graph, params_dict) if do_constant_folding and _export_onnx_opset_version in torch.onnx.constant_folding_opset_versions: @@ -476,7 +478,7 @@ def export_to_pretty_string(model, args, f, export_params=True, verbose=False, t if aten or export_raw_ir: assert operator_export_type is None assert aten ^ export_raw_ir - operator_export_type = OperatorExportTypes.ATEN if aten else OperatorExportTypes.RAW + operator_export_type = OperatorExportTypes.ONNX_ATEN if aten else OperatorExportTypes.RAW elif operator_export_type is None: operator_export_type = OperatorExportTypes.ONNX return _export_to_pretty_string(model, args, f, export_params, verbose, training, @@ -1051,6 +1053,10 @@ def _graph_constant(g, value, dims, type, *args, **kwargs): dims = [1] isscalar = True type = type.lower() + tensor: Union[torch.CharTensor, torch.ShortTensor, + torch.IntTensor, torch.LongTensor, + torch.HalfTensor, torch.FloatTensor, + torch.DoubleTensor] if type == "char": tensor = torch.CharTensor(*dims) elif type == "short": @@ -1068,7 +1074,7 @@ def _graph_constant(g, value, dims, type, *args, **kwargs): else: raise ValueError("Unknown type, type should be one of the following strings: " "char, short, int, long, half, float, double") - tensor.fill_(value) + tensor.fill_(value) # type: ignore if isscalar: return g.op("Constant", *args, value_z=tensor, **kwargs) return g.op("Constant", *args, value_t=tensor, **kwargs) @@ 
-1141,8 +1147,8 @@ def _validate_dynamic_axes(dynamic_axes, model, input_names, output_names): dynamic_axes[key] = value_dict -torch._C.Graph.op = _graph_op -torch._C.Graph.at = _graph_at -torch._C.Block.op = _block_op -torch._C.Graph.constant = _graph_constant -torch._C.Node.__getitem__ = _node_getitem +torch._C.Graph.op = _graph_op # type: ignore +torch._C.Graph.at = _graph_at # type: ignore +torch._C.Block.op = _block_op # type: ignore +torch._C.Graph.constant = _graph_constant # type: ignore +torch._C.Node.__getitem__ = _node_getitem # type: ignore From 36df25334f89aca54232a3947cdaaaa066f289ac Mon Sep 17 00:00:00 2001 From: X Wang <24860335+xwang233@users.noreply.github.com> Date: Mon, 7 Dec 2020 08:24:16 -0800 Subject: [PATCH 007/250] Fix incorrect usage of CUDACachingAllocator [v2] (#48817) Summary: This is similar to https://github.com/pytorch/pytorch/issues/46605, where the c10::complex part of the code was not merged yet at that moment. Pull Request resolved: https://github.com/pytorch/pytorch/pull/48817 Reviewed By: malfet Differential Revision: D25333179 Pulled By: ezyang fbshipit-source-id: a92bdad5ad4b36bef7f050b21a59676c38e7b1fc --- aten/src/ATen/cuda/CUDASolver.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/aten/src/ATen/cuda/CUDASolver.cpp b/aten/src/ATen/cuda/CUDASolver.cpp index 00329acda4a9..bcd630a06b9e 100644 --- a/aten/src/ATen/cuda/CUDASolver.cpp +++ b/aten/src/ATen/cuda/CUDASolver.cpp @@ -46,14 +46,14 @@ void getrf>( TORCH_CUSOLVER_CHECK(cusolverDnZgetrf_bufferSize( handle, m, n, reinterpret_cast(dA), ldda, &lwork)); auto& allocator = *::c10::cuda::CUDACachingAllocator::get(); - void* buffer = allocator.allocate(sizeof(cuDoubleComplex) * lwork).get(); + auto dataPtr = allocator.allocate(sizeof(cuDoubleComplex) * lwork); TORCH_CUSOLVER_CHECK(cusolverDnZgetrf( handle, m, n, reinterpret_cast(dA), ldda, - static_cast(buffer), + static_cast(dataPtr.get()), ipiv, info)); } @@ -71,14 +71,14 @@ void getrf>( TORCH_CUSOLVER_CHECK(cusolverDnCgetrf_bufferSize( handle, m, n, reinterpret_cast(dA), ldda, &lwork)); auto& allocator = *::c10::cuda::CUDACachingAllocator::get(); - void* buffer = allocator.allocate(sizeof(cuComplex) * lwork).get(); + auto dataPtr = allocator.allocate(sizeof(cuComplex) * lwork); TORCH_CUSOLVER_CHECK(cusolverDnCgetrf( handle, m, n, reinterpret_cast(dA), ldda, - static_cast(buffer), + static_cast(dataPtr.get()), ipiv, info)); } From 21ba48fe4955f7fe144a1c9dd239726d24ed67cd Mon Sep 17 00:00:00 2001 From: Ivan Kobzarev Date: Mon, 7 Dec 2020 08:43:01 -0800 Subject: [PATCH 008/250] [vulkan] test_app for mobilenetV2 on vulkan api (#48924) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48924 Test Plan: Imported from OSS Reviewed By: SS-JIA Differential Revision: D25365000 Pulled By: IvanKobzarev fbshipit-source-id: 79295b5781d2494681dbb4e4a741de49ff9c058c --- android/test_app/app/build.gradle | 20 +++++----- .../ATen/native/vulkan/ops/Convolution.cpp | 37 +++++++++++++++---- 2 files changed, 40 insertions(+), 17 deletions(-) diff --git a/android/test_app/app/build.gradle b/android/test_app/app/build.gradle index 37bdb35e2f19..df7b758e3b31 100644 --- a/android/test_app/app/build.gradle +++ b/android/test_app/app/build.gradle @@ -60,20 +60,20 @@ android { //} flavorDimensions "model", "build", "activity" productFlavors { - mbq { + mnet { dimension "model" - applicationIdSuffix ".mbq" - buildConfigField("String", "MODULE_ASSET_NAME", "\"mobilenet2q.pt\"") - addManifestPlaceholders([APP_NAME: "MBQ"]) - 
buildConfigField("String", "LOGCAT_TAG", "\"pytorch-mbq\"") + applicationIdSuffix ".mnet" + buildConfigField("String", "MODULE_ASSET_NAME", "\"mnet.pt\"") + addManifestPlaceholders([APP_NAME: "MNET"]) + buildConfigField("String", "LOGCAT_TAG", "\"pytorch-mnet\"") } - mbvulkan { + mnetVulkan { dimension "model" - applicationIdSuffix ".mbvulkan" - buildConfigField("String", "MODULE_ASSET_NAME", "\"mobilenet2-vulkan.pt\"") + applicationIdSuffix ".mnet_vulkan" + buildConfigField("String", "MODULE_ASSET_NAME", "\"mnet_vulkan.pt\"") buildConfigField("boolean", "USE_VULKAN_DEVICE", 'true') - addManifestPlaceholders([APP_NAME: "MBQ"]) - buildConfigField("String", "LOGCAT_TAG", "\"pytorch-mbvulkan\"") + addManifestPlaceholders([APP_NAME: "MNET_VULKAN"]) + buildConfigField("String", "LOGCAT_TAG", "\"pytorch-mnet-vulkan\"") } resnet18 { dimension "model" diff --git a/aten/src/ATen/native/vulkan/ops/Convolution.cpp b/aten/src/ATen/native/vulkan/ops/Convolution.cpp index a77b1935eda6..5af2c14b80cb 100644 --- a/aten/src/ATen/native/vulkan/ops/Convolution.cpp +++ b/aten/src/ATen/native/vulkan/ops/Convolution.cpp @@ -119,17 +119,40 @@ vTensor pack_weights( } // shader KO4C4HW_to_image - float image[4 * C_4][OC_4][KH * KW][4]; - memset(image, 0.f, 16 * C_4 * OC_4 * KH * KW * sizeof(float)); + struct Image3D { + float* data_; + uint32_t dim0_, dim1_, dim2_; + + Image3D(uint32_t dim0, uint32_t dim1, uint32_t dim2) { + dim0_ = dim0; + dim1_ = dim1; + dim2_ = dim2; + data_ = new float[dim0 * dim1 * dim2 * 4]; + memset(data_, 0.f, dim0 * dim1 * dim2 * 4 * sizeof(float)); + } + + inline uint32_t idx(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3) { + return i3 + i2 * 4 + i1 * 4 * dim2_ + i0 * 4 * dim2_ * dim1_; + } + + void set(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3, float value) { + data_[idx(i0, i1, i2, i3)] = value; + } + + float get(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3) { + return data_[idx(i0, i1, i2, i3)]; + } + } image{4 * C_4, OC_4, KH * KW}; + for (uint32_t sx = 0; sx < C_4; ++sx) { for (uint32_t sy = 0; sy < OC_4; ++sy) { for (uint32_t sz = 0; sz < (KH * KW); ++sz) { for (uint32_t vi = 0; vi < 4; ++vi) { int bufferVIdx = 4 * sx * KH * KW + 4 * sy * C_4 * KH * KW + 4 * sz; - image[4 * sx + 0][sy][sz][vi] = dst[4 * (bufferVIdx + 0) + vi]; - image[4 * sx + 1][sy][sz][vi] = dst[4 * (bufferVIdx + 1) + vi]; - image[4 * sx + 2][sy][sz][vi] = dst[4 * (bufferVIdx + 2) + vi]; - image[4 * sx + 3][sy][sz][vi] = dst[4 * (bufferVIdx + 3) + vi]; + image.set(4 * sx + 0, sy, sz, vi, dst[4 * (bufferVIdx + 0) + vi]); + image.set(4 * sx + 1, sy, sz, vi, dst[4 * (bufferVIdx + 1) + vi]); + image.set(4 * sx + 2, sy, sz, vi, dst[4 * (bufferVIdx + 2) + vi]); + image.set(4 * sx + 3, sy, sz, vi, dst[4 * (bufferVIdx + 3) + vi]); } } } @@ -143,7 +166,7 @@ vTensor pack_weights( for (uint32_t sy = 0; sy < H; ++sy) { for (uint32_t sz = 0; sz < D; ++sz) { for (uint32_t szvi = 0; szvi < 4; ++szvi) { - dst_weight_ptr[W * sy + sx + (4 * sz + szvi) * W * H] = image[sx][sy][sz][szvi]; + dst_weight_ptr[W * sy + sx + (4 * sz + szvi) * W * H] = image.get(sx, sy, sz, szvi); } } } From f2c3efd51fa7040b6390ee2b483176d97a530102 Mon Sep 17 00:00:00 2001 From: mariosasko Date: Mon, 7 Dec 2020 09:36:20 -0800 Subject: [PATCH 009/250] Fix generator exhaustion in SparseAdam (#47724) Summary: Fixes https://github.com/pytorch/pytorch/issues/47594 Pull Request resolved: https://github.com/pytorch/pytorch/pull/47724 Reviewed By: heitorschueroff Differential Revision: D25304131 Pulled By: albanD fbshipit-source-id: 
67c058b0836b9b4fba4f7b966396e4f3fa61f939 --- torch/optim/sparse_adam.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/torch/optim/sparse_adam.py b/torch/optim/sparse_adam.py index e1315e370269..909aa0c6cc62 100644 --- a/torch/optim/sparse_adam.py +++ b/torch/optim/sparse_adam.py @@ -32,6 +32,8 @@ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8): if not 0.0 <= betas[1] < 1.0: raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + params = list(params) + sparse_params = [] for index, param in enumerate(params): if isinstance(param, dict): From ba6511b304a50ef8261692922a7038ca3eb48dd3 Mon Sep 17 00:00:00 2001 From: Brian Hirsh Date: Mon, 7 Dec 2020 10:37:38 -0800 Subject: [PATCH 010/250] pyi codegen update - remove Declarations.yaml (#48754) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48754 The goal of this PR is to kill Declarations.yaml in the pyi codegen, in favor of native_functions + the existing python object model. **High-level design** Since the python signatures used by the `python_arg_parser` are “supposed” to resemble the corresponding pyi type hint signatures, I re-used the existing python object model that Jiakai defined in `tools/codegen/api/python.py`. This means that the pyi codegen now reads `native_functions.yaml`, parses it into a bunch of `PythonSignatureGroup` objects, and emits corresponding method + function variants of type-hint signatures for each one, respectively into `__init__.pyi` and `_VariableFunctions.pyi`. What makes this uglier is that pyi and the python arg parser have a number of differences in how they’re emitted. I expressed that through a `pyi` flag on the `PythonSignature` dataclass, that tells it whether or not to print itself as a pyi vs. arg_parser signature. One thing worth noting is how pyi generates signatures differently for native / deprecated op signatures. For native ops: - The pyi codegen fuses functional and out variants of each op into a single signature with an optional `out` argument. Ops without an `out` variant just get an ordinary functional signature. - Some ops that fit certain criteria also get a second “varargs” signature - basically ops with a single positional argument of type List[int]. For deprecated signatures: - Functional and out variants are not fused - they each get their own signature entry - There are no varargs signatures This is currently implemented through the `signature_str()` and `signature_str_vararg()` methods on the `PythonSignature`/`PythonSignatureDeprecated` classes. `signature_str()` knows how to print itself with/without out arguments, differently for native/deprecated ops. `signature_str_vararg()` optionally returns a vararg variant of the signature if one exists. **Calling out the gap between python_arg_parser vs. pyi** The two formats are notably different, so I don’t think we can expect to unify them completely. That said, I encountered a number of differences in the pyi codegen that looked wrong- I tried to call them out in the PR, to be removed later. Just as an example, looking at the `svd` signature in the python_arg_parser vs. 
the pyi type hint: python_arg_parser ``` Static PythonArgParser parser({ “svd(Tensor input, bool some=True, bool compute_uv=True, *, TensorList[3] out=None”, }, /*traceable=*/true); ``` Pyi ``` def svd(input: Tensor, some: _bool=True, compute_uv: _bool=True, *, out: Optional[Tensor]=None) -> namedtuple_U_S_V: … ``` The two have obvious syntactic differences that we probably don’t plan on changing: the python_arg_parser doesn’t include `def` or return types, and it includes the type hint before the variable name. But the type of `out` in pyi is probably wrong, since `svd` has multiple output params. I tried to clearly call out any instances of the pyi codegen diverging in a way that looks buggy, so we can clean it up in a later PR (see the comments for details). Another particularly ugly “bug” that I kept in to maintain byte-for-byte compatibility is the fact that the pyi codegen groups operator overloads together. It turns out that the only reason it does this (as far as I can tell) is because is tacks on an out argument to signatures that don’t have one, if ANY overloads of that op have an out variant. E.g. consider the pyi type hints generated for `nanmedian` in `_VF.pyi`: ``` overload def nanmedian(input: Tensor, *, out: Optional[Tensor]=None) -> Tensor: ... overload def nanmedian(input: Tensor, dim: _int, keepdim: _bool=False, *, out: Optional[Tensor]=None) -> namedtuple_values_indices: ... overload def nanmedian(input: Tensor, dim: Union[str, ellipsis, None], keepdim: _bool=False, *, out: Optional[Tensor]=None) -> namedtuple_values_indices: ... ``` And the corresponding native_functions.yaml entries: ``` - func: nanmedian(Tensor self) -> Tensor - func: nanmedian.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices) - func: nanmedian.dim_values(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) - func: nanmedian.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) - func: nanmedian.names_dim_values(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) ``` Signature 2 corresponds to entries 2 and 3 in native_functions, and Signature 3 corresponds to entries 4 and 5. But signature 1 has an optional out argument, even though entry 1 in native_functions.yaml has no out variant. I’d like to delete that logic in a later PR- that will also have the added benefit no longer requiring to group overloads together in the pyi codegen. We can just operate independently on each PythonSignatureGroup. **More detailed accounting of the changes** Per file: gen_python_functions.py - `load_signatures()` can now skip deprecated signatures. Needed because pyi only includes deprecated functions, and skips their method variants (maybe we should add them in…?) - Moved `namedtuple_fieldnames` into python.cpp - `group_overloads()` can now opt to not sort the overloads (needed for byte-for-byte compact, pyi doesn’t sort for some reason) Python.py: - Gave `PythonSignature`and `PythonSignatureDeprecated` a `pyi` flag that tells it whether or not to print itself in pyi vs. python_arg_parser format - Added a `PythonReturns` dataclass , which is now a member of PythonSignature. It is only used by pyi. I found this useful because python returns need to know how to deal with named tuple returns properly. 
I also moved `namedtuple_fieldnames` into this file from gen_python_functions gen_pyi.py - Merged `get_py_torch_functions` and `get_py_variable_methods` into a single function, since they’re very similar - Lifted out all of the pyi type hint type-mapping mess and dropped it into python.py. This required updating the mapping to deal with NativeFunction objects instead of the outputs of Declarations.yaml (this was most of the logic in `type_to_python`, `arg_to_type_hint`, and `generate_type_hints`). `generate_type_hints` is now a small orchestration function that gathers the different signatures for each PythonSignatureGroup. - NamedTuples are now generated by calling `PythonReturn.named_tuple()` (in `generate_named_tuples()`), rather than appending to a global list A lot of hardcoded pyi signatures still live in `gen_pyi.py`. I didn’t look to closely into whether or not any of that can be removed as part of this PR. Test Plan: Imported from OSS Reviewed By: ljk53 Differential Revision: D25343802 Pulled By: bdhirsh fbshipit-source-id: f73e99e1afef934ff41e4aca3dabf34273459a52 --- .jenkins/pytorch/codegen-test.sh | 2 + mypy-strict.ini | 1 + tools/autograd/gen_python_functions.py | 47 ++- tools/autograd/utils.py | 16 +- tools/codegen/api/python.py | 277 +++++++++++++++-- tools/pyi/gen_pyi.py | 397 +++++++------------------ torch/CMakeLists.txt | 4 +- 7 files changed, 381 insertions(+), 363 deletions(-) diff --git a/.jenkins/pytorch/codegen-test.sh b/.jenkins/pytorch/codegen-test.sh index 0f015df045c2..44f1e9449bf0 100755 --- a/.jenkins/pytorch/codegen-test.sh +++ b/.jenkins/pytorch/codegen-test.sh @@ -38,6 +38,8 @@ mkdir -p "$OUT"/pyi/torch/_C mkdir -p "$OUT"/pyi/torch/nn python -m tools.pyi.gen_pyi \ --declarations-path "$OUT"/torch/share/ATen/Declarations.yaml \ + --native-functions-path aten/src/ATen/native/native_functions.yaml \ + --deprecated-functions-path tools/autograd/deprecated.yaml \ --out "$OUT"/pyi # autograd codegen (called by torch codegen but can run independently) diff --git a/mypy-strict.ini b/mypy-strict.ini index 42fc73abf1cc..ddd369ebe621 100644 --- a/mypy-strict.ini +++ b/mypy-strict.ini @@ -35,6 +35,7 @@ files = tools/codegen/gen.py, tools/autograd/gen_trace_type.py, tools/autograd/gen_variable_factories.py, tools/autograd/load_derivatives.py, + tools/pyi/gen_pyi.py, torch/utils/benchmark/utils/common.py, torch/utils/benchmark/utils/timer.py, torch/utils/benchmark/utils/valgrind_wrapper/*.py, diff --git a/tools/autograd/gen_python_functions.py b/tools/autograd/gen_python_functions.py index 123a47f2aac2..63438a527b4c 100644 --- a/tools/autograd/gen_python_functions.py +++ b/tools/autograd/gen_python_functions.py @@ -193,25 +193,28 @@ def load_signatures( deprecated_yaml_path: str, *, method: bool, + skip_deprecated: bool = False, + pyi: bool = False, ) -> Sequence[PythonSignatureNativeFunctionPair]: native_functions = list(filter(should_generate_py_binding, parse_native_yaml(native_yaml_path))) @with_native_function def gen_signature_pairs(f: NativeFunction) -> PythonSignatureNativeFunctionPair: return PythonSignatureNativeFunctionPair( - signature=signature(f, method=method), + signature=signature(f, method=method, pyi=pyi), function=f, ) pairs = list(map(gen_signature_pairs, native_functions)) - deprecated = load_deprecated_signatures(pairs, deprecated_yaml_path, method=method) - return pairs + deprecated + deprecated = load_deprecated_signatures(pairs, deprecated_yaml_path, method=method, pyi=pyi) + return pairs if skip_deprecated else pairs + deprecated def 
load_deprecated_signatures( pairs: Sequence[PythonSignatureNativeFunctionPair], deprecated_yaml_path: str, *, method: bool, + pyi: bool, ) -> List[PythonSignatureNativeFunctionPair]: # The deprecated.yaml doesn't have complete type information, we need # find and leverage the original ATen signature (to which it delegates @@ -225,6 +228,10 @@ def signature_original(f: NativeFunction) -> str: opname = str(f.func.name.name.base) if f.func.is_out_fn(): opname += '_out' + # TODO: remove HACK + # I think we want to differentiate inplace functions here.. but we currently don't for the arg parser + if f.func.name.name.inplace and pyi: + opname += '_' args = CppSignatureGroup.from_schema(f.func, method=False).signature.arguments() # Simply ignore TensorOptionsArguments as it does not exist in deprecated.yaml. types = ', '.join(argument_type_str(a.argument.type) @@ -308,6 +315,7 @@ def signature_deprecated(opname: str, params: List[str], call_args: List[str]) - method=python_sig.method, deprecated_args_names=tuple(args), deprecated_args_exprs=tuple(call_args), + returns=python_sig.returns, ), function=pair.function, )) @@ -320,31 +328,10 @@ def signature_deprecated(opname: str, params: List[str], call_args: List[str]) - # # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# TODO: remove the copy of this method in 'tools/pyi/gen_pyi.py'. -@with_native_function -def namedtuple_fieldnames(f: NativeFunction) -> List[str]: - returns = f.func.returns - if len(returns) <= 1 or all(map(lambda r: r.name is None, returns)): - return [] - else: - if any(map(lambda r: r.name is None, returns)): - # When building on Windows, `PyStructSequence_UnnamedField` could not be - # resolved by the linker for some reason, which cause error in building: - # - # python_nn_functions.cpp.obj : error LNK2001: unresolved external symbol - # PyStructSequence_UnnamedField - # - # Thus, at this point in time, we do not support unnamed - # fields in namedtuple; you must either name all fields, - # or none of them. - raise ValueError("Unnamed field is not supported by codegen") - - return list(map(lambda r: str(r.name), returns)) - @with_native_function def gen_namedtuple_typename_key(f: NativeFunction) -> str: name = cpp.name(f.func) - fieldnames = namedtuple_fieldnames(f) + fieldnames = namedtuple_fieldnames(f.func.returns) return '_'.join([name] + fieldnames) def emit_namedtuple_typedefs( @@ -360,7 +347,7 @@ def emit_namedtuple_typedefs( typedefs: List[str] = [] # typedef declarations and init code for overload in overloads: - fieldnames = namedtuple_fieldnames(overload.function) + fieldnames = namedtuple_fieldnames(overload.function.func.returns) if not fieldnames: continue @@ -651,7 +638,9 @@ def method_def( # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # def group_overloads( - overloads: Sequence[PythonSignatureNativeFunctionPair] + overloads: Sequence[PythonSignatureNativeFunctionPair], + *, + sort: bool = True, ) -> Sequence[PythonSignatureGroup]: bases: Dict[str, PythonSignatureNativeFunctionPair] = {} outplaces: Dict[str, PythonSignatureNativeFunctionPair] = {} @@ -700,7 +689,9 @@ def group_overloads( outplace=outplace.function if outplace is not None else None, )) - return sort_overloads(grouped) + # TODO: unconditionally sort + # maintaining byte-for-byte compatibility for pyi codegen for now + return grouped if not sort else sort_overloads(grouped) # This function declares a partial order on declarations, and sorts them according # to its linear extension. 
This is necessary, because there's some ambiguity in the diff --git a/tools/autograd/utils.py b/tools/autograd/utils.py index 5c0fcccc4c78..b4889d219e9c 100644 --- a/tools/autograd/utils.py +++ b/tools/autograd/utils.py @@ -1,8 +1,8 @@ import re import os import yaml -from collections import defaultdict from .nested_dict import nested_dict +from typing import Dict, List __all__ = [ @@ -52,7 +52,7 @@ def uninplace_api_name(api_name): return api_name -def write(dirname, name, template, env): +def write(dirname: str, name: str, template: CodeTemplate, env: Dict[str, List[str]]) -> None: env['generated_comment'] = GENERATED_COMMENT.substitute(filename=template.filename) path = os.path.join(dirname, name) # See Note [Unchanging results for ninja] @@ -69,12 +69,6 @@ def write(dirname, name, template, env): else: print("Skipped writing {}".format(path)) -def is_tensor_method(declaration): - return 'Tensor' in declaration['method_of'] - -def is_torch_function(declaration): - return 'namespace' in declaration['method_of'] - def is_out_variant(decl): return decl['name'].endswith('_out') @@ -92,12 +86,6 @@ def load_op_list_and_strip_overload(op_list, op_list_path): # strip out the overload part return {opname.split('.', 1)[0] for opname in op_list} -def group_declarations_by_op_name(declarations): - groups = defaultdict(list) - for d in declarations: - groups[op_name(d)].append(d) - return groups - def is_output(arg): return arg.get('output', False) diff --git a/tools/codegen/api/python.py b/tools/codegen/api/python.py index 4b407d45553a..26b0f8eb8076 100644 --- a/tools/codegen/api/python.py +++ b/tools/codegen/api/python.py @@ -173,6 +173,53 @@ # } # +# TODO: stick this more firmly in the data model somewhere? +def namedtuple_fieldnames(returns: Tuple[Return, ...]) -> List[str]: + if len(returns) <= 1 or all(map(lambda r: r.name is None, returns)): + return [] + else: + if any(map(lambda r: r.name is None, returns)): + # When building on Windows, `PyStructSequence_UnnamedField` could not be + # resolved by the linker for some reason, which cause error in building: + # + # python_nn_functions.cpp.obj : error LNK2001: unresolved external symbol + # PyStructSequence_UnnamedField + # + # Thus, at this point in time, we do not support unnamed + # fields in namedtuple; you must either name all fields, + # or none of them. + raise ValueError("Unnamed field is not supported by codegen") + + return list(map(lambda r: str(r.name), returns)) + +@dataclass(frozen=True) +class PythonReturns: + returns: Tuple[Return, ...] 
+ + def named_tuple_pyi(self) -> Optional[Tuple[str, str]]: + python_returns = [argument_type_str_pyi(r.type) for r in self.returns] + field_names = namedtuple_fieldnames(self.returns) + if field_names: + namedtuple_name = '_'.join(['namedtuple'] + field_names) + tuple_args = [f'("{name}", {typ})' for name, typ in zip(field_names, python_returns)] + namedtuple_def = f'NamedTuple("{namedtuple_name}", [{", ".join(tuple_args)}])' + return namedtuple_name, namedtuple_def + return None + + def returns_str_pyi(self) -> str: + named_tuple = self.named_tuple_pyi() + if named_tuple is not None: + namedtuple_name, _ = named_tuple + return namedtuple_name + + python_returns = [argument_type_str_pyi(r.type) for r in self.returns] + if len(python_returns) > 1: + return 'Tuple[' + ', '.join(python_returns) + ']' + if len(python_returns) == 1: + return python_returns[0] + return 'None' + + @dataclass(frozen=True) class PythonArgument: name: str @@ -189,26 +236,56 @@ class PythonArgument: # Compute argument formal for python argument parsing. # Needs to be consistent with torch/csrc/utils/python_arg_parser.h. - def argument_str(self, *, method: bool = False) -> str: - type_str = argument_type_str(self.type) + def argument_str(self, *, method: bool = False, pyi: bool = False, deprecated: bool = False) -> str: + type_str = argument_type_str_pyi(self.type, pyi_out_arg=pyi and isinstance(self, PythonOutArgument)) \ + if pyi else argument_type_str(self.type) + name = self.name # s/self/input/ outside method bindings # [old codegen] TODO: remove this? doesn't rename in codegen, it's just # for the parse string - name = self.name - if name == 'self' and type_str == 'Tensor' and not method: + if name == 'self' and type_str == 'Tensor' and not method and not deprecated: name = 'input' + if pyi: + if name == 'from': # from is a Python keyword... + name += '_' + # pyi merges the _out and functional variants into the same signature, with an optional out arg + if name == 'out' and type_str == 'Tensor' and not deprecated: + type_str = 'Optional[' + type_str + ']' + + # TODO: remove diff. 
pyi deprecated signatures don't get defaults for their out arg + treat_as_no_default = pyi and deprecated and isinstance(self, PythonOutArgument) and self.default == 'None' + # add default - if self.default is not None: - default = { - 'nullptr': 'None', - 'c10::nullopt': 'None', - '{}': 'None', - }.get(self.default, self.default) - return f'{type_str} {name}={default}' + if self.default is not None and not treat_as_no_default: + if pyi: + if isinstance(self.type, ListType) and self.type.elem == BaseType(BaseTy.int) and \ + self.default.startswith('{') and self.default.endswith('}'): + default = '(' + self.default[1:-1] + ')' + else: + default = { + 'nullptr': 'None', + 'c10::nullopt': 'None', + '{}': 'None', + 'MemoryFormat::Contiguous': 'contiguous_format', + 'QScheme::PER_TENSOR_AFFINE': 'per_tensor_affine', + }.get(self.default, self.default) + # TODO: remove requires_grad special case (byte-for-byte compat) + return f'{name}:{type_str}={default}' if name == 'requires_grad' else f'{name}: {type_str}={default}' + else: + default = { + 'nullptr': 'None', + 'c10::nullopt': 'None', + '{}': 'None', + }.get(self.default, self.default) + return f'{type_str} {name}={default}' else: - return f'{type_str} {name}' + if pyi: + # TODO: remove requires_grad special case (byte-for-byte compat) + return f'{name}:{type_str}' if name == 'requires_grad' else f'{name}: {type_str}' + else: + return f'{type_str} {name}' @dataclass(frozen=True) class PythonOutArgument(PythonArgument): @@ -238,6 +315,7 @@ def from_outputs(outputs: Tuple[PythonArgument, ...]) -> Optional['PythonOutArgu raise RuntimeError(f'Unsupported output type: {outputs}') return PythonOutArgument( name='out', + # TODO: shouldn't this be OptionalType[ListType[...]], since it defaults to None? type=ListType(BaseType(BaseTy.Tensor), size), default='None', default_init=None, @@ -260,6 +338,9 @@ class PythonSignature: output_args: Optional[PythonOutArgument] + # Return types, which are only used by pyi + returns: PythonReturns + # These are scattered kwargs arguments belonging to TensorOptions. # When binding to C++, they are packed into a TensorOptions object 'options'. # It's possible that the C++ signature doesn't take TensorOptions object (e.g. @@ -276,13 +357,23 @@ def deprecated(self) -> bool: return False def arguments( - self, *, skip_outputs: bool = False, skip_tensor_options: bool = False + self, *, skip_outputs: bool = False, skip_tensor_options: bool = False, hacky_add_output: bool = False ) -> Tuple[Union[PythonArgument, PythonOutArgument], ...]: result: List[Union[PythonArgument, PythonOutArgument]] = [] result.extend(self.input_args) result.extend(self.input_kwargs) if self.output_args is not None and not skip_outputs: result.append(self.output_args) + # TODO: remove HACK + # in the existing pyi codegen, we tack on an optional out argument to every operator overload + # if there exists at least one overload with an out variant. This seems wrong. + elif hacky_add_output: + result.extend([PythonOutArgument( + name='out', + type=OptionalType(BaseType(BaseTy.Tensor)), + default='None', + default_init=None, + outputs=())]) if not skip_tensor_options: result.extend(self.tensor_options_args) return tuple(result) @@ -301,18 +392,57 @@ def output_idx(self) -> int: # for error parsing. # # For a translation to mypy-valid type signatures, see - # tools/gen_pyi.py. If you change any logic here, please + # signature_str_pyi. If you change any logic here, please # check that file too. 
def signature_str(self, *, skip_outputs: bool = False) -> str: - schema_formals: List[str] = \ - list(map(lambda a: a.argument_str(method=self.method), - self.arguments(skip_outputs=skip_outputs))) + args = self.arguments(skip_outputs=skip_outputs) + schema_formals: List[str] = list(map(lambda a: a.argument_str(method=self.method), args)) positional_argc = len(self.input_args) if len(schema_formals) > positional_argc: schema_formals.insert(positional_argc, '*') return f'{self.name}({", ".join(schema_formals)})' + def signature_str_pyi(self, *, skip_outputs: bool = False, hacky_add_output: bool = False) -> str: + args = self.arguments(skip_outputs=skip_outputs, hacky_add_output=hacky_add_output) + schema_formals: List[str] = list(map(lambda a: a.argument_str(method=self.method, pyi=True), args)) + positional_argc = len(self.input_args) + if len(schema_formals) > positional_argc: + schema_formals.insert(positional_argc, '*') + + # only pyi signatures include returns + returns_str = self.returns.returns_str_pyi() + # pyi also includes self (with no typing/defaults) for methods + if self.method: + schema_formals.insert(0, "self") + return f'def {self.name}({", ".join(schema_formals)}) -> {returns_str}: ...' + + def signature_str_pyi_vararg(self, *, skip_outputs: bool = False, hacky_add_output: bool = False) -> Optional[str]: + # only pyi uses vararg signatures + args = self.arguments(skip_outputs=skip_outputs, hacky_add_output=hacky_add_output) + schema_formals: List[str] = list(map(lambda a: a.argument_str(method=self.method, pyi=True), args)) + # vararg only applies to pyi signatures. vararg variants are not generated for all signatures + num_args = self.arguments_count() + num_positionalargs = len(self.input_args) + + have_vararg_version = False + if num_args > 0: + vararg_type = args[0].type + if isinstance(vararg_type, ListType) and str(vararg_type.elem) == 'int' and num_positionalargs == 1: + have_vararg_version = True + + if not have_vararg_version: + return None + # Below are the major changes in vararg vs. regular pyi signatures + # vararg signatures also omit the asterix + schema_formals[0] = '*' + args[0].name + ': _int' + + returns_str = self.returns.returns_str_pyi() + # pyi also includes self (with no typing/defaults) for methods + if self.method: + schema_formals.insert(0, "self") + return f'def {self.name}({", ".join(schema_formals)}) -> {returns_str}: ...' + # The deprecated python signature involves some special logic, so create a # dedicated data model to store these extra properties. @dataclass(frozen=True) @@ -340,6 +470,20 @@ def deprecated(self) -> bool: def signature_str(self, *, skip_outputs: bool = False) -> str: return PythonSignature.signature_str(self, skip_outputs=skip_outputs) + '|deprecated' + def signature_str_pyi(self, *, skip_outputs: bool = False, hacky_add_output: bool = False) -> str: + args = self.arguments(skip_outputs=skip_outputs, hacky_add_output=hacky_add_output) + schema_formals: List[str] = list(map(lambda a: a.argument_str(method=self.method, pyi=True, deprecated=True), args)) + positional_argc = len(self.input_args) + if len(schema_formals) > positional_argc: + schema_formals.insert(positional_argc, '*') + + returns_str = self.returns.returns_str_pyi() + return f'def {self.name}({", ".join(schema_formals)}) -> {returns_str}: ...' 
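    # Illustrative note (hypothetical example): for a native schema along the lines of
    # `permute(Tensor(a) self, int[] dims)`, signature_str_pyi and signature_str_pyi_vararg
    # above would emit the pair of hints
    #   def permute(self, dims: _size) -> Tensor: ...
    #   def permute(self, *dims: _int) -> Tensor: ...
    # Deprecated signatures only ever get the first form; the vararg variant is skipped below.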
+ + def signature_str_pyi_vararg(self, *, skip_outputs: bool = False, hacky_add_output: bool = False) -> Optional[str]: + # the codegen doesn't include vararg variants for deprecated signatures + return None + # This struct is used to hold the PythonSignature and its corresponding # NativeFunction BEFORE grouping base and out-variant functions. # Why not store NativeFunction in PythonSignature or construct PythonSignature @@ -520,12 +664,75 @@ def argument(a: Argument) -> PythonArgument: default_init=None, ) -def signature(f: NativeFunction, *, method: bool = False) -> PythonSignature: +def argument_type_str_pyi(t: Type, *, pyi_out_arg: bool = False) -> str: + add_optional = False + if isinstance(t, OptionalType): + t = t.elem + add_optional = True + + if isinstance(t, BaseType): + if t.name == BaseTy.int: + ret = '_int' + elif t.name == BaseTy.float: + ret = '_float' + elif t.name == BaseTy.str: + ret = 'str' + elif t.name == BaseTy.Scalar: + ret = 'Number' + elif t.name == BaseTy.ScalarType: + ret = '_dtype' + elif t.name == BaseTy.bool: + ret = '_bool' + elif t.name == BaseTy.QScheme: + ret = '_qscheme' + elif t.name == BaseTy.Layout: + ret = '_layout' + elif t.name == BaseTy.Device: + ret = 'Union[_device, str, None]' + elif t.name == BaseTy.MemoryFormat: + ret = 'memory_format' + elif t.name == BaseTy.Dimname: + ret = 'Union[str, ellipsis, None]' + elif t.name in [BaseTy.Tensor, BaseTy.Generator, + BaseTy.Storage, BaseTy.Stream, BaseTy.str]: + # These python schema type names line up with their function schema names + ret = t.name.name + + elif isinstance(t, ListType): + if pyi_out_arg and t.is_tensor_like(): + # TODO remove HACK + # pyi blindly treats all tensor-like out args as having type Tensor + return 'Tensor' + if str(t.elem) == 'int': + ret = 'Union[_int, _size]' if t.size is not None else '_size' + elif t.is_tensor_like(): + # TODO: this doesn't seem right... + # Tensor?[] currently translates to Optional[Union[Tuple[Tensor, ...], List[Tensor]]] + # It should probably translate to Union[Tuple[Optional[Tensor], ...], List[Optional[Tensor]]] + if isinstance(t.elem, OptionalType): + add_optional = True + ret = 'Union[Tensor, Tuple[Tensor, ...], List[Tensor]]' if t.size is not None else \ + 'Union[Tuple[Tensor, ...], List[Tensor]]' + elif str(t.elem) == 'float': + ret = 'Sequence[float]' + else: + elem = argument_type_str_pyi(t.elem) + ret = f'Sequence[{elem}]' + + if add_optional: + ret = 'Optional[' + ret + ']' + return ret + + raise RuntimeError(f'unrecognized type {repr(t)}') + +# Generates a PythonSignature that can be used for either .pyi or PythonArgParser codegen +def signature(f: NativeFunction, *, method: bool = False, pyi: bool = False) -> PythonSignature: # Use cpp api to gather TensorOptions fields from kwargs. - # Skip ThisArgument if this is method signature. + # Skip SelfArgument if this is method. # Skip TensorOptionsArguments in C++ signature. Python side TensorOptions # arguments are created based on different rules - see below. 
- args = tuple(a for a in cpp.group_arguments(f.func, method=method) if isinstance(a, Argument)) + cpp_args = cpp.group_arguments(f.func, method=method) + args = tuple(a for a in cpp_args if isinstance(a, Argument)) input_arg_set = set(a.name for a in f.func.arguments.positional) kwarg_only_set = set(a.name for a in f.func.arguments.kwarg_only) @@ -561,13 +768,15 @@ def signature(f: NativeFunction, *, method: bool = False) -> PythonSignature: tensor_options_args.append(PythonArgument( name='dtype', type=BaseType(BaseTy.ScalarType), - default=_dtype_default_type_hack(name), + default=_dtype_default_type_hack(name, pyi=pyi), default_init='self.scalar_type()' if is_like_or_new_function else None, )) + # TODO: probably a bug, kill this diff? + # pyi signatures have a slightly different type/default for layout tensor_options_args.append(PythonArgument( name='layout', - type=OptionalType(BaseType(BaseTy.Layout)), - default='torch.strided', + type=BaseType(BaseTy.Layout) if pyi else OptionalType(BaseType(BaseTy.Layout)), + default='strided' if pyi else 'torch.strided', default_init='layout_from_backend(self.options().backend())' if is_like_or_new_function else None, )) tensor_options_args.append(PythonArgument( @@ -576,12 +785,15 @@ def signature(f: NativeFunction, *, method: bool = False) -> PythonSignature: default='None', default_init='self.device()' if is_like_or_new_function else None, )) - tensor_options_args.append(PythonArgument( - name='pin_memory', - type=BaseType(BaseTy.bool), - default='False', - default_init=None, - )) + # TODO: probably a bug, kill this diff? + # pyi signatures don't include pin memory + if not pyi: + tensor_options_args.append(PythonArgument( + name='pin_memory', + type=BaseType(BaseTy.bool), + default='False', + default_init=None, + )) tensor_options_args.append(PythonArgument( name='requires_grad', type=BaseType(BaseTy.bool), @@ -589,18 +801,21 @@ def signature(f: NativeFunction, *, method: bool = False) -> PythonSignature: default_init=None, )) + returns = PythonReturns(returns=f.func.returns) + return PythonSignature( name=str(f.func.name.name), input_args=input_args, input_kwargs=input_kwargs, output_args=PythonOutArgument.from_outputs(outputs), tensor_options_args=tuple(tensor_options_args), + returns=returns, method=method, ) # TODO blowtorch -def _dtype_default_type_hack(name: str) -> str: - if name.startswith('randperm') or name == 'tril_indices' or name == 'triu_indices': +def _dtype_default_type_hack(name: str, *, pyi: bool) -> str: + if not pyi and (name.startswith('randperm') or name == 'tril_indices' or name == 'triu_indices'): return 'torch.int64' else: return 'None' diff --git a/tools/pyi/gen_pyi.py b/tools/pyi/gen_pyi.py index 617f997a8d76..ee5c38a4cf1c 100644 --- a/tools/pyi/gen_pyi.py +++ b/tools/pyi/gen_pyi.py @@ -3,13 +3,14 @@ import collections from pprint import pformat -import yaml -import re import argparse -from ..autograd.utils import YamlLoader, CodeTemplate, write, group_declarations_by_op_name, is_tensor_method, is_torch_function -from ..autograd.gen_python_functions import SKIP_PYTHON_BINDINGS, SKIP_PYTHON_BINDINGS_SIGNATURES -from ..autograd.gen_autograd import load_aten_declarations +from tools.codegen.model import * +from tools.codegen.api.python import * +from typing import Sequence, List, Mapping, Dict + +from ..autograd.utils import CodeTemplate, write +from ..autograd.gen_python_functions import should_generate_py_binding, load_signatures, group_overloads """ This module implements generation of type stubs for PyTorch, @@ 
-28,60 +29,48 @@ (the latter case should be pretty rare). - We go through automatically bound functions based on the - type information recorded in Declarations.yaml and + type information recorded in native_functions.yaml and generate type hints for them (generate_type_hints) There are a number of type hints which we've special-cased; read gen_pyi for the gory details. """ -# TODO: remove after migrating entire codegen to the new data model. -def should_generate_python_binding(declaration): - name = declaration['name'] - for pattern in SKIP_PYTHON_BINDINGS: - if re.match('^' + pattern + '$', name): - return False - - simple_types = [arg['simple_type'] for arg in declaration['arguments']] - signature = '{}({})'.format(name, ', '.join(simple_types)) - for pattern in SKIP_PYTHON_BINDINGS_SIGNATURES: - if pattern == signature: - return False - - return True - - -def get_py_variable_methods(declarations): +# TODO: consider waiting to group by base name until we actually need to +# (after computing type hint signatures, when adding @overload directives) +def group_by_base_name(python_funcs: Sequence[PythonSignatureNativeFunctionPair]) -> Mapping[str, List[PythonSignatureGroup]]: + groups = group_overloads(python_funcs, sort=False) + d = collections.defaultdict(list) + for g in groups: + name = g.signature.name + d[name].append(g) + return d + +def get_py_torch_functions( + python_funcs: Sequence[PythonSignatureNativeFunctionPair], + method: bool = False, +) -> Mapping[str, Sequence[PythonSignatureGroup]]: """ Get declarations (grouped by name) which should be generated - as methods on Tensor. + as either functions in the "torch" module or methods on Tensor. """ - def should_bind(declaration): - return (should_generate_python_binding(declaration) and - not declaration.get('python_module') and - is_tensor_method(declaration)) + def should_bind_function(python_func: PythonSignatureNativeFunctionPair) -> bool: + return (should_generate_py_binding(python_func.function) and + not python_func.function.python_module and + Variant.function in python_func.function.variants) - return group_declarations_by_op_name([d for d in declarations if should_bind(d)]) + def should_bind_method(python_func: PythonSignatureNativeFunctionPair) -> bool: + return (should_generate_py_binding(python_func.function) and + not python_func.function.python_module and + Variant.method in python_func.function.variants) - -def get_py_torch_functions(declarations): - """ - Get declarations (grouped by name) which should be generated - as functions in the "torch" module. - """ - def should_bind(declaration): - return (should_generate_python_binding(declaration) and - not declaration.get('python_module') and - is_torch_function(declaration)) - - return group_declarations_by_op_name([d for d in declarations if should_bind(d)]) + should_bind = should_bind_method if method else should_bind_function + return group_by_base_name([f for f in python_funcs if should_bind(f)]) # TODO: Consider defining some aliases for our Union[...] types, to make # the stubs to read on the human eye. -needed_modules = set() - DEVICE_PARAM = "device: Union[_device, str, None]=None" FACTORY_PARAMS = f"dtype: Optional[_dtype]=None, {DEVICE_PARAM}, requires_grad: _bool=False" @@ -144,90 +133,6 @@ def should_bind(declaration): ] -def type_to_python(typename, size=None): - """type_to_python(typename: str, size: str) -> str - - Transforms a Declarations.yaml type name into a Python type specification - as used for type hints. 
- """ - typename = typename.replace(' ', '') # normalize spaces, e.g., 'Generator *' - - # Disambiguate explicitly sized int/tensor lists from implicitly - # sized ones. These permit non-list inputs too. (IntArrayRef[] and - # TensorList[] are not real types; this is just for convenience.) - if typename in {'IntArrayRef', 'TensorList'} and size is not None: - typename += '[]' - - typename = { - 'Device': 'Device', - 'Generator': 'Generator', - 'IntegerTensor': 'Tensor', - 'Scalar': 'Number', - 'ScalarType': '_dtype', - 'Storage': 'Storage', - 'BoolTensor': 'Tensor', - 'IndexTensor': 'Tensor', - 'Tensor': 'Tensor', - 'MemoryFormat': 'memory_format', - 'IntArrayRef': '_size', - 'IntArrayRef[]': 'Union[_int, _size]', - 'TensorList': 'Union[Tuple[Tensor, ...], List[Tensor]]', - 'TensorList[]': 'Union[Tensor, Tuple[Tensor, ...], List[Tensor]]', - 'bool': '_bool', - 'double': '_float', - 'int64_t': '_int', - 'accreal': 'Number', - 'real': 'Number', - 'void*': '_int', # data_ptr - 'void': 'None', - 'std::string': 'str', - 'Dimname': 'Union[str, ellipsis, None]', - 'DimnameList': 'Sequence[Union[str, ellipsis, None]]', - 'QScheme': '_qscheme', - 'ArrayRef' : 'Sequence[float]', - 'Stream': 'Stream', - }[typename] - - return typename - - -def arg_to_type_hint(arg): - """arg_to_type_hint(arg) -> str - - This takes one argument in a Declarations and returns a string - representing this argument in a type hint signature. - """ - name = arg['name'] - if name == 'from': # from is a Python keyword... - name += '_' - typename = type_to_python(arg['dynamic_type'], arg.get('size')) - if arg.get('is_nullable'): - typename = 'Optional[' + typename + ']' - if 'default' in arg: - default = arg['default'] - if default == 'nullptr': - default = None - elif default == 'c10::nullopt': - default = None - elif isinstance(default, str) and default.startswith('{') and default.endswith('}'): - if arg['dynamic_type'] == 'Tensor' and default == '{}': - default = None - elif arg['dynamic_type'] == 'Generator' and default == '{}': - default = None - elif arg['dynamic_type'] == 'IntArrayRef': - default = '(' + default[1:-1] + ')' - else: - raise Exception("Unexpected default constructor argument of type {}".format(arg['dynamic_type'])) - elif default == 'MemoryFormat::Contiguous': - default = 'contiguous_format' - elif default == 'QScheme::PER_TENSOR_AFFINE': - default = 'per_tensor_affine' - default = '={}'.format(default) - else: - default = '' - return name + ': ' + typename + default - - binary_ops = ('add', 'sub', 'mul', 'div', 'pow', 'lshift', 'rshift', 'mod', 'truediv', 'matmul', 'floordiv', 'radd', 'rsub', 'rmul', 'rtruediv', 'rfloordiv', 'rpow', # reverse arithmetic @@ -241,7 +146,7 @@ def arg_to_type_hint(arg): all_ops = binary_ops + comparison_ops + unary_ops + to_py_type_ops -def sig_for_ops(opname): +def sig_for_ops(opname: str) -> List[str]: """sig_for_ops(opname : str) -> List[str] Returns signatures for operator special functions (__add__ etc.)""" @@ -271,146 +176,66 @@ def sig_for_ops(opname): else: raise Exception("unknown op", opname) - -# Copied from 'gen_python_functions.py' -# TODO: consolidate after migrating to the new codegen model in 'tools/codegen'. 
-def namedtuple_fieldnames(declaration): - returns = declaration['returns'] - if len(returns) <= 1 or all(['field_name' not in x for x in returns]): - return [] - else: - def get_field_name(x): - # See Note [field_name versus name] - if 'field_name' not in x: - # When building on Windows, `PyStructSequence_UnnamedField` could not be - # resolved by the linker for some reason, which cause error in building: - # - # python_nn_functions.cpp.obj : error LNK2001: unresolved external symbol - # PyStructSequence_UnnamedField - # - # Thus, at this point in time, we do not support unnamed - # fields in namedtuple; you must either name all fields, - # or none of them. - raise ValueError("Unnamed field is not supported by codegen") +def generate_named_tuples(funcs: Sequence[PythonSignatureGroup]) -> Dict[str, str]: + namedtuples: Dict[str, str] = {} + for sig_group in funcs: + named_tuple = sig_group.signature.returns.named_tuple_pyi() + if named_tuple is not None: + tuple_name, tuple_def = named_tuple + if tuple_name in namedtuples: + assert namedtuples[tuple_name] == tuple_def else: - return x['field_name'] - return [get_field_name(x) for x in returns] - + namedtuples[tuple_name] = tuple_def + return namedtuples -def generate_type_hints(fname, decls, namedtuples, is_tensor=False): - """generate_type_hints(fname, decls, is_tensor=False) +def generate_type_hints(funcs: Sequence[PythonSignatureGroup], is_tensor: bool = False) -> List[str]: + """generate_type_hints(funcs, is_tensor=False) Generates type hints for the declarations pertaining to the function - :attr:`fname`. attr:`decls` are the declarations from the parsed - Declarations.yaml. - :attr:`namedtuples` is a dictionary for accumulating NamedTuple definitions. + :attr:`funcs` are the func from the parsed native_functions.yaml. The :attr:`is_tensor` flag indicates whether we are parsing members of the Tensor class (true) or functions in the `torch` namespace (default, false). - - This function currently encodes quite a bit about the semantics of - the translation C++ -> Python. 
""" - if fname in blocklist: - return [] type_hints = [] - dnames = ([d['name'] for d in decls]) - has_out = fname + '_out' in dnames - - if has_out: - decls = [d for d in decls if d['name'] != fname + '_out'] - - for decl in decls: - render_kw_only_separator = True # whether we add a '*' if we see a keyword only argument - python_args = [] - - has_tensor_options = 'TensorOptions' in (a['dynamic_type'] for a in decl['arguments']) - - for a in decl['arguments']: - if a['dynamic_type'] != 'TensorOptions': - if a.get('kwarg_only', False) and render_kw_only_separator: - python_args.append('*') - render_kw_only_separator = False - try: - python_args.append(arg_to_type_hint(a)) - except Exception: - print("Error while processing function {}".format(fname)) - raise - - if 'self: Tensor' in python_args: - self_index = python_args.index('self: Tensor') - python_args.remove('self: Tensor') - if is_tensor: - python_args = ['self'] + python_args - else: - python_args.insert(self_index, 'input: Tensor') - else: - if is_tensor: - raise Exception("method without self is unexpected") - - if has_out: - if render_kw_only_separator: - python_args.append('*') - render_kw_only_separator = False - python_args.append('out: Optional[Tensor]=None') - - if has_tensor_options: - if render_kw_only_separator: - python_args.append('*') - render_kw_only_separator = False - python_args += ["dtype: _dtype=None", - "layout: _layout=strided", - "device: Union[_device, str, None]=None", - "requires_grad:_bool=False"] - - python_args_s = ', '.join(python_args) - python_returns = [type_to_python(r['dynamic_type']) for r in decl['returns']] - field_names = namedtuple_fieldnames(decl) - - if field_names: - namedtuple_name = '_'.join(['namedtuple'] + field_names) - tuple_args = ['("{}", {})'.format(name, typ) for name, typ in zip(field_names, python_returns)] - namedtuple_def = 'NamedTuple("{}", [{}])'.format(namedtuple_name, ', '.join(tuple_args)) - if namedtuple_name in namedtuples: - assert namedtuples[namedtuple_name] == namedtuple_def - else: - namedtuples[namedtuple_name] = namedtuple_def - python_returns_s = namedtuple_name - elif len(python_returns) > 1: - python_returns_s = 'Tuple[' + ', '.join(python_returns) + ']' - elif len(python_returns) == 1: - python_returns_s = python_returns[0] - else: - python_returns_s = 'None' - - type_hint = "def {}({}) -> {}: ...".format(fname, python_args_s, python_returns_s) - numargs = len(decl['arguments']) - vararg_pos = int(is_tensor) - have_vararg_version = (numargs > vararg_pos and - decl['arguments'][vararg_pos]['dynamic_type'] in {'IntArrayRef'} and - (numargs == vararg_pos + 1 or python_args[vararg_pos + 1] == '*') and - (not is_tensor or decl['arguments'][0]['name'] == 'self')) + any_out = any([g for g in funcs if g.outplace is not None]) + + for sig_group in funcs: + # Some deprecated ops that are on the blocklist are still included in pyi + if sig_group.signature.name in blocklist and not sig_group.signature.deprecated: + continue + + # deprecated signatures have separate entries for their functional and out variants + # (as opposed to the native ops, which fuse the two into a single signature). + # generate the functional variant here, if an out variant exists. 
+ if sig_group.signature.deprecated and sig_group.outplace is not None: + type_hint = sig_group.signature.signature_str_pyi(skip_outputs=True) + type_hints.append(type_hint) + # TODO: remove HACK + # the pyi codegen currently adds an optional out param in cases where the current op does NOT have an out variant, + # but an overload of the op DOES have an out variant. + # TODO: After that, we should consider killing this method entirely and operating per PythonSignatureGroup + # rather than grouping their overloads together + # (since there isn't much else semantically meaningful about grouping overloads) + # this hack also doesn't apply to deprecated ops + hacky_add_output = any_out and sig_group.outplace is None and not sig_group.signature.deprecated + # PythonSignatureGroups that have both a functional + out variant get a single signature, with an optional out argument + # Generates the out variant if one exists. Otherwise, generate the functional variant + type_hint = sig_group.signature.signature_str_pyi( + skip_outputs=sig_group.outplace is None, hacky_add_output=hacky_add_output) type_hints.append(type_hint) - if have_vararg_version: - # Two things come into play here: PyTorch has the "magic" that if the first and only positional argument - # is an IntArrayRef, it will be used as a vararg variant. - # The following outputs the vararg variant, the "pass a list variant" is output above. - # The other thing is that in Python, the varargs are annotated with the element type, not the list type. - typelist = decl['arguments'][vararg_pos]['dynamic_type'] - vararg_type = '_int' - # replace first argument and eliminate '*' if present - python_args = ((['self'] if is_tensor else []) + ['*' + decl['arguments'][vararg_pos]['name'] + - ': ' + vararg_type] + python_args[vararg_pos + 2:]) - python_args_s = ', '.join(python_args) - type_hint = "def {}({}) -> {}: ...".format(fname, python_args_s, python_returns_s) - type_hints.append(type_hint) + # Some operators also additionally have a vararg variant of their signature + type_hint_vararg = sig_group.signature.signature_str_pyi_vararg( + skip_outputs=sig_group.outplace is None, hacky_add_output=hacky_add_output) + if type_hint_vararg: + type_hints.append(type_hint_vararg) return type_hints -def gen_nn_functional(out): +def gen_nn_functional(out: str) -> None: # Functions imported into `torch.nn.functional` from `torch`, perhaps being filtered # through an `_add_docstr` call imports = [ @@ -475,10 +300,10 @@ def gen_nn_functional(out): stubs = CodeTemplate.from_file(os.path.join('torch', '_C', '_nn.pyi.in')) write(out, 'torch/_C/_nn.pyi', stubs, env) -def gen_nn_pyi(out): +def gen_nn_pyi(out: str) -> None: gen_nn_functional(out) -def gen_pyi(declarations_path, out): +def gen_pyi(native_yaml_path: str, deprecated_yaml_path: str, out: str) -> None: """gen_pyi() This function generates a pyi file for torch. @@ -491,16 +316,13 @@ def gen_pyi(declarations_path, out): # checking. If you are update this, consider if your change # also needs to update the other file. 
- # Load information from YAML - declarations = load_aten_declarations(declarations_path) - # Dictionary for NamedTuple definitions - namedtuples = {} + namedtuples: Dict[str, str] = {} # Generate type signatures for top-level functions # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - unsorted_function_hints = collections.defaultdict(list) + unsorted_function_hints: Dict[str, List[str]] = collections.defaultdict(list) unsorted_function_hints.update({ 'set_flush_denormal': ['def set_flush_denormal(mode: _bool) -> _bool: ...'], 'get_default_dtype': ['def get_default_dtype() -> _dtype: ...'], @@ -560,21 +382,13 @@ def gen_pyi(declarations_path, out): ' other: Union[Tensor, Number],' ' *, alpha: Optional[Number]=1, out: Optional[Tensor]=None) -> Tensor: ...'.format(binop)) - function_declarations = get_py_torch_functions(declarations) - for name in sorted(function_declarations.keys()): - unsorted_function_hints[name] += generate_type_hints(name, function_declarations[name], namedtuples) - - # Generate type signatures for deprecated functions - - # TODO: Maybe we shouldn't generate type hints for deprecated - # functions :) However, examples like those addcdiv rely on these. - with open('tools/autograd/deprecated.yaml', 'r') as f: - deprecated = yaml.load(f, Loader=YamlLoader) - for d in deprecated: - name, sig = re.match(r"^([^\(]+)\(([^\)]*)", d['name']).groups() - sig = ['*' if p.strip() == '*' else p.split() for p in sig.split(',')] - sig = ['*' if p == '*' else (p[1] + ': ' + type_to_python(p[0])) for p in sig] - unsorted_function_hints[name].append("def {}({}) -> Tensor: ...".format(name, ', '.join(sig))) + function_signatures = load_signatures(native_yaml_path, deprecated_yaml_path, method=False, pyi=True) + sig_groups = get_py_torch_functions(function_signatures) + for name in sorted(sig_groups.keys()): + unsorted_function_hints[name] += generate_type_hints(sig_groups[name]) + # deprecated signatures are not used when computing named tuples + native_groups = [g for g in sig_groups[name] if not g.signature.deprecated] + namedtuples.update(generate_named_tuples(native_groups)) function_hints = [] for name, hints in sorted(unsorted_function_hints.items()): @@ -585,26 +399,26 @@ def gen_pyi(declarations_path, out): # Generate type signatures for Tensor methods # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - unsorted_tensor_method_hints = collections.defaultdict(list) + unsorted_tensor_method_hints: Dict[str, List[str]] = collections.defaultdict(list) unsorted_tensor_method_hints.update({ 'size': ['def size(self) -> Size: ...', 'def size(self, _int) -> _int: ...'], 'stride': ['def stride(self) -> Tuple[_int]: ...', 'def stride(self, _int) -> _int: ...'], - 'new_ones': ['def new_ones(self, size: {}, {}) -> Tensor: ...'. - format(type_to_python('IntArrayRef'), FACTORY_PARAMS)], + 'new_ones': ['def new_ones(self, size: _size, {}) -> Tensor: ...'. 
+ format(FACTORY_PARAMS)], 'new_tensor': ["def new_tensor(self, data: Any, {}) -> Tensor: ...".format(FACTORY_PARAMS)], # new and __init__ have the same signatures differ only in return type # Adapted from legacy_tensor_ctor and legacy_tensor_new 'new': ['def new(self, *args: Any, {}) ->Tensor: ...'.format(DEVICE_PARAM), 'def new(self, storage: Storage) -> Tensor: ...', 'def new(self, other: Tensor) -> Tensor: ...', - 'def new(self, size: {}, *, {}) -> Tensor: ...'.format(type_to_python('IntArrayRef'), DEVICE_PARAM), + 'def new(self, size: _size, *, {}) -> Tensor: ...'.format(DEVICE_PARAM), ], '__init__': ['def __init__(self, *args: Any, {}) -> None: ...'.format(DEVICE_PARAM), 'def __init__(self, storage: Storage) -> None: ...', 'def __init__(self, other: Tensor) -> None: ...', - 'def __init__(self, size: {}, *, {}) -> None: ...'.format(type_to_python('IntArrayRef'), DEVICE_PARAM), + 'def __init__(self, size: _size, *, {}) -> None: ...'.format(DEVICE_PARAM), ], 'as_subclass': ["def as_subclass(self, cls: Tensor) -> Tensor: ..."], # clamp has no default values in the Declarations @@ -679,10 +493,14 @@ def gen_pyi(declarations_path, out): for name in simple_conversions: unsorted_tensor_method_hints[name].append('def {}(self) -> Tensor: ...'.format(name)) - tensor_method_declarations = get_py_variable_methods(declarations) - for name in sorted(tensor_method_declarations.keys()): - unsorted_tensor_method_hints[name] += \ - generate_type_hints(name, tensor_method_declarations[name], namedtuples, is_tensor=True) + # pyi tensor methods don't currently include deprecated signatures for some reason + # TODO: we should probably add them in + tensor_method_signatures = load_signatures(native_yaml_path, deprecated_yaml_path, method=True, skip_deprecated=True, pyi=True) + tensor_method_sig_groups = get_py_torch_functions(tensor_method_signatures, method=True) + + for name in sorted(tensor_method_sig_groups.keys()): + unsorted_tensor_method_hints[name] += generate_type_hints(tensor_method_sig_groups[name], is_tensor=True) + namedtuples.update(generate_named_tuples(tensor_method_sig_groups[name])) for op in all_ops: name = '__{}__'.format(op) @@ -764,17 +582,20 @@ def gen_pyi(declarations_path, out): gen_nn_pyi(out) -def main(): +def main() -> None: parser = argparse.ArgumentParser( description='Generate type stubs for PyTorch') - parser.add_argument('--declarations-path', metavar='DECL', - default='torch/share/ATen/Declarations.yaml', - help='path to Declarations.yaml') + parser.add_argument('--native-functions-path', metavar='NATIVE', + default='aten/src/ATen/native/native_functions.yaml', + help='path to native_functions.yaml') + parser.add_argument('--deprecated-functions-path', metavar='DEPRECATED', + default='tools/autograd/deprecated.yaml', + help='path to deprecated.yaml') parser.add_argument('--out', metavar='OUT', default='.', help='path to output directory') args = parser.parse_args() - gen_pyi(args.declarations_path, args.out) + gen_pyi(args.native_functions_path, args.deprecated_functions_path, args.out) if __name__ == '__main__': diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index bcc847e825ad..9b1d6fd4a55f 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -234,9 +234,9 @@ add_custom_command( "${TORCH_SRC_DIR}/nn/functional.pyi" COMMAND "${PYTHON_EXECUTABLE}" -mtools.pyi.gen_pyi - --declarations-path "${CMAKE_BINARY_DIR}/aten/src/ATen/Declarations.yaml" + --native-functions-path "aten/src/ATen/native/native_functions.yaml" + --deprecated-functions-path 
"tools/autograd/deprecated.yaml" DEPENDS - "${CMAKE_BINARY_DIR}/aten/src/ATen/Declarations.yaml" "${TORCH_SRC_DIR}/_C/__init__.pyi.in" "${TORCH_SRC_DIR}/_C/_VariableFunctions.pyi.in" "${TORCH_SRC_DIR}/nn/functional.pyi.in" From dad74e58fcbe35a4409ecec5d816ce54c6986358 Mon Sep 17 00:00:00 2001 From: Iurii Zdebskyi Date: Mon, 7 Dec 2020 10:43:56 -0800 Subject: [PATCH 011/250] [WIP] Added foreach_trunc, foreahc_reciprocal, foreach_sigmoid APIs (#47385) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/47385 Test Plan: Imported from OSS Reviewed By: ngimel Differential Revision: D24737051 Pulled By: izdeby fbshipit-source-id: ed259d9184b2b784d8cc1983a8b85cc6cbf930ba --- aten/src/ATen/native/ForeachOpsKernels.cpp | 3 + aten/src/ATen/native/cuda/ForeachUnaryOp.cu | 180 +++++++++++++++++++- aten/src/ATen/native/native_functions.yaml | 48 ++++++ test/test_foreach.py | 18 +- tools/codegen/model.py | 3 + 5 files changed, 239 insertions(+), 13 deletions(-) diff --git a/aten/src/ATen/native/ForeachOpsKernels.cpp b/aten/src/ATen/native/ForeachOpsKernels.cpp index 5fbc1506bfaa..24ab10b25f84 100644 --- a/aten/src/ATen/native/ForeachOpsKernels.cpp +++ b/aten/src/ATen/native/ForeachOpsKernels.cpp @@ -188,6 +188,9 @@ FOREACH_UNARY_OP(sinh); FOREACH_UNARY_OP(round); FOREACH_UNARY_OP(lgamma); FOREACH_UNARY_OP(frac); +FOREACH_UNARY_OP(trunc); +FOREACH_UNARY_OP(reciprocal); +FOREACH_UNARY_OP(sigmoid); FOREACH_POINTWISE_OP_SCALAR(addcdiv); FOREACH_POINTWISE_OP_SCALAR(addcmul); diff --git a/aten/src/ATen/native/cuda/ForeachUnaryOp.cu b/aten/src/ATen/native/cuda/ForeachUnaryOp.cu index 2cd01d80bdca..88b952fe1d95 100644 --- a/aten/src/ATen/native/cuda/ForeachUnaryOp.cu +++ b/aten/src/ATen/native/cuda/ForeachUnaryOp.cu @@ -417,9 +417,9 @@ std::vector foreach_tensor_frac_cuda(TensorList tensors) { using opmath_t = get_opmath_t::opmath_t; multi_tensor_apply<2>(tensor_lists, UnaryOpFunctor(), + /* depth */ 2, + /* r_args_depth */ 1, + /* res_arg_index */ 1>(), Trunc()); }); return tensor_lists[1]; @@ -439,10 +439,178 @@ void foreach_tensor_frac_cuda_(TensorList tensors) { using opmath_t = get_opmath_t::opmath_t; multi_tensor_apply<1>(tensor_lists, UnaryOpFunctor(), + /* depth */ 1, + /* r_args_depth */ 1, + /* res_arg_index */ 0>(), Trunc()); }); } + +template +struct Sigmoid { + T one = T(1); + __device__ T operator()(T t) const { return (one / (one + std::exp(-t))); } +}; + +std::vector foreach_tensor_sigmoid_cuda(TensorList tensors) { + check_foreach_api_restrictions(tensors); + + if (!can_use_fast_route(tensors)) { + return at::native::foreach_tensor_sigmoid_slow(tensors); + } + + std::vector> tensor_lists; + std::vector vec_res; + vec_res.reserve(tensors.size()); + for (const auto& t: tensors) { + vec_res.emplace_back(at::native::empty_like(t)); + } + + tensor_lists.emplace_back(tensors.vec()); + tensor_lists.emplace_back(std::move(vec_res)); + + AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, tensors[0].scalar_type(), "foreach_unary_op_cuda", [&]() { + using opmath_t = get_opmath_t::opmath_t; + multi_tensor_apply<2>(tensor_lists, + UnaryOpFunctor(), + Sigmoid()); + }); + return tensor_lists[1]; +} + +void foreach_tensor_sigmoid_cuda_(TensorList tensors) { + check_foreach_api_restrictions(tensors); + + if (!can_use_fast_route(tensors)) { + return at::native::foreach_tensor_sigmoid_slow_(tensors); + } + + std::vector> tensor_lists; + tensor_lists.emplace_back(tensors.vec()); + + AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, 
tensors[0].scalar_type(), "foreach_unary_op_cuda_", [&]() { + using opmath_t = get_opmath_t::opmath_t; + multi_tensor_apply<1>(tensor_lists, + UnaryOpFunctor(), + Sigmoid()); + }); +} + +template +struct Reciprocal { + T one = T(1); + __device__ T operator()(T t) const { return (one / t); } +}; + +std::vector foreach_tensor_reciprocal_cuda(TensorList tensors) { + check_foreach_api_restrictions(tensors); + + if (!can_use_fast_route(tensors)) { + return at::native::foreach_tensor_reciprocal_slow(tensors); + } + + std::vector> tensor_lists; + std::vector vec_res; + vec_res.reserve(tensors.size()); + for (const auto& t: tensors) { + vec_res.emplace_back(at::native::empty_like(t)); + } + + tensor_lists.emplace_back(tensors.vec()); + tensor_lists.emplace_back(std::move(vec_res)); + + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, tensors[0].scalar_type(), "foreach_unary_op_cuda", [&]() { + using opmath_t = get_opmath_t::opmath_t; + multi_tensor_apply<2>(tensor_lists, + UnaryOpFunctor(), + Reciprocal()); + }); + return tensor_lists[1]; +} + +void foreach_tensor_reciprocal_cuda_(TensorList tensors) { + check_foreach_api_restrictions(tensors); + + if (!can_use_fast_route(tensors)) { + return at::native::foreach_tensor_reciprocal_slow_(tensors); + } + + std::vector> tensor_lists; + tensor_lists.emplace_back(tensors.vec()); + + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, tensors[0].scalar_type(), "foreach_unary_op_cuda_", [&]() { + using opmath_t = get_opmath_t::opmath_t; + multi_tensor_apply<1>(tensor_lists, + UnaryOpFunctor(), + Reciprocal()); + }); +} + +template +struct Truncf { + __device__ T operator()(T t) const { return std::trunc(t); } +}; + +std::vector foreach_tensor_trunc_cuda(TensorList tensors) { + check_foreach_api_restrictions(tensors); + + if (!can_use_fast_route(tensors)) { + return at::native::foreach_tensor_trunc_slow(tensors); + } + + std::vector> tensor_lists; + std::vector vec_res; + vec_res.reserve(tensors.size()); + for (const auto& t: tensors) { + vec_res.emplace_back(at::native::empty_like(t)); + } + + tensor_lists.emplace_back(tensors.vec()); + tensor_lists.emplace_back(std::move(vec_res)); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF(tensors[0].scalar_type(), "foreach_unary_op_cuda", [&]() { + using opmath_t = get_opmath_t::opmath_t; + multi_tensor_apply<2>(tensor_lists, + UnaryOpFunctor(), + Truncf()); + }); + return tensor_lists[1]; +} + +void foreach_tensor_trunc_cuda_(TensorList tensors) { + check_foreach_api_restrictions(tensors); + + if (!can_use_fast_route(tensors)) { + return at::native::foreach_tensor_trunc_slow_(tensors); + } + + std::vector> tensor_lists; + tensor_lists.emplace_back(tensors.vec()); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF(tensors[0].scalar_type(), "foreach_unary_op_cuda_", [&]() { + using opmath_t = get_opmath_t::opmath_t; + multi_tensor_apply<1>(tensor_lists, + UnaryOpFunctor(), + Truncf()); + }); +} + }} // namespace at::native diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 4d8ea72761af..e7ac20599214 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -7610,6 +7610,54 @@ CPU: foreach_tensor_frac_slow_ CUDA: foreach_tensor_frac_cuda_ +- func: _foreach_reciprocal(Tensor[] tensors) -> Tensor[] + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_reciprocal_slow + CUDA: foreach_tensor_reciprocal_cuda + +- func: 
_foreach_reciprocal_(Tensor(a!)[] self) -> () + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_reciprocal_slow_ + CUDA: foreach_tensor_reciprocal_cuda_ + +- func: _foreach_sigmoid(Tensor[] tensors) -> Tensor[] + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_sigmoid_slow + CUDA: foreach_tensor_sigmoid_cuda + +- func: _foreach_sigmoid_(Tensor(a!)[] self) -> () + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_sigmoid_slow_ + CUDA: foreach_tensor_sigmoid_cuda_ + +- func: _foreach_trunc(Tensor[] tensors) -> Tensor[] + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_trunc_slow + CUDA: foreach_tensor_trunc_cuda + +- func: _foreach_trunc_(Tensor(a!)[] self) -> () + use_c10_dispatcher: full + device_guard: False + variants: function + dispatch: + CPU: foreach_tensor_trunc_slow_ + CUDA: foreach_tensor_trunc_cuda_ + - func: _foreach_addcdiv_.Scalar(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> () use_c10_dispatcher: full device_guard: False diff --git a/test/test_foreach.py b/test/test_foreach.py index eff6d969c5e5..c55c4e71dab0 100644 --- a/test/test_foreach.py +++ b/test/test_foreach.py @@ -53,6 +53,9 @@ class TestForeach(TestCase): (torch._foreach_log1p, torch._foreach_log1p_, torch.log1p, True, False), (torch._foreach_round, torch._foreach_round_, torch.round, False, False), (torch._foreach_frac, torch._foreach_frac_, torch.frac, False, False), + (torch._foreach_reciprocal, torch._foreach_reciprocal_, torch.reciprocal, True, True), + (torch._foreach_sigmoid, torch._foreach_sigmoid_, torch.sigmoid, True, False), + (torch._foreach_trunc, torch._foreach_trunc_, torch.trunc, False, False), # See test_abs # (torch._foreach_abs, torch._foreach_abs_, torch.abs, True, True), @@ -173,7 +176,7 @@ def test_unary_ops(self, device, dtype): control_dtype = torch.float32 if (self.device_type == 'cuda' and (dtype is torch.float16 or dtype is torch.bfloat16)) else dtype - if self.device_type == 'cpu' and dtype == torch.half and torch_op not in [torch.neg, torch.frac]: + if self.device_type == 'cpu' and dtype == torch.half and torch_op not in [torch.neg, torch.frac, torch.reciprocal]: with self.assertRaisesRegex(RuntimeError, r"not implemented for \'Half\'"): expected = [torch_op(tensors1[i]) for i in range(N)] @@ -191,13 +194,14 @@ def test_unary_ops(self, device, dtype): break if dtype in [torch.complex64, torch.complex128] and not support_complex: - # not using assertRaisesRegex due to different error messages - with self.assertRaises(RuntimeError): - expected = [torch_op(tensors1[i]) for i in range(N)] + if not (self.device_type == 'cpu' and torch_op in [torch.sigmoid]): + # not using assertRaisesRegex due to different error messages + with self.assertRaises(RuntimeError): + expected = [torch_op(tensors1[i]) for i in range(N)] - with self.assertRaises(RuntimeError): - res = fe_op(tensors1) - break + with self.assertRaises(RuntimeError): + res = fe_op(tensors1) + break expected = [torch_op(tensors1[i].to(dtype=control_dtype)).to(dtype=dtype) for i in range(N)] res = fe_op(tensors1) diff --git a/tools/codegen/model.py b/tools/codegen/model.py index f270d0737ade..87cd3ab8e302 100644 --- a/tools/codegen/model.py +++ b/tools/codegen/model.py @@ -466,6 +466,9 @@ def __post_init__(self) -> None: '_foreach_round_', '_foreach_lgamma_', '_foreach_frac_', + 
'_foreach_reciprocal_', + '_foreach_sigmoid_', + '_foreach_trunc_', '_foreach_addcmul_.Scalar', '_foreach_addcdiv_.Scalar', '_foreach_addcmul_.ScalarList', From 924b001b71555cfd58b31249a2eb7963627f2fc8 Mon Sep 17 00:00:00 2001 From: Hui Guo Date: Mon, 7 Dec 2020 11:12:08 -0800 Subject: [PATCH 012/250] #48733 added logging statements to LLVM codegen using JIT logging (#48758) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48758 Test Plan: PYTORCH_JIT_LOG_LEVEL=">>llvm_codegen" python test/test_jit_fuser_te.py -k test_lerp Reviewed By: ZolotukhinM Differential Revision: D25295995 Pulled By: huiguoo fbshipit-source-id: 8927808932ef3657da26508d0f6574c9e5fbbb25 --- torch/csrc/jit/tensorexpr/llvm_codegen.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp index 35929a61266f..509015f7ffa5 100644 --- a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp +++ b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp @@ -30,6 +30,7 @@ #include #include +#include #define DEBUG_PRINT 0 using namespace torch::jit::tensorexpr; @@ -518,6 +519,13 @@ void LLVMCodeGenImpl::emitKernel( if (llvm::verifyFunction(*fn_, &llvm::outs())) { throw std::runtime_error("Function verification failed"); } + + // print graph debug info. + std::string fnstr; + llvm::raw_string_ostream FS(fnstr); + fn_->print(FS); + GRAPH_DEBUG("LLVM Function:\n", FS.str(), "\n"); + optimize(*module_); #if DEBUG_PRINT From d307601365c3b848072b8b8381208aedc1a0aca5 Mon Sep 17 00:00:00 2001 From: Heitor Schueroff Date: Mon, 7 Dec 2020 11:46:58 -0800 Subject: [PATCH 013/250] Revert D24923679: Fixed einsum compatibility/performance issues (#46398) Test Plan: revert-hammer Differential Revision: D24923679 (https://github.com/pytorch/pytorch/commit/ea2a568cca71aaf690051782c225ca9dd2e5e1f9) Original commit changeset: 47e48822cd67 fbshipit-source-id: 52f17b66a4aa075d0159bdf1c98616e6098091b8 --- aten/src/ATen/native/Linear.cpp | 501 +++++++++++++------------------- test/test_linalg.py | 219 +++++--------- torch/functional.py | 171 +++++------ 3 files changed, 348 insertions(+), 543 deletions(-) diff --git a/aten/src/ATen/native/Linear.cpp b/aten/src/ATen/native/Linear.cpp index bac2f80e8a7c..c9e03aaa3b6b 100644 --- a/aten/src/ATen/native/Linear.cpp +++ b/aten/src/ATen/native/Linear.cpp @@ -136,334 +136,241 @@ static Tensor sumproduct_pair(const Tensor& left_, const Tensor& right_, IntArra return result; } -// There are roughly three parts to compute einsum: -// 1. Parse equation to extract the labels for each input operand and output -// 2. Unsqueeze missing dimensions from input operands and permute to align them -// 3. Compute result by multiplying input operands and summing contraction -// dimensions We do the last part by reducing to bmm. -Tensor einsum(std::string equation, TensorList operands) { - TORCH_CHECK(!operands.empty(), "einsum() must provide at least one operand"); - checkDeviceType("einsum()", operands, operands[0].device().type()); - - // Code for encoding ellipsis ("...") with labels - constexpr int ELLIPSIS = '.'; - - // Find arrow (->) to split equation into lhs and rhs - const auto arrow_pos = equation.find("->"); - const auto lhs = equation.substr(0, arrow_pos); - - // Convert labels for input operands into an index in [0, 25] and store - // them in op_labels for each operand along with ELLIPSIS. 
- std::vector> op_labels(operands.size()); - bool found_ell = false; - std::string::size_type curr_op = 0; - for (auto i = decltype(lhs.length()){0}; i < lhs.length(); ++i) { - switch (lhs[i]) { - case ' ': - // Ignore spaces - break; - - case '.': - TORCH_CHECK( - // Only one ellipsis per operand can be given - !found_ell, - "einsum() found \'.\' for operand ", - curr_op, - " for which an ellipsis was already found"); - TORCH_CHECK( - // Ensure it's a valid ellipsis - i + 2 < lhs.length() && lhs[++i] == '.' && lhs[++i] == '.', - "einsum() found \'.\' for operand ", - curr_op, - " that is not part of any ellipsis"); - op_labels[curr_op].push_back(ELLIPSIS); - found_ell = true; - break; - - case ',': - // Move onto next operand - ++curr_op; - TORCH_CHECK( - curr_op < operands.size(), - "einsum() fewer operands were provided than specified in the equation"); - found_ell = false; - break; - - default: - // Parse label - TORCH_CHECK( - lhs[i] >= 'a' && lhs[i] <= 'z', - "einsum() operand subscript must be in range [a, z] but found ", - lhs[i], - " for operand ", - curr_op); - // Convert label to index in [0, 25] and store - op_labels[curr_op].push_back(lhs[i] - 'a'); - } +Tensor einsum(std::string eqn, TensorList tensors) { + constexpr size_t number_of_letters = 26; + std::string in_eqn; + size_t pos; + // The equation is given in terms of single lowercase letters ('a'..'z') and potentially an ellipsis. + // Internally, we represent it using indices from 0 to num_total_dimensions, with each letter + // mapped to an index and the ellipsis ('...') being mapped to a number of consequtive indices. + // The mapping of letters to internal indices is given in letter_mapping. A value of -1 means that + // the letter has not been assigned an index yet (because it has not been seen). + // The ellipsis is defined by first_ell_idx (the first index) and num_ell_idxes (the number of indices). + // A value of -1 for num_ell_idxes specifies that we have not seen an ellipsis yet. + // Note: The internal indices are NOT the dimensions used internally. There is a mapping to them below. + + std::array letter_mapping; // map letter to internal (numerical) label + letter_mapping.fill(-1); + int64_t num_ell_idxes = -1; + int64_t first_ell_idx = 0; + + // The internal representation of the left hand side fo the equation (with ellipsis expanded) is stored in input_op_idxes. + // For each operand, we have a vector mapping each dimension to an internal index. + // We also keep track of the number of occurrences for each letter (to infer a right hand side if not given) and + // of the last occurrence of each index. + std::vector> input_op_idxes; // the parsed operand indices + std::array num_letter_occurrences; // number of occurrence in the equation of this letter + num_letter_occurrences.fill(0); + std::vector last_idx_occurrence; // the last operator (left to right) using this index + + if ((pos = eqn.find("->")) != std::string::npos) { // check whether we have a right hand side. in_eq is the left hand side + in_eqn = eqn.substr(0, pos); + } else { + in_eqn = eqn; } - - TORCH_CHECK( - curr_op == operands.size() - 1, - "einsum() more operands were provided than specified in the equation"); - - // Labels must be within [a, z]. 
- constexpr int TOTAL_LABELS = 'z' - 'a' + 1; - std::vector label_count(TOTAL_LABELS, 0); - - // The maximum number of dimensions covered by any ellipsis, needed when - // unsqueezing missing dimensions from operands to permute and broadcast - int64_t ell_num_dim = 0; - - // Compute label frequency and number of dimensions covered by ellipsis - // We do this after parsing labels to make it more readable and simpler - // to compute the number of dimensions covered by ellipsis. - for (std::size_t i = 0; i < operands.size(); ++i) { - const Tensor operand = operands[i]; - std::vector labels = op_labels[i]; - int64_t nlabels = labels.size(); - int64_t ndims = operand.dim(); - bool has_ellipsis = false; - - for (int label : labels) { - if (label == ELLIPSIS) { - --nlabels; - has_ellipsis = true; - ell_num_dim = std::max(ell_num_dim, ndims - nlabels); - } else { - ++label_count[label]; + // remove spaces for einsum compatibility (#9929) + in_eqn.erase(std::remove_if(in_eqn.begin(), in_eqn.end(), isspace), in_eqn.end()); + + // next we parse in_eq (the left hand side) by iterating. It is a string of comma separated terms per index + int64_t operand = 0; + std::stringstream eqn_stream(in_eqn); + std::string term; + int64_t num_total_idxes = 0; + while (! eqn_stream.eof()) { + std::getline(eqn_stream, term, ','); // term = string with indices of current term + TORCH_CHECK((int64_t) tensors.size()>operand, "more operands in equation than tensors"); // we cannot have a longer equation than operands. We need to check here before we use the dimension + + int64_t ell_char_count = 0; // handling of ellipsis '...' is a bit tedious, we count the '.' + // if there is an ellipsis, the number of dimensions it represents must be total dim - letter dimensions + int64_t candidate_num_ell_idxes = tensors[operand].dim() - term.size() + 3; + int64_t dims_in_term = 0; // dimensions we have seen + std::vector current_op_idxes; // mapping of operand dimensions to indices for current term + for (auto &c : term) { // c = character with a single letter or '.' + if (c == '.') { + ell_char_count++; + TORCH_CHECK(ell_char_count <= 3, "can only have '.' in one ellispis '...' in term ", operand, " of the equation"); + if (ell_char_count == 3) { // this completes the ellipsis + if (num_ell_idxes == -1) { // if we have not seen an ellipsis before, keep track of indices and size + first_ell_idx = num_total_idxes; + num_ell_idxes = candidate_num_ell_idxes; + num_total_idxes += num_ell_idxes; + } + else { // we have seen an ellipsis before, so we check compatibility + TORCH_CHECK(candidate_num_ell_idxes == num_ell_idxes, + "ellipsis must represent ", num_ell_idxes, " dimensions in all terms"); + } + for (int64_t i = 0; i < num_ell_idxes; ++i) { // map ellipsis dimensions in operand to indices + current_op_idxes.push_back(first_ell_idx + i); + last_idx_occurrence.push_back(operand); + } + dims_in_term += num_ell_idxes; // keep track of dimensions + } + } else { // a letter (hopefully) + TORCH_CHECK((ell_char_count == 0) || (ell_char_count == 3), "'.' 
must only occur in ellipsis, operand ", operand); + TORCH_CHECK(('a' <= c) && (c <= 'z'), "only lowercase letters a-z allowed as indices"); + int64_t letter_num = c-'a'; // letter_num = position in letter_mapping + if (letter_mapping[letter_num] == -1) { // new letter, add internal index and mapping + letter_mapping[letter_num] = num_total_idxes; + num_total_idxes++; + last_idx_occurrence.push_back(operand); + } else { // letter we have already seen + last_idx_occurrence[letter_mapping[letter_num]] = operand; + } + num_letter_occurrences[letter_num]++; + current_op_idxes.push_back(letter_mapping[letter_num]); + dims_in_term++; } } - - TORCH_CHECK( - has_ellipsis ? nlabels <= ndims : nlabels == ndims, - "einsum() the number of subscripts in the equation (", - nlabels, - has_ellipsis ? ") is more than the number of dimensions (" - : ") does not match the number of dimensions (", - ndims, - ") for operand ", - i, - has_ellipsis ? "" : " and no ellipsis was given"); + TORCH_CHECK(dims_in_term == tensors[operand].dim(), "dimension mismatch for operand ", operand, ": equation ", dims_in_term, " tensor ", tensors[operand].dim()); + input_op_idxes.push_back(std::move(current_op_idxes)); + operand++; } - - // Mapping of label to index in the permuted tensors (out_dims + sum_dims) - // This will be used for aligning the dimensions of all input operands - std::vector label_perm_index(TOTAL_LABELS, -1); - - // Current index in the permuted shape - int perm_index = 0; - - // Start index of ellipsis dimensions in the permuted shape - int64_t ell_index = 0; - found_ell = false; - - if (arrow_pos == std::string::npos) { - // Implicit output is ellipsis (...) + labels seen only once - perm_index = ell_num_dim; - found_ell = true; - for (int label = 0; label < TOTAL_LABELS; ++label) { - if (label_count[label] == 1) { - label_perm_index[label] = perm_index++; + // in the check below, we need ==, but > is captured above, so the error message can be specific that it is <. + TORCH_CHECK((int64_t) tensors.size()==operand, "more tensors than operands in equation"); + + // the following parses or infers output (right hand side) + // it also assigns the idxes_to_preprocessed_dims (index -> dimension in preprocessed / output tensors) + // for the output indices. -1 means that the index has not been assigned a dimension yet + std::vector idxes_to_preprocessed_dims(num_total_idxes, -1); // the position of the index in the tensor dimensions + int64_t num_output_dims = 0; + if (pos != std::string::npos) { // parse the user provided right hand side + int64_t ell_char_count = 0; + for (auto &c : eqn.substr(pos+2)) { + if (c == '.') { // '.' as part of ellipsis + ell_char_count++; + TORCH_CHECK(ell_char_count <= 3, "can only have '.' in one ellispis '...' in right hand side of the equation"); + if (ell_char_count == 3) { // ellipsis complete + TORCH_CHECK(num_ell_idxes >= 0, "ellipsis '...' may only appear in right hand side if it does in left hand side"); + for (int64_t i = 0; i < num_ell_idxes; ++i) { + idxes_to_preprocessed_dims[first_ell_idx + i] = num_output_dims; + num_output_dims++; + } + } + } else if (! isspace(c)) { // letter (hopefully) + TORCH_CHECK((ell_char_count == 0) || (ell_char_count == 3), "'.' 
must only occur in ellipsis in the right hand side"); + TORCH_CHECK(('a' <= c) && (c <= 'z'), "only lowercase letters a-z allowed as indices"); + int64_t letter_num = c-'a'; + TORCH_CHECK(idxes_to_preprocessed_dims[letter_mapping[letter_num]] == -1, "index ", c, " occurs twice in output"); + idxes_to_preprocessed_dims[letter_mapping[letter_num]] = num_output_dims; + num_output_dims++; } } - } else { - // Parse explicit output - const std::string rhs = equation.substr(arrow_pos + 2); - for (std::size_t i = 0; i < rhs.length(); ++i) { - switch (rhs[i]) { - case ' ': - // Ignore spaces - break; - - case '.': - TORCH_CHECK( - // There can only be one ellipsis in the output - !found_ell, - "einsum() found \'.\' for output but an ellipsis (...) was already found"); - TORCH_CHECK( - // Ensure ellipsis is correct - i + 2 < rhs.length() && rhs[++i] == '.' && rhs[++i] == '.', - "einsum() found \'.\' for output that is not part of any ellipsis (...)"); - ell_index = perm_index; - perm_index += ell_num_dim; - found_ell = true; - break; - - default: - TORCH_CHECK( - rhs[i] >= 'a' && rhs[i] <= 'z', - "einsum() subscripts must be in range [a, z] but found ", - rhs[i], - " for the output"); - TORCH_CHECK( - // Ensure label appeared at least once for some input operand and at - // most once for the output - label_count[rhs[i] - 'a'] > 0, - "einsum() output subscript ", - rhs[i], - label_count[rhs[i] - 'a'] == -1 - ? " appears more than once in the output" - : " does not appear in the equation for any input operand"); - label_perm_index[rhs[i] - 'a'] = perm_index++; - - // Set to -1 to mark that this label already appeared in the output - label_count[rhs[i] - 'a'] = -1; + } else { // create an inferred right hand side + // the ellipsis (if in the lhs) comes first + if (num_ell_idxes >= 0) { + for (int64_t i = 0; i < num_ell_idxes; ++i) { + idxes_to_preprocessed_dims[first_ell_idx + i] = num_output_dims; + num_output_dims++; + } + } + // then the indices that occur exactly once in alphabetic order + for (size_t idx = 0; idx < number_of_letters; idx++) { + if (num_letter_occurrences[idx] == 1) { + idxes_to_preprocessed_dims[letter_mapping[idx]] = num_output_dims; + num_output_dims++; } } } - - // Save output size before adding sum dims - const int out_size = perm_index; - - // If ellipsis is not part of the output, add to contraction dimensions - if (ell_num_dim > 0 && !found_ell) { - ell_index = perm_index; - perm_index += ell_num_dim; - } - - // Add contraction labels (labels not present in output) - for (int label = 0; label < TOTAL_LABELS; ++label) { - if (label_count[label] > 0 && label_perm_index[label] == -1) { - label_perm_index[label] = perm_index++; + // now we assign the idxes_to_preprocessed_dims (index -> dimension in preprocessed / output tensors) + // for the non-output indices - those that are eventually summed over + int64_t position = num_output_dims; + for (int64_t i = 0; i < num_total_idxes; i++) { + if (idxes_to_preprocessed_dims[i]==-1) { + idxes_to_preprocessed_dims[i] = position; + position++; } } - // Here we unsqueeze missing dimensions to make all operands have the same - // number of dimensions. We take diagonals for repeated labels within the - // same operand. Finally we permute the operands to align dimensions as - // per the perm_out_index we computed above. 
- std::vector permuted_operands; - for (std::size_t i = 0; i < operands.size(); ++i) { - std::vector perm_shape(perm_index, -1); - std::vector label_dim(TOTAL_LABELS, -1); - const std::vector labels = op_labels[i]; - Tensor operand = operands[i]; - const auto sizes = operand.sizes(); - std::size_t j = 0; - - for (int label : labels) { - if (label == ELLIPSIS) { - // Add missing dimensions under ellipsis - int64_t num_dim_diff = - ell_num_dim - (operand.dim() - labels.size() + 1); - for (int64_t k = 0; k < num_dim_diff; ++k) { - operand = operand.unsqueeze(j); + // we now "homogenize the dimensions", i.e. + // - take diagonals for duplicated indices + // - permute the dimensions to match the order given by idxes_to_preprocessed_dims + // - unsqueeze to create all dimensions for each index in each tensor where they are missing + // we also check that sizes match + // after this, all operands will have compatible shapes (i.e. all dimensions are aligned are broadcastable) + std::vector preprocessed_operands; + std::vector size_of_dims(num_total_idxes, -1); // keep track of sizes for each index, -1 means we have not seen a size yet + for (int64_t op = 0; op < (int64_t) tensors.size(); op++) { + auto preprocessed_op = tensors[op]; + std::vector idx_to_dim(num_total_idxes, -1); // the dimension which the index refers to in the original tensor, -1 means it does not appear + std::vector& current_op_input_idxes = input_op_idxes[op]; + int64_t dim = 0; // there are two dimension indices: dim is after taking diagonals, i is in input + for (size_t i = 0; i < current_op_input_idxes.size(); i++) { + auto idx = current_op_input_idxes[i]; + auto dim_out = idxes_to_preprocessed_dims[idx]; + if (idx_to_dim[dim_out] == -1) { // first appearance + idx_to_dim[dim_out] = dim; + if (size_of_dims[idx] == -1) { // keep track of sizes + size_of_dims[idx] = preprocessed_op.size(dim); + } + else { + TORCH_CHECK(size_of_dims[idx] == preprocessed_op.size(dim), "size of dimension does not match previous size, operand ", op, ", dim ", i); } - for (int64_t k = 0; k < ell_num_dim; ++k) { - perm_shape[ell_index + k] = j++; + dim++; + } else { // duplicate dimension in tensor --> take diagonal of idx_to_dim[dim_out] and dim and put the diagonal dimension to idx_to_dim[dim_out] + TORCH_CHECK(size_of_dims[idx] == preprocessed_op.size(dim), "size of dimension does not match previous size, operand ", op, ", dim ", i); + preprocessed_op = preprocessed_op.diagonal(0, idx_to_dim[dim_out], dim); + // diagonal moves the diagonal dimension to the back + // now we permute the last dim back to idx_to_dim[dim_out] + std::vector perm(preprocessed_op.dim(), 0); + for (int64_t d = 0; d < preprocessed_op.dim(); d++) { + if (d == idx_to_dim[dim_out]) { + perm[d] = preprocessed_op.dim() - 1; + } else { + perm[d] = d - (d > idx_to_dim[dim_out]); + } } - } else if (label_dim[label] != -1) { - // Repeated label, take diagonal - int64_t dim = label_dim[label]; - TORCH_CHECK( - sizes[j] == sizes[dim], - "einsum() subscript ", - char(label + 'a'), - " is repeated for operand ", - i, - " but the sizes don't match, ", - sizes[j], - " != ", - sizes[dim]); - operand = operand.diagonal(0, j, dim).movedim(-1, dim); - } else { - // Lookup output index for label - label_dim[label] = j; - perm_shape[label_perm_index[label]] = j++; + preprocessed_op = preprocessed_op.permute(perm); } } - - // Add dimensions for missing labels - for (int64_t& index : perm_shape) { - if (index == -1) { - operand = operand.unsqueeze(-1); - index = j++; + // now we permute the 
dimensions in the right order + std::vector permutation; // permutation for this tensor + for (auto &d : idx_to_dim) { + if (d > -1) { + permutation.push_back(d); } } - - permuted_operands.push_back(operand.permute(perm_shape)); - } - - // Check if operands broadcast and keep track of last operand with - // dimension size != 1 for optimizing reductions - std::vector dim_last_op(perm_index, 0); - bool has_zero_size_dim = false; - for (int dim = 0; dim < perm_index; ++dim) { - int64_t broadcast_size = permuted_operands[0].size(dim); - for (std::size_t i = 1; i < permuted_operands.size(); ++i) { - int64_t dim_size = permuted_operands[i].size(dim); - if (broadcast_size != dim_size && broadcast_size != 1 && dim_size != 1) { - std::ostringstream msg; - msg << "einsum() operands do not broadcast with remapped shapes [original->remapped]:"; - for (std::size_t j = 0; j < operands.size(); ++j) { - msg << " " << operands[j].sizes() << "->" - << permuted_operands[j].sizes(); - } - TORCH_CHECK(false, msg.str()); - } - if (dim_size != 1) { - broadcast_size = dim_size; - dim_last_op[dim] = i; + preprocessed_op = preprocessed_op.permute(permutation); + // finally, we insert dimensions for idxes not in the operand + for (size_t dim = 0; dim < idx_to_dim.size(); dim++) { + if (idx_to_dim[dim] == -1) { + preprocessed_op = preprocessed_op.unsqueeze(dim); } } - has_zero_size_dim |= broadcast_size == 0; - } - - // Compute result - Tensor result = permuted_operands[0]; - // Fast path for when an operand has zero sized dim - if (has_zero_size_dim) { - std::vector out_shape(out_size); - for (int i = 0; i < out_size; ++i) { - out_shape[i] = permuted_operands[dim_last_op[i]].size(i); - } - return at::zeros(out_shape, result.options()); + preprocessed_operands.push_back(std::move(preprocessed_op)); } - // Sum out or squeeze dimensions that are size 1 for all later operands - int dim = out_size; - for (int i = dim; i < perm_index; ++i, ++dim) { - if (dim_last_op[i] == 0) { - if (result.size(dim) == 1) { - result = result.squeeze(dim--); - } else { - result = result.sum(dim--); - } + // now we reduce the indices from left to right + // numpy allows to optimize the path using various + // algorithms (see eigen_path in numpy docs) + // we start with the leftmost operator and reduce indices that + // appear only there + Tensor result = std::move(preprocessed_operands[0]); + for (int64_t idx = 0; idx < num_total_idxes; idx++) { + if ((last_idx_occurrence[idx] == 0) + && (idxes_to_preprocessed_dims[idx]>=num_output_dims)) { + result = result.sum(idxes_to_preprocessed_dims[idx], true); } } - for (std::size_t i = 1; i < permuted_operands.size(); ++i) { - Tensor operand = permuted_operands[i]; + // now we process each tensor using sumproduct_pair + for (int64_t i = 1; i < (int64_t) preprocessed_operands.size(); i++) { std::vector sum_dims; - - // Sum out or squeeze dimensions that are size 1 for all later operands - dim = out_size; - for (int j = dim; j < perm_index; ++j, ++dim) { - if (dim_last_op[j] < i) { - operand = operand.squeeze(dim); - --dim; - } else if (dim_last_op[j] == i) { - if (result.size(dim) == 1) { - operand = operand.sum(dim); - result = result.squeeze(dim); - --dim; - } else { - sum_dims.push_back(dim); - } + for (int64_t idx = 0; idx < num_total_idxes; idx++) { + if ((last_idx_occurrence[idx] == i) + && (idxes_to_preprocessed_dims[idx]>=num_output_dims)) { + sum_dims.push_back(idxes_to_preprocessed_dims[idx]); } } - - // Multiply tensors and sum out dimensions in sum_dims - if (sum_dims.empty()) { - result 
= result.mul(operand); - } else if (sum_dims.size() == result.sizes().size()) { - result = result.flatten().dot(operand.flatten()); - } else { - result = sumproduct_pair(result, operand, sum_dims, false); - } + result = at::native::sumproduct_pair(result, std::move(preprocessed_operands[i]), sum_dims, true); + } + // finally, we squeeze out all non-result dimensions + auto sizes = result.sizes().vec(); + for (int64_t dim = num_total_idxes-1; dim >= num_output_dims; dim--) { + sizes.erase(sizes.begin() + dim); } + result = result.view(sizes); return result; } diff --git a/test/test_linalg.py b/test/test_linalg.py index 3fa677d2b1de..b6ff817a59fa 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -2588,151 +2588,6 @@ def test_old_matrix_rank(self, device, dtype): self.assertEqual(torch.matrix_rank(aaT, True), np.linalg.matrix_rank(aaT.cpu().numpy(), True)) self.assertEqual(torch.matrix_rank(aaT, 0.01, True), np.linalg.matrix_rank(aaT.cpu().numpy(), 0.01, True)) - @dtypes(torch.double) - def test_einsum(self, device, dtype): - def check(equation, *operands): - ref = np.einsum(equation, *[operand.cpu().numpy() for operand in operands]) - res = torch.einsum(equation, operands) - self.assertEqual(res.cpu(), torch.from_numpy(np.array(ref))) - - # Check autograd - ops = [op.detach().requires_grad_() for op in operands] - self.assertTrue(torch.autograd.gradcheck(lambda *ops: torch.einsum(equation, ops), ops)) - for op in ops: - self.assertTrue(op._version == 0) - - # Test cases from https://gist.github.com/rockt/15ee013889d65342088e9260a377dc8f - x = torch.rand(5, device=device, dtype=dtype) - y = torch.rand(7, device=device, dtype=dtype) - A = torch.randn(3, 5, device=device, dtype=dtype) - B = torch.randn(2, 5, device=device, dtype=dtype) - C = torch.randn(2, 3, 5, device=device, dtype=dtype) - D = torch.randn(2, 5, 7, device=device, dtype=dtype) - E = torch.randn(7, 9, device=device, dtype=dtype) - F = torch.randn(2, 3, 3, 5, device=device, dtype=dtype) - G = torch.randn(5, 4, 6, device=device, dtype=dtype) - H = torch.randn(4, 4, device=device, dtype=dtype) - I = torch.rand(2, 3, 2, device=device, dtype=dtype) - - # Note: gradcheck fails if the same input is given multiple times which is why the - # calls to clone below. 
(see https://github.com/pytorch/pytorch/issues/9282) - - # Vector operations - check('i->', x) # sum - check('i,i->', x, x.clone()) # dot - check('i,i->i', x, x.clone()) # vector element-wisem mul - check('i,j->ij', x, y) # outer - - # Matrix operations - check("ij->ji", A) # transpose - check("ij->j", A) # row sum - check("ij->i", A) # col sum - check("ij,ij->ij", A, A.clone()) # matrix element-wise mul - check("ij,j->i", A, x) # matrix vector multiplication - check("ij,kj->ik", A, B) # matmul - check("ij,ab->ijab", A, E) # matrix outer product - - # Tensor operations - check("aij,ajk->aik", C, D) # batch matmul - check("ijk,jk->i", C, A) # tensor matrix contraction - check("aij,jk->aik", D, E) # tensor matrix contraction - check("abcd,dfg->abcfg", F, G) # tensor tensor contraction - check("ijk,jk->ik", C, A) # tensor matrix contraction with double indices - check("ijk,jk->ij", C, A) # tensor matrix contraction with double indices - check("ijk,ik->j", C, B) # non contiguous - check("ijk,ik->jk", C, B) # non contiguous with double indices - - # Test diagonals - check("ii", H) # trace - check("ii->i", H) # diagonal - check('iji->j', I) # non-contiguous trace - - # Test ellipsis - check("i...->...", H) - check("ki,...k->i...", A.t(), B) - check("k...,jk->...", A.t(), B) - check('...ik, ...j -> ...ij', C, x) - check('bik,k...j->i...j', C, torch.rand(5, 3, device=device, dtype=dtype)) - check('i...j, ij... -> ...ij', C, torch.rand(2, 5, 2, 3, device=device, dtype=dtype)) - - # torch.bilinear with discontiguous tensors - l = torch.randn(10, 5, device=device, dtype=dtype).transpose(0, 1) - r = torch.randn(20, 5, device=device, dtype=dtype).transpose(0, 1) - w = torch.randn(15, 10, 20, device=device, dtype=dtype) - check("bn,anm,bm->ba", l, w, r) - - # with strided tensors - check("bn,anm,bm->ba", l[:, ::2], w[:, ::2, ::2], r[:, ::2]) - - def test_einsum_corner_cases(self, device): - def check(equation, *operands, expected_output): - tensors = [torch.tensor(operand, dtype=torch.float32, device=device) if not isinstance(operand, tuple) - else torch.rand(operand, dtype=torch.float32, device=device) for operand in operands] - output = torch.einsum(equation, tensors) - self.assertEqual(output, torch.tensor(expected_output, dtype=torch.float32, device=device)) - - # Test equation variantions - check(' ', 1, expected_output=1) - check(' -> ', 1, expected_output=1) - check(' , ', 2, 2, expected_output=4) - check(' , , ', 2, 2, 2, expected_output=8) - check(' , -> ', 2, 2, expected_output=4) - check(' i ', [1], expected_output=[1]) - check(' i -> ', [1], expected_output=1) - check(' i -> i ', [1], expected_output=[1]) - check(' i , i ', [2], [2], expected_output=4) - check(' i , i -> i ', [2], [2], expected_output=[4]) - - # Test tensors with 0 size dimensions - check('i', [], expected_output=[]) - check(' i j -> j', [[], []], expected_output=[]) - check('ij->i', [[], []], expected_output=[0., 0.]) - check(' i j k , k -> i j ', (3, 0, 6), (6,), expected_output=[[], [], []]) - - # Test broadcasting - check('i,j', [2], [1, 2], expected_output=[[2, 4]]) - check('i,ij->ij', [1, 2], [[1, 2, 3], [2, 3, 4]], expected_output=[[1, 2, 3], [4, 6, 8]]) - - # Test ellipsis broadcasting - check('...', 1, expected_output=1) - check('...->', 1, expected_output=1) - check('...->...', 1, expected_output=1) - check('...', [1], expected_output=[1]) - check('...->', [1], expected_output=1) - check('i...->i', [1], expected_output=[1]) - check('i...->...i', [1], expected_output=[1]) - check('...a->', [[2], [4]], 
expected_output=6) - check('a...b->ab', [[[1], [2]], [[3], [4]]], expected_output=[[3], [7]]) - - def test_einsum_error_cases(self, device): - def check(equation, operands, regex, exception=RuntimeError): - with self.assertRaisesRegex(exception, r'einsum\(\) ' + regex): - torch.einsum(equation, operands) - - x = torch.rand(2) - y = torch.rand(2, 3) - - check('', [], r'must provide at least one operand') - check('. ..', [x], r'found \'.\' for operand 0 that is not part of any ellipsis') - check('... ...', [x], r'found \'.\' for operand 0 for which an ellipsis was already found') - check('A', [x], r'operand subscript must be in range \[a, z\] but found A for operand 0') - check(',', [x], r'fewer operands were provided than specified in the equation') - check('', [x, x], r'more operands were provided than specified in the equation') - check('', [x], r'the number of subscripts in the equation \(0\) does not match the number ' - r'of dimensions \(1\) for operand 0 and no ellipsis was given') - check('ai', [x], r'the number of subscripts in the equation \(2\) does not match the number ' - r'of dimensions \(1\) for operand 0 and no ellipsis was given') - check('ai...', [x], r'the number of subscripts in the equation \(2\) is more than the number ' - r'of dimensions \(1\) for operand 0') - check('a->... .', [x], r'found \'.\' for output but an ellipsis \(...\) was already found') - check('a->..', [x], r'found \'.\' for output that is not part of any ellipsis \(...\)') - check('a->A', [x], r'subscripts must be in range \[a, z\] but found A for the output') - check('a->aa', [x], r'output subscript a appears more than once in the output') - check('a->i', [x], r'output subscript i does not appear in the equation for any input operand') - check('aa', [y], r'subscript a is repeated for operand 0 but the sizes don\'t match, 3 != 2') - check('a, ba', [x, y], r'operands do not broadcast with remapped shapes \[original->remapped\]: ' - r'\[2\]->\[1, 2\] \[2, 3\]->\[2, 3\]') - def triangular_solve_test_helper(self, A_dims, b_dims, upper, unitriangular, device, dtype): triangle_function = torch.triu if upper else torch.tril @@ -3385,6 +3240,80 @@ def run_test(pivot): if self.device_type == 'cuda': run_test(False) + @onlyCPU + @slowTest + @dtypes(torch.double) + def test_einsum(self, device: torch.device, dtype: torch.dtype) -> None: + # test cases taken from https://gist.github.com/rockt/15ee013889d65342088e9260a377dc8f + x = torch.randn(5, dtype=dtype, device=device) + y = torch.randn(7, dtype=dtype, device=device) + A = torch.randn(3, 5, dtype=dtype, device=device) + B = torch.randn(2, 5, dtype=dtype, device=device) + C = torch.randn(2, 3, 5, dtype=dtype, device=device) + D = torch.randn(2, 5, 7, dtype=dtype, device=device) + E = torch.randn(7, 9, dtype=dtype, device=device) + F = torch.randn(2, 3, 5, 7, dtype=dtype, device=device) + G = torch.randn(7, 11, 13, dtype=dtype, device=device) + H = torch.randn(4, 4, dtype=dtype, device=device) + I = torch.randn(3, 4, 4, dtype=dtype, device=device) + l = torch.randn(5, 10, dtype=dtype, device=device) + r = torch.randn(5, 20, dtype=dtype, device=device) + w = torch.randn(30, 10, 20, dtype=dtype, device=device) + test_list: List[Union[Tuple[str, torch.Tensor], + Tuple[str, torch.Tensor, torch.Tensor], + Tuple[str, torch.Tensor, torch.Tensor, torch.Tensor]]] = [ + # -- Vector + ("i->", x), # sum + ("i,i->", x, x), # dot + ("i,i->i", x, x), # vector element-wise mul + ("i,j->ij", x, y), # outer + # -- Matrix + ("ij->ji", A), # transpose + ("ij->j", A), # row sum + 
("ij->i", A), # col sum + ("ij,ij->ij", A, A), # matrix element-wise mul + ("ij,j->i", A, x), # matrix vector multiplication + ("ij,kj->ik", A, B), # matmul + ("ij,ab->ijab", A, E), # matrix outer product + # -- Tensor + ("aij,ajk->aik", C, D), # batch matmul + ("ijk,jk->i", C, A), # tensor matrix contraction + ("aij,jk->aik", D, E), # tensor matrix contraction + ("abcd,dfg->abcfg", F, G), # tensor tensor contraction + ("ijk,jk->ik", C, A), # tensor matrix contraction with double indices + ("ijk,jk->ij", C, A), # tensor matrix contraction with double indices + ("ijk,ik->j", C, B), # non contiguous + ("ijk,ik->jk", C, B), # non contiguous with double indices + # -- Diagonal + ("ii", H), # trace + ("ii->i", H), # diagonal + # -- Ellipsis + ("i...->...", H), + ("ki,...k->i...", A.t(), B), + ("k...,jk", A.t(), B), + ("...ii->...i", I), # batch diagonal + # -- Other + ("bn,anm,bm->ba", l, w, r), # as torch.bilinear + ("... ii->...i ", I), # batch diagonal with spaces + ] + for test in test_list: + actual = torch.einsum(test[0], test[1:]) + expected = np.einsum(test[0], *[t.numpy() for t in test[1:]]) + self.assertEqual(expected.shape, actual.shape, msg=test[0]) + self.assertEqual(expected, actual, msg=test[0]) + # test vararg + actual2 = torch.einsum(test[0], *test[1:]) + self.assertEqual(expected.shape, actual2.shape, msg=test[0]) + self.assertEqual(expected, actual2, msg=test[0]) + + def do_einsum(*args): + return torch.einsum(test[0], args) + # FIXME: following test cases fail gradcheck + if test[0] not in {"i,i->", "i,i->i", "ij,ij->ij"}: + gradcheck_inps = tuple(t.detach().requires_grad_() for t in test[1:]) + self.assertTrue(torch.autograd.gradcheck(do_einsum, gradcheck_inps)) + self.assertTrue(A._version == 0) # check that we do not use inplace ops + @skipCUDAIfNoMagma @skipCPUIfNoLapack @dtypes(torch.double) diff --git a/torch/functional.py b/torch/functional.py index 72739018889c..62076a9dc29a 100644 --- a/torch/functional.py +++ b/torch/functional.py @@ -296,107 +296,76 @@ def lu_unpack(LU_data, LU_pivots, unpack_data=True, unpack_pivots=True): def einsum(equation, *operands): r"""einsum(equation, *operands) -> Tensor - Sums the product of the elements of the input :attr:`operands` along dimensions specified using a notation - based on the Einstein summation convention. - - Einsum allows computing many common multi-dimensional linear algebraic array operations by representing them - in a short-hand format based on the Einstein summation convention, given by :attr:`equation`. The details of - this format are described below, but the general idea is to label every dimension of the input :attr:`operands` - with some subscript and define which subscripts are part of the output. The output is then computed by summing - the product of the elements of the :attr:`operands` along the dimensions whose subscripts are not part of the - output. For example, matrix multiplication can be computed using einsum as `torch.einsum("ij,jk->ik", A, B)`. - Here, j is the summation subscript and i and k the output subscripts (see section below for more details on why). - - Equation: - - The :attr:`equation` string specifies the subscripts (lower case letters `['a', 'z']`) for each dimension of - the input :attr:`operands` in the same order as the dimensions, separating subcripts for each operand by a - comma (','), e.g. `'ij,jk'` specify subscripts for two 2D operands. The dimensions labeled with the same subscript - must be broadcastable, that is, their size must either match or be `1`. 
The exception is if a subscript is - repeated for the same input operand, in which case the dimensions labeled with this subscript for this operand - must match in size and the operand will be replaced by its diagonal along these dimensions. The subscripts that - appear exactly once in the :attr:`equation` will be part of the output, sorted in increasing alphabetical order. - The output is computed by multiplying the input :attr:`operands` element-wise, with their dimensions aligned based - on the subscripts, and then summing out the dimensions whose subscripts are not part of the output. - - Optionally, the output subscripts can be explicitly defined by adding an arrow ('->') at the end of the equation - followed by the subscripts for the output. For instance, the following equation computes the transpose of a - matrix multiplication: 'ij,jk->ki'. The output subscripts must appear at least once for some input operand and - at most once for the output. - - Ellipsis ('...') can be used in place of subscripts to broadcast the dimensions covered by the ellipsis. - Each input operand may contain at most one ellipsis which will cover the dimensions not covered by subscripts, - e.g. for an input operand with 5 dimensions, the ellipsis in the equation `'ab...c'` cover the third and fourth - dimensions. The ellipsis does not need to cover the same number of dimensions across the :attr:`operands` but the - 'shape' of the ellipsis (the size of the dimensions covered by them) must broadcast together. If the output is not - explicitly defined with the arrow ('->') notation, the ellipsis will come first in the output (left-most dimensions), - before the subscript labels that appear exactly once for the input operands. e.g. the following equation implements - batch matrix multiplication `'...ij,...jk'`. - - A few final notes: the equation may contain whitespaces between the different elements (subscripts, ellipsis, - arrow and comma) but something like `'. . .'` is not valid. An empty string `''` is valid for scalar operands. - - .. note:: - - ``torch.einsum`` handles ellipsis ('...') differently from NumPy in that it allows dimensions - covered by the ellipsis to be summed over, that is, ellipsis are not required to be part of the output. - - .. note:: - - This function does not optimize the given expression, so a different formula for the same computation may - run faster or consume less memory. Projects like opt_einsum (https://optimized-einsum.readthedocs.io/en/stable/) - can optimize the formula for you. - - Args: - equation (string): The subscripts for the Einstein summation. - operands (Tensor): The operands to compute the Einstein sum of. 
- - Examples:: - - # trace - >>> torch.einsum('ii', torch.randn(4, 4)) - tensor(-1.2104) - - # diagonal - >>> torch.einsum('ii->i', torch.randn(4, 4)) - tensor([-0.1034, 0.7952, -0.2433, 0.4545]) - - # outer product - >>> x = torch.randn(5) - >>> y = torch.randn(4) - >>> torch.einsum('i,j->ij', x, y) - tensor([[ 0.1156, -0.2897, -0.3918, 0.4963], - [-0.3744, 0.9381, 1.2685, -1.6070], - [ 0.7208, -1.8058, -2.4419, 3.0936], - [ 0.1713, -0.4291, -0.5802, 0.7350], - [ 0.5704, -1.4290, -1.9323, 2.4480]]) - - # batch matrix multiplication - >>> As = torch.randn(3,2,5) - >>> Bs = torch.randn(3,5,4) - >>> torch.einsum('bij,bjk->bik', As, Bs) - tensor([[[-1.0564, -1.5904, 3.2023, 3.1271], - [-1.6706, -0.8097, -0.8025, -2.1183]], - - [[ 4.2239, 0.3107, -0.5756, -0.2354], - [-1.4558, -0.3460, 1.5087, -0.8530]], - - [[ 2.8153, 1.8787, -4.3839, -1.2112], - [ 0.3728, -2.1131, 0.0921, 0.8305]]]) - - # batch permute - >>> A = torch.randn(2, 3, 4, 5) - >>> torch.einsum('...ij->...ji', A).shape - torch.Size([2, 3, 5, 4]) - - # equivalent to torch.nn.functional.bilinear - >>> A = torch.randn(3,5,4) - >>> l = torch.randn(2,5) - >>> r = torch.randn(2,4) - >>> torch.einsum('bn,anm,bm->ba', l, A, r) - tensor([[-0.3430, -5.2405, 0.4494], - [ 0.3311, 5.5201, -3.0356]]) - """ +This function provides a way of computing multilinear expressions (i.e. sums of products) using the +Einstein summation convention. + +Args: + equation (string): The equation is given in terms of lower case letters (indices) to be associated + with each dimension of the operands and result. The left hand side lists the operands + dimensions, separated by commas. There should be one index letter per tensor dimension. + The right hand side follows after `->` and gives the indices for the output. + If the `->` and right hand side are omitted, it implicitly defined as the alphabetically + sorted list of all indices appearing exactly once in the left hand side. + The indices not apprearing in the output are summed over after multiplying the operands + entries. + If an index appears several times for the same operand, a diagonal is taken. + Ellipses `...` represent a fixed number of dimensions. If the right hand side is inferred, + the ellipsis dimensions are at the beginning of the output. + operands (Tensor): The operands to compute the Einstein sum of. + +.. note:: + + This function does not optimize the given expression, so a different formula for the same computation may + run faster or consume less memory. Projects like opt_einsum (https://optimized-einsum.readthedocs.io/en/stable/) + can optimize the formula for you. 
+ +Examples:: + + >>> x = torch.randn(5) + >>> y = torch.randn(4) + >>> torch.einsum('i,j->ij', x, y) # outer product + tensor([[-0.0570, -0.0286, -0.0231, 0.0197], + [ 1.2616, 0.6335, 0.5113, -0.4351], + [ 1.4452, 0.7257, 0.5857, -0.4984], + [-0.4647, -0.2333, -0.1883, 0.1603], + [-1.1130, -0.5588, -0.4510, 0.3838]]) + + + >>> A = torch.randn(3,5,4) + >>> l = torch.randn(2,5) + >>> r = torch.randn(2,4) + >>> torch.einsum('bn,anm,bm->ba', l, A, r) # compare torch.nn.functional.bilinear + tensor([[-0.3430, -5.2405, 0.4494], + [ 0.3311, 5.5201, -3.0356]]) + + + >>> As = torch.randn(3,2,5) + >>> Bs = torch.randn(3,5,4) + >>> torch.einsum('bij,bjk->bik', As, Bs) # batch matrix multiplication + tensor([[[-1.0564, -1.5904, 3.2023, 3.1271], + [-1.6706, -0.8097, -0.8025, -2.1183]], + + [[ 4.2239, 0.3107, -0.5756, -0.2354], + [-1.4558, -0.3460, 1.5087, -0.8530]], + + [[ 2.8153, 1.8787, -4.3839, -1.2112], + [ 0.3728, -2.1131, 0.0921, 0.8305]]]) + + >>> A = torch.randn(3, 3) + >>> torch.einsum('ii->i', A) # diagonal + tensor([-0.7825, 0.8291, -0.1936]) + + >>> A = torch.randn(4, 3, 3) + >>> torch.einsum('...ii->...i', A) # batch diagonal + tensor([[-1.0864, 0.7292, 0.0569], + [-0.9725, -1.0270, 0.6493], + [ 0.5832, -1.1716, -1.5084], + [ 0.4041, -1.1690, 0.8570]]) + + >>> A = torch.randn(2, 3, 4, 5) + >>> torch.einsum('...ij->...ji', A).shape # batch permute + torch.Size([2, 3, 5, 4]) +""" if not torch.jit.is_scripting(): if any(type(t) is not Tensor for t in operands) and has_torch_function(operands): return handle_torch_function(einsum, operands, equation, *operands) From 88ebf6f894a61039d2ac0077438b0ad3637c7a71 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Mon, 7 Dec 2020 11:54:03 -0800 Subject: [PATCH 014/250] Revert D25304229: [pytorch][PR] Add type annotations to torch.onnx.* modules Test Plan: revert-hammer Differential Revision: D25304229 (https://github.com/pytorch/pytorch/commit/8bc6023d7a822ea6936b7460027f29558149008d) Original commit changeset: b01b21ddbf86 fbshipit-source-id: bc3308176e2c70423f29f694e9db94828213e7d6 --- mypy.ini | 24 ++++++++++++ torch/_C/__init__.pyi.in | 68 +-------------------------------- torch/_C/_onnx.pyi | 1 - torch/onnx/symbolic_helper.py | 23 +++++------ torch/onnx/symbolic_opset8.py | 2 +- torch/onnx/symbolic_opset9.py | 9 ++--- torch/onnx/symbolic_registry.py | 5 +-- torch/onnx/utils.py | 24 +++++------- 8 files changed, 51 insertions(+), 105 deletions(-) diff --git a/mypy.ini b/mypy.ini index 0b9f5497162c..f4b37f15a820 100644 --- a/mypy.ini +++ b/mypy.ini @@ -143,6 +143,30 @@ ignore_errors = True [mypy-torch.nn.intrinsic.qat.modules.conv_fused] ignore_errors = True +[mypy-torch.onnx.operators] +ignore_errors = True + +[mypy-torch.onnx.symbolic_opset8] +ignore_errors = True + +[mypy-torch.onnx.symbolic_opset9] +ignore_errors = True + +[mypy-torch.onnx.symbolic_opset11] +ignore_errors = True + +[mypy-torch.onnx.symbolic_caffe2] +ignore_errors = True + +[mypy-torch.onnx.symbolic_helper] +ignore_errors = True + +[mypy-torch.onnx.symbolic_registry] +ignore_errors = True + +[mypy-torch.onnx.utils] +ignore_errors = True + [mypy-torch.multiprocessing.pool] ignore_errors = True diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in index 1452718ed793..cbb5b2452e21 100644 --- a/torch/_C/__init__.pyi.in +++ b/torch/_C/__init__.pyi.in @@ -165,7 +165,7 @@ def wait(fut: Future) -> Any: ... def _collect_all(futures: List[Future]) -> Future: ... def unify_type_list(types: List[JitType]) -> JitType: ... 
-def _freeze_module(module: ScriptModule, preserved_attrs: List[str] = [], freeze_interfaces: _bool = True) -> ScriptModule: ... +def _freeze_module(module: ScriptModule, preserved_attrs: List[str], freeze_interfaces: _bool = True) -> ScriptModule: ... def _is_tracing() -> _bool: ... def _jit_init() -> _bool: ... def _jit_flatten(arg: Any) -> Tuple[List[Tensor], IODescriptor]: ... @@ -217,8 +217,6 @@ def _jit_get_trigger_value(trigger_name: str) -> _int: ... # Defined in torch/csrc/jit/python/script_init.cpp ResolutionCallback = Callable[[str], Callable[..., Any]] -# Defined in torch/csrc/jit/python/script_init.cpp -# and torch/csrc/jit/python/init.cpp def _create_function_from_graph(qualname: str, graph: Graph) -> Graph: ... def _debug_set_autodiff_subgraph_inlining(disabled: _bool) -> None: ... def _ivalue_tags_match(lhs: ScriptModule, rhs: ScriptModule) -> _bool: ... @@ -248,54 +246,6 @@ def _resolve_type_from_object(obj: Any, range: SourceRange, rcb: ResolutionCallb def _create_module_with_type(ty: JitType) -> ScriptModule: ... def _run_emit_module_hook(m: ScriptModule): ... def _replace_overloaded_method_decl(overload_decl: Decl, implementation_def: Def, new_name: str) -> Def: ... - -def _jit_pass_lower_all_tuples(graph: Graph) -> None: ... -def _jit_pass_onnx_set_dynamic_input_shape(graph: Graph, dynamic_axes: Dict[str, Dict[_int, str]], input_names: List[str]) -> None: ... -def _jit_pass_onnx_graph_shape_type_inference(graph: Graph, opset_version: _int) -> None: ... -def _jit_pass_onnx_assign_output_shape(graph: Graph, tensors: List[Tensor], onnx_shape_inference: _bool = False) -> None: ... -def _jit_pass_fixup_onnx_loop_node_inputs(n: Node) -> None: ... -def _jit_pass_onnx_remove_inplace_ops_for_onnx(graph: Graph) -> None: ... -def _jit_pass_remove_inplace_ops(graph: Graph) -> None: ... -def _jit_pass_canonicalize_graph_fuser_ops(graph: Graph) -> None: ... -def _jit_pass_peephole(graph: Graph, addmm_fusion_enabled: _bool) -> None: ... -def _jit_pass_fuse_addmm(graph: Graph) -> None: ... -def _jit_pass_onnx_preprocess(graph: Graph) -> None: ... -def _jit_pass_onnx_prepare_inplace_ops_for_onnx(graph: Graph) -> None: ... -def _jit_pass_prepare_division_for_onnx(graph: Graph) -> None: ... -def _jit_pass_onnx_remove_print(graph: Graph) -> None: ... -def _jit_pass_onnx_preprocess_caffe2(graph: Graph) -> None: ... -def _jit_pass_onnx_unpack_quantized_weights( - graph: Graph, - paramsDict: Dict[str, IValue] -) -> Dict[str, IValue]: ... -def _jit_pass_onnx_quantization_insert_permutes( - graph: Graph, - paramsDict: Dict[str, IValue] -) -> Dict[str, IValue]: ... -def _jit_pass_custom_pattern_based_rewrite_graph(pattern: str, fused_node_name: str, graph: Graph) -> None: ... -def _jit_pass_erase_number_types(graph: Graph) -> None: ... -def _jit_pass_onnx(graph: Graph, _jit_pass_onnx: _onnx.OperatorExportTypes) -> Graph: ... -def _jit_pass_onnx_scalar_type_analysis(graph: Graph) -> None: ... -def _jit_pass_onnx_peephole(graph: Graph, opset_version: _int, fixed_batch_size: _bool) -> None: ... -def _jit_pass_dce_allow_deleting_nodes_with_side_effects(graph: Graph) -> None: ... -def _jit_pass_onnx_function_substitution(graph: Graph) -> None: ... -def _jit_pass_lower_graph(graph: Graph, m: Module) -> Tuple[Graph, List[IValue]]: ... -def _jit_pass_inline_fork_wait(graph: Graph) -> None: ... -def _jit_pass_onnx_eval_peephole(graph: Graph, paramsDict: Dict[str, IValue]) -> Dict[str, IValue]: ... 
-def _jit_pass_onnx_constant_fold(graph: Graph, paramsDict: Dict[str, IValue], opset_version: _int) -> Dict[str, IValue]: ... -def _jit_pass_onnx_eliminate_unused_items(graph: Graph, paramsDict: Dict[str, IValue]) -> Dict[str, IValue]: ... -def _jit_pass_onnx_cast_all_constant_to_floating(graph: Graph) -> None: ... -def _jit_pass_filter_non_tensor_arguments(params: Dict[str, IValue]) -> Dict[str, Tensor]: ... -def _jit_decay_packed_param_input_types(graph: Graph) -> None: ... -def _jit_pass_onnx_node_shape_type_inference(n: Node, opset_version: _int) -> None: ... -def _jit_pass_onnx_block( - old_block: Block, - new_block: Block, - operator_export_type: _onnx.OperatorExportTypes, - env: Dict[Value, Value] -) -> None: ... -def _jit_pass_fixup_onnx_controlflow_node(n: Node, opset_version: _int) -> Node: ... - def _jit_script_interface_compile(name: str, class_def: ClassDef, rcb: ResolutionCallback, is_module: _bool): ... def _jit_script_compile_overload( qualname: str, @@ -331,18 +281,8 @@ def import_ir_module_from_buffer( extra_files: Dict[str, Any] ) -> ScriptModule: ... -def _assign_output_shapes(graph: Graph, inputs: List[Tensor]) -> Graph: ... -def _check_onnx_proto(proto: str) -> None: ... -def _propagate_and_assign_input_shapes( - graph: Graph, - inputs: Tuple[Tensor, ...], - with_grad: _bool, - propagate: _bool -) -> Graph: ... - # Defined in torch/torch/csrc/jit/ir/ir.h class Graph: - def eraseInput(self, i: _int) -> None: ... ... # Defined in torch/csrc/jit/ir/ir.h @@ -426,8 +366,8 @@ class ScriptFunction: def qualified_name(self) -> str: ... class ScriptMethod: - graph: Graph ... + class ModuleDict: def __init__(self, mod: ScriptModule) -> None: ... def items(self) -> List[Tuple[str, Any]]: ... @@ -438,10 +378,6 @@ class ParameterDict: class BufferDict: def __init__(self, mod: ScriptModule) -> None: ... -# Defined in torch/csrc/jit/api/module.h -class Module: - ... - # Defined in torch/csrc/Module.cpp def _initExtension(shm_manager_path: str) -> None: ... # THPModule_initExtension def _autograd_init() -> _bool: ... # THPAutograd_initExtension diff --git a/torch/_C/_onnx.pyi b/torch/_C/_onnx.pyi index 7ab3cd9c567d..51f16566ce6c 100644 --- a/torch/_C/_onnx.pyi +++ b/torch/_C/_onnx.pyi @@ -29,7 +29,6 @@ class OperatorExportTypes(Enum): ONNX_ATEN = ... ONNX_ATEN_FALLBACK = ... RAW = ... - ONNX_FALLTHROUGH = ... class TrainingMode(Enum): EVAL = ... 
diff --git a/torch/onnx/symbolic_helper.py b/torch/onnx/symbolic_helper.py index 8fd8ce3ea760..5e9430f995f8 100644 --- a/torch/onnx/symbolic_helper.py +++ b/torch/onnx/symbolic_helper.py @@ -2,7 +2,6 @@ import torch import warnings from sys import maxsize as maxsize -from typing import Set import torch.onnx # This import monkey-patches graph manipulation methods on Graph, used for the @@ -126,7 +125,7 @@ def decorator(fn): def wrapper(g, *args, **kwargs): # some args may be optional, so the length may be smaller assert len(arg_descriptors) >= len(args) - args = [_parse_arg(arg, arg_desc) for arg, arg_desc in zip(args, arg_descriptors)] # type: ignore + args = [_parse_arg(arg, arg_desc) for arg, arg_desc in zip(args, arg_descriptors)] # only support _outputs in kwargs assert len(kwargs) <= 1 if len(kwargs) == 1: @@ -233,11 +232,11 @@ def _select_helper(g, self, dim, index, apply_reshape=True): def _slice_helper(g, input, axes, starts, ends, steps=None, dynamic_slice=False): if _export_onnx_opset_version <= 9: - from torch.onnx.symbolic_opset9 import _slice as _slice9 - return _slice9(g, input, axes, starts, ends) + from torch.onnx.symbolic_opset9 import _slice + return _slice(g, input, axes, starts, ends) else: - from torch.onnx.symbolic_opset10 import _slice as _slice10 - return _slice10(g, input, axes, starts, ends, steps, dynamic_slice) + from torch.onnx.symbolic_opset10 import _slice + return _slice(g, input, axes, starts, ends, steps, dynamic_slice) def _hardtanh_helper(g, input, min_val, max_val): if _export_onnx_opset_version <= 10: @@ -381,7 +380,7 @@ def _interpolate_get_scales_and_mode(g, input, size, scale_factor, mode , align_ size = g.op("Concat", *size, axis_i=0) scale_factor = _interpolate_size_to_scales(g, input, size, dim) else: - return _unimplemented("interpolate", "Both size and scales are None in __interpolate") + return _unimplemented("Both size and scales are None in __interpolate") return scale_factor, mode @@ -389,7 +388,7 @@ def _unbind_helper(g, self, dim, _outputs): if _export_onnx_opset_version <= 9: from torch.onnx.symbolic_opset9 import unbind else: - from torch.onnx.symbolic_opset11 import unbind # type: ignore[no-redef] + from torch.onnx.symbolic_opset11 import unbind return unbind(g, self, dim, _outputs) @@ -397,8 +396,7 @@ def _scatter_helper(g, self, dim, index, src): if _export_onnx_opset_version <= 10: from torch.onnx.symbolic_opset9 import scatter else: - # for mypy, scatter was imported two lines above - from torch.onnx.symbolic_opset11 import scatter # type: ignore + from torch.onnx.symbolic_opset11 import scatter return scatter(g, self, dim, index, src) @@ -446,8 +444,7 @@ def _index_fill_reshape_helper(g, self, dim, index): if _export_onnx_opset_version <= 10: from torch.onnx.symbolic_opset9 import scatter else: - # for mypy, scatter was imported two lines above - from torch.onnx.symbolic_opset11 import scatter # type: ignore + from torch.onnx.symbolic_opset11 import scatter if self.type().dim() is None: return _unimplemented("index_fill", "input rank not accesible") @@ -635,4 +632,4 @@ def _cast_func_template(to_i, g, input, non_blocking): # Global set to store the list of quantized operators in the network. # This is currently only used in the conversion of quantized ops from PT -> C2 via ONNX. 
-_quantized_ops: Set[int] = set() +_quantized_ops = set() diff --git a/torch/onnx/symbolic_opset8.py b/torch/onnx/symbolic_opset8.py index e4023dab2320..c0c1d48ebec0 100644 --- a/torch/onnx/symbolic_opset8.py +++ b/torch/onnx/symbolic_opset8.py @@ -4,7 +4,7 @@ import torch.onnx.symbolic_opset9 as sym_opset9 from torch.onnx.symbolic_helper import parse_args, _unimplemented, _block_list_in_opset, _try_get_scalar_type -from torch.onnx.symbolic_opset9 import _cast_Float # type: ignore +from torch.onnx.symbolic_opset9 import _cast_Float import warnings diff --git a/torch/onnx/symbolic_opset9.py b/torch/onnx/symbolic_opset9.py index 8630f48a62ad..e395ce5c703f 100644 --- a/torch/onnx/symbolic_opset9.py +++ b/torch/onnx/symbolic_opset9.py @@ -13,8 +13,6 @@ import torch.onnx.symbolic_helper as sym_help from torch.onnx.symbolic_helper import parse_args, _parse_arg, _unimplemented -from typing import Optional - import numpy import math import warnings @@ -313,7 +311,7 @@ def _maybe_cast_reduce_op_input(g, self): if dtype is not None: # pytorch reduce-ops cast all other integral types to int64 if not sym_help._is_fp(self) and not (dtype == 'Long'): - self = _cast_Long(g, self, False) # type: ignore + self = _cast_Long(g, self, False) return self @@ -2094,7 +2092,7 @@ def _pack_padded_sequence(g, input, lengths, batch_first): # It's really only necessary because those operators expand to something that # only works with int32 types in Caffe2... if lengths.type().scalarType() != 'Int': - lengths = _cast_Int(g, lengths, False) # type: ignore + lengths = _cast_Int(g, lengths, False) return g.op("prim::PackPadded", input, lengths, outputs=2) @@ -2438,7 +2436,7 @@ def _get_arange_dtype(dtype): def masked_fill(g, self, mask, value): - mask = _cast_Bool(g, mask, False) # type: ignore + mask = _cast_Bool(g, mask, False) value = sym_help._maybe_get_scalar(value) return g.op('Where', mask, sym_help._if_scalar_type_as(g, value, self), self) @@ -2736,7 +2734,6 @@ def as_strided(g, self, sizes, strides, offset=None): sizes = sym_help._maybe_get_const(sizes, 'is') rank = len(strides) self_1d = g.op("Reshape", self, g.op("Constant", value_t=torch.tensor([-1], dtype=torch.int64))) - ind: Optional[torch.Tensor] if not sym_help._is_value(sizes): ind = torch.tensor([0], dtype=torch.long) for i, (size, stride) in enumerate(zip(sizes, strides)): diff --git a/torch/onnx/symbolic_registry.py b/torch/onnx/symbolic_registry.py index c059e8f2eb31..48114d6c472b 100644 --- a/torch/onnx/symbolic_registry.py +++ b/torch/onnx/symbolic_registry.py @@ -1,7 +1,6 @@ import warnings import importlib from inspect import getmembers, isfunction -from typing import Dict, Tuple, Any, Union # The symbolic registry "_registry" is a dictionary that maps operators # (for a specific domain and opset version) to their symbolic functions. @@ -9,9 +8,9 @@ # The keys are tuples (domain, version), (where domain is a string, and version is an int), # and the operator's name (string). 
# The map's entries are as follows : _registry[(domain, version)][op_name] = op_symbolic -_registry: Dict[Tuple[str, int], Dict] = {} +_registry = {} -_symbolic_versions: Dict[Union[int, str], Any] = {} +_symbolic_versions = {} from torch.onnx.symbolic_helper import _onnx_stable_opsets for opset_version in _onnx_stable_opsets: module = importlib.import_module('torch.onnx.symbolic_opset{}'.format(opset_version)) diff --git a/torch/onnx/utils.py b/torch/onnx/utils.py index 3fe19a56c124..5c41306b9ee2 100644 --- a/torch/onnx/utils.py +++ b/torch/onnx/utils.py @@ -18,7 +18,6 @@ from torch.jit import _unique_state_dict from torch.onnx import ONNX_ARCHIVE_MODEL_PROTO_NAME, ExportTypes, OperatorExportTypes, TrainingMode from torch._C import ListType, OptionalType, _propagate_and_assign_input_shapes, _check_onnx_proto -from typing import Union, Tuple, List # the flag to tell the user whether it's in the middle of ONNX export or not @@ -77,7 +76,7 @@ def export(model, args, f, export_params=True, verbose=False, training=None, if aten or export_raw_ir: assert operator_export_type is None assert aten ^ export_raw_ir - operator_export_type = OperatorExportTypes.ONNX_ATEN if aten else OperatorExportTypes.RAW + operator_export_type = OperatorExportTypes.ATEN if aten else OperatorExportTypes.RAW elif operator_export_type is None: if torch.onnx.PYTORCH_ONNX_CAFFE2_BUNDLE: operator_export_type = OperatorExportTypes.ONNX_ATEN_FALLBACK @@ -352,7 +351,6 @@ def _trace_and_get_graph_from_model(model, args): def _create_jit_graph(model, args, _retain_param_name, use_new_jit_passes): torch_out = None - params: Union[List, Tuple] if isinstance(model, torch.jit.ScriptModule): try: graph = model.forward.graph @@ -444,7 +442,7 @@ def _model_to_graph(model, args, verbose=False, param_names = input_and_param_names[len(input_and_param_names) - len(params):] params_dict = dict(zip(param_names, params)) - if training is None or training == TrainingMode.EVAL: + if training is None or training == TrainingMode.EVAL or (training == TrainingMode.PRESERVE and not is_originally_training): params_dict = torch._C._jit_pass_onnx_eval_peephole(graph, params_dict) if do_constant_folding and _export_onnx_opset_version in torch.onnx.constant_folding_opset_versions: @@ -478,7 +476,7 @@ def export_to_pretty_string(model, args, f, export_params=True, verbose=False, t if aten or export_raw_ir: assert operator_export_type is None assert aten ^ export_raw_ir - operator_export_type = OperatorExportTypes.ONNX_ATEN if aten else OperatorExportTypes.RAW + operator_export_type = OperatorExportTypes.ATEN if aten else OperatorExportTypes.RAW elif operator_export_type is None: operator_export_type = OperatorExportTypes.ONNX return _export_to_pretty_string(model, args, f, export_params, verbose, training, @@ -1053,10 +1051,6 @@ def _graph_constant(g, value, dims, type, *args, **kwargs): dims = [1] isscalar = True type = type.lower() - tensor: Union[torch.CharTensor, torch.ShortTensor, - torch.IntTensor, torch.LongTensor, - torch.HalfTensor, torch.FloatTensor, - torch.DoubleTensor] if type == "char": tensor = torch.CharTensor(*dims) elif type == "short": @@ -1074,7 +1068,7 @@ def _graph_constant(g, value, dims, type, *args, **kwargs): else: raise ValueError("Unknown type, type should be one of the following strings: " "char, short, int, long, half, float, double") - tensor.fill_(value) # type: ignore + tensor.fill_(value) if isscalar: return g.op("Constant", *args, value_z=tensor, **kwargs) return g.op("Constant", *args, value_t=tensor, **kwargs) @@ 
-1147,8 +1141,8 @@ def _validate_dynamic_axes(dynamic_axes, model, input_names, output_names): dynamic_axes[key] = value_dict -torch._C.Graph.op = _graph_op # type: ignore -torch._C.Graph.at = _graph_at # type: ignore -torch._C.Block.op = _block_op # type: ignore -torch._C.Graph.constant = _graph_constant # type: ignore -torch._C.Node.__getitem__ = _node_getitem # type: ignore +torch._C.Graph.op = _graph_op +torch._C.Graph.at = _graph_at +torch._C.Block.op = _block_op +torch._C.Graph.constant = _graph_constant +torch._C.Node.__getitem__ = _node_getitem From d6b5f3ad98a883941be88029a36956e7b879a605 Mon Sep 17 00:00:00 2001 From: Rohan Varma Date: Mon, 7 Dec 2020 14:28:15 -0800 Subject: [PATCH 015/250] Add object-based collective APIs to public docs (#48909) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48909 Adds these new APIs to the documentation ghstack-source-id: 117965961 Test Plan: CI Reviewed By: mrshenli Differential Revision: D25363279 fbshipit-source-id: af6889d377f7b5f50a1a77a36ab2f700e5040150 --- docs/source/distributed.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/docs/source/distributed.rst b/docs/source/distributed.rst index f5bce396054b..b35a34fc0265 100644 --- a/docs/source/distributed.rst +++ b/docs/source/distributed.rst @@ -384,16 +384,24 @@ Collective functions .. autofunction:: broadcast +.. autofunction:: broadcast_object_list + .. autofunction:: all_reduce .. autofunction:: reduce .. autofunction:: all_gather +.. autofunction:: all_gather_object + .. autofunction:: gather +.. autofunction:: gather_object + .. autofunction:: scatter +.. autofunction:: scatter_object_list + .. autofunction:: reduce_scatter .. autofunction:: all_to_all From b77ca9e829a7f919c06446ee8de1ca6dd540a134 Mon Sep 17 00:00:00 2001 From: Rohan Varma Date: Mon, 7 Dec 2020 14:28:15 -0800 Subject: [PATCH 016/250] [Docs] Add examples for new object-based c10d APIs (#43932) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/43932 Adds some basic examples to the documentation for each of the newly added object-based collectibves. ghstack-source-id: 117965966 Test Plan: CI Reviewed By: mrshenli Differential Revision: D23441838 fbshipit-source-id: 91344612952cfcaa71f08ccf2a2c9ed162ca9c89 --- torch/distributed/distributed_c10d.py | 52 +++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py index 13a950024af9..1081c6ee0e44 100644 --- a/torch/distributed/distributed_c10d.py +++ b/torch/distributed/distributed_c10d.py @@ -1393,6 +1393,16 @@ def all_gather_object(object_list, obj, group=group.WORLD): known to be insecure. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling. Only call this function with data you trust. + + Example:: + >>> # Note: Process group initialization omitted on each rank. + >>> import torch.distributed as dist + >>> # Assumes world_size of 3. + >>> gather_objects = ["foo", 12, {1: 2}] # any picklable object + >>> output = [None for _ in gather_objects] + >>> dist.all_gather_object(output, gather_objects[dist.get_rank()]) + >>> output + ['foo', 12, {1: 2}] """ if _rank_not_in_group(group): return @@ -1467,6 +1477,21 @@ def gather_object(obj, object_gather_list=None, dst=0, group=group.WORLD): known to be insecure. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling. Only call this function with data you trust. 
+ + Example:: + >>> # Note: Process group initialization omitted on each rank. + >>> import torch.distributed as dist + >>> # Assumes world_size of 3. + >>> gather_objects = ["foo", 12, {1: 2}] # any picklable object + >>> output = [None for _ in gather_objects] + >>> dist.gather_object( + gather_objects[dist.get_rank()], + output if dist.get_rank() == 0 else None, + dst=0 + ) + >>> # On rank 0 + >>> output + ['foo', 12, {1: 2}] """ if _rank_not_in_group(group): return @@ -1556,6 +1581,18 @@ def broadcast_object_list(object_list, src, group=group.WORLD): is known to be insecure. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling. Only call this function with data you trust. + + Example:: + >>> # Note: Process group initialization omitted on each rank. + >>> import torch.distributed as dist + >>> if dist.get_rank() == 0: + >>> # Assumes world_size of 3. + >>> objects = ["foo", 12, {1: 2}] # any picklable object + >>> else: + >>> objects = [None, None, None] + >>> dist.broadcast_object_list(objects, src=0) + >>> broadcast_objects + ['foo', 12, {1: 2}] """ if _rank_not_in_group(group): return @@ -1634,6 +1671,21 @@ def scatter_object_list( is known to be insecure. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling. Only call this function with data you trust. + + Example:: + >>> # Note: Process group initialization omitted on each rank. + >>> import torch.distributed as dist + >>> if dist.get_rank() == 0: + >>> # Assumes world_size of 3. + >>> objects = ["foo", 12, {1: 2}] # any picklable object + >>> else: + >>> # Can be any list on non-src ranks, elements are not used. + >>> objects = [None, None, None] + >>> output_list = [None] + >>> dist.scatter_object_list(output_list, objects, src=0) + >>> # Rank i gets objects[i]. For example, on rank 2: + >>> output_list + [{1: 2}] """ if _rank_not_in_group(group): return From f67259fe897bda05a69db54a6b184c5f20bb1368 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Mon, 7 Dec 2020 15:24:33 -0800 Subject: [PATCH 017/250] Fix CI by removing gen_pyi from mypy-stirct.ini (#48961) Summary: Fixes #{issue number} Pull Request resolved: https://github.com/pytorch/pytorch/pull/48961 Reviewed By: janeyx99 Differential Revision: D25383152 Pulled By: malfet fbshipit-source-id: ce0226398522342256d0d701edc13955d1095a0d --- mypy-strict.ini | 1 - 1 file changed, 1 deletion(-) diff --git a/mypy-strict.ini b/mypy-strict.ini index ddd369ebe621..42fc73abf1cc 100644 --- a/mypy-strict.ini +++ b/mypy-strict.ini @@ -35,7 +35,6 @@ files = tools/codegen/gen.py, tools/autograd/gen_trace_type.py, tools/autograd/gen_variable_factories.py, tools/autograd/load_derivatives.py, - tools/pyi/gen_pyi.py, torch/utils/benchmark/utils/common.py, torch/utils/benchmark/utils/timer.py, torch/utils/benchmark/utils/valgrind_wrapper/*.py, From 7629612f9f5a2ad63e67a723a82273b318cf28a7 Mon Sep 17 00:00:00 2001 From: Howard Huang Date: Mon, 7 Dec 2020 16:10:18 -0800 Subject: [PATCH 018/250] Update torch.randint documentation to include missing note (#48787) Summary: Fixes https://github.com/pytorch/pytorch/issues/46497 Includes note about returning dtype torch.int64. 
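For example, the behavior the added note documents (an illustrative snippet, not taken from the patch itself): even when the global default dtype is torch.float32, the returned tensor is integer-typed unless a dtype is passed explicitly.

>>> torch.randint(0, 10, (3,)).dtype
torch.int64
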
Current documentation: https://pytorch.org/docs/stable/generated/torch.randint.html?highlight=randint#torch.randint New documentation: ![image](https://user-images.githubusercontent.com/14858254/101196939-48977d00-3616-11eb-90a5-a7b706e8505f.png) Pull Request resolved: https://github.com/pytorch/pytorch/pull/48787 Test Plan: Built documentation and checked generated docs Reviewed By: ailzhang Differential Revision: D25339421 Pulled By: H-Huang fbshipit-source-id: c2ecaacaeb57971fe7fba0d9d54f3c61b0fd04ce --- torch/_torch_docs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index 56aec4668b0d..51200dc6b406 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -6786,7 +6786,7 @@ def merge_dicts(*dicts): The shape of the tensor is defined by the variable argument :attr:`size`. -.. note: +.. note:: With the global dtype default (``torch.float32``), this function returns a tensor with dtype ``torch.int64``. From e3893b867fd39cf4f10a129ba9f689eebf10f82b Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Mon, 7 Dec 2020 16:13:41 -0800 Subject: [PATCH 019/250] Reenable some BF16 tests on CUDA (#48805) Summary: Fixes #{issue number} Pull Request resolved: https://github.com/pytorch/pytorch/pull/48805 Reviewed By: agolynski Differential Revision: D25375885 Pulled By: ailzhang fbshipit-source-id: 2e19fe725ae9450bd1a2bc4e2d308c59b9f94fac --- test/test_tensor_creation_ops.py | 3 +-- test/test_torch.py | 44 +++++++++++++++++++------------- 2 files changed, 27 insertions(+), 20 deletions(-) diff --git a/test/test_tensor_creation_ops.py b/test/test_tensor_creation_ops.py index b355005b1c69..9be3e6db5bf0 100644 --- a/test/test_tensor_creation_ops.py +++ b/test/test_tensor_creation_ops.py @@ -14,7 +14,7 @@ IS_WINDOWS) from torch.testing._internal.common_device_type import ( instantiate_device_type_tests, deviceCountAtLeast, onlyOnCPUAndCUDA, - onlyCPU, skipCUDAIfNotRocm, largeTensorTest, precisionOverride, dtypes, + onlyCPU, largeTensorTest, precisionOverride, dtypes, onlyCUDA, skipCPUIf, dtypesIfCUDA, dtypesIfCPU) # TODO: refactor tri_tests_args, _compare_trilu_indices, run_additional_tri_tests @@ -2581,7 +2581,6 @@ def test_arange_device_vs_cpu(self, device, dtype): self.assertEqual(cpu_tensor, device_tensor) @onlyCUDA - @skipCUDAIfNotRocm def test_arange_bfloat16(self, device): ref_tensor = torch.tensor([0, 1, 2, 3], dtype=torch.bfloat16, device=device) bfloat16_tensor = torch.arange(0, 4, dtype=torch.bfloat16, device=device) diff --git a/test/test_torch.py b/test/test_torch.py index 2d181c3b9400..ad88128617c9 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -6316,10 +6316,6 @@ def test_copy_broadcast(self, device) -> None: torch.uint8 ] -# _types2 adds bfloat16 type to _types only on ROCm. Should eventually be unified -# with _types when bfloat16 bringup is complete on all platforms. 
-_types2 = _types + [torch.bfloat16] if TEST_WITH_ROCM else _types - _float_types = [torch.half, torch.float, torch.double] _complex_types = [torch.cfloat, torch.cdouble] @@ -6601,10 +6597,14 @@ def inner(self, device, dtype): ('dot', '', _medium_1d, lambda t, d: [_medium_1d(t, d)], 1e-2, 1e-5, 1e-5, _float_types + _complex_types, _cpu_types, False), ('element_size', '', _medium_1d, lambda t, d: [], 1e-5, 1e-5, 1e-5, _float_types_no_half, _cpu_types, False), - ('eq', '', _small_3d_ones, lambda t, d: [_small_3d(t, d)], 1e-5, 1e-5, 1e-5, _types2), - ('eq', 'equal', _small_3d_ones, lambda t, d: [_small_3d_ones(t, d)], 1e-5, 1e-5, 1e-5, _types2), - ('ne', '', _small_3d_ones, lambda t, d: [_small_3d(t, d)], 1e-5, 1e-5, 1e-5, _types2), - ('ne', 'equal', _small_3d_ones, lambda t, d: [_small_3d_ones(t, d)], 1e-5, 1e-5, 1e-5, _types2), + ('eq', '', _small_3d_ones, lambda t, d: [_small_3d(t, d)], 1e-5, 1e-5, 1e-5, + torch.testing.get_all_dtypes(include_complex=False, include_bool=False)), + ('eq', 'equal', _small_3d_ones, lambda t, d: [_small_3d_ones(t, d)], 1e-5, 1e-5, 1e-5, + torch.testing.get_all_dtypes(include_complex=False, include_bool=False)), + ('ne', '', _small_3d_ones, lambda t, d: [_small_3d(t, d)], 1e-5, 1e-5, 1e-5, + torch.testing.get_all_dtypes(include_complex=False, include_bool=False)), + ('ne', 'equal', _small_3d_ones, lambda t, d: [_small_3d_ones(t, d)], 1e-5, 1e-5, 1e-5, + torch.testing.get_all_dtypes(include_complex=False, include_bool=False)), ('equal', 'equal', _small_3d_ones, lambda t, d: [_small_3d_ones(t, d)], 1e-5, 1e-5, 1e-5, _types, _cpu_types, False), ('equal', '', _small_3d_ones, lambda t, d: [_small_3d(t, d)], 1e-5, 1e-5, 1e-5, _types, _cpu_types, False), @@ -6618,10 +6618,14 @@ def inner(self, device, dtype): ('lcm', '', _small_3d, lambda t, d: [_small_3d(t, d)], 0, 0, 0, [torch.int16, torch.int32, torch.int64], [torch.int16, torch.int32, torch.int64], True, [onlyOnCPUAndCUDA]), - ('ge', '', _medium_2d, lambda t, d: [_medium_2d(t, d)], 1e-5, 1e-5, 1e-5, _types2), - ('le', '', _medium_2d, lambda t, d: [_medium_2d(t, d)], 1e-5, 1e-5, 1e-5, _types2), - ('gt', '', _medium_2d, lambda t, d: [_medium_2d(t, d)], 1e-5, 1e-5, 1e-5, _types2), - ('lt', '', _medium_2d, lambda t, d: [_medium_2d(t, d)], 1e-5, 1e-5, 1e-5, _types2), + ('ge', '', _medium_2d, lambda t, d: [_medium_2d(t, d)], 1e-5, 1e-5, 1e-5, + torch.testing.get_all_dtypes(include_complex=False, include_bool=False)), + ('le', '', _medium_2d, lambda t, d: [_medium_2d(t, d)], 1e-5, 1e-5, 1e-5, + torch.testing.get_all_dtypes(include_complex=False, include_bool=False)), + ('gt', '', _medium_2d, lambda t, d: [_medium_2d(t, d)], 1e-5, 1e-5, 1e-5, + torch.testing.get_all_dtypes(include_complex=False, include_bool=False)), + ('lt', '', _medium_2d, lambda t, d: [_medium_2d(t, d)], 1e-5, 1e-5, 1e-5, + torch.testing.get_all_dtypes(include_complex=False, include_bool=False)), ('is_contiguous', '', _medium_2d, lambda t, d: [], 1e-5, 1e-5, 1e-5, _types, _cpu_types, False), # TODO: can't check negative case - cross-device copy is contiguous ('is_same_size', 'negative', _medium_2d, lambda t, d: [_small_3d(t, d)], @@ -6705,12 +6709,16 @@ def inner(self, device, dtype): torch.LongTensor([[1], [2]]).to(dtype=_convert_t(t, d), device=d), True], 1e-5, 1e-5, 1e-5, _types, _cpu_types, False), - ('prod', '', lambda t, d: _small_2d(t, d, oneish=True), - lambda t, d: [], 1e-2, 1e-1, 1e-5, _types2, _cpu_types, False), - ('prod', 'dim', _small_3d, lambda t, d: [1], 1e-3, 1e-1, 1e-5, _types2, _cpu_types, False), - ('prod', 'neg_dim', _small_3d, 
lambda t, d: [-1], 1e-3, 1e-1, 1e-5, _types2, _cpu_types, False), - ('sum', '', _small_2d, lambda t, d: [], 1e-2, 1e-2, 1e-5, _types2, _cpu_types, False), - ('sum', 'dim', _small_3d, lambda t, d: [1], 1e-2, 1e-2, 1e-5, _types2, _cpu_types, False), + ('prod', '', lambda t, d: _small_2d(t, d, oneish=True), lambda t, d: [], 1e-2, 1e-1, 1e-5, + torch.testing.get_all_dtypes(include_complex=False, include_bool=False), _cpu_types, False), + ('prod', 'dim', _small_3d, lambda t, d: [1], 1e-3, 1e-1, 1e-5, + torch.testing.get_all_dtypes(include_complex=False, include_bool=False), _cpu_types, False), + ('prod', 'neg_dim', _small_3d, lambda t, d: [-1], 1e-3, 1e-1, 1e-5, + torch.testing.get_all_dtypes(include_complex=False, include_bool=False), _cpu_types, False), + ('sum', '', _small_2d, lambda t, d: [], 1e-2, 1e-2, 1e-5, + torch.testing.get_all_dtypes(include_complex=False, include_bool=False), _cpu_types, False), + ('sum', 'dim', _small_3d, lambda t, d: [1], 1e-2, 1e-2, 1e-5, + torch.testing.get_all_dtypes(include_complex=False, include_bool=False), _cpu_types, False), ('sum', 'neg_dim', _small_3d, lambda t, d: [-1], 1e-2, 1e-5, 1e-5, _types, _cpu_types, False), ('sum', 'complex', _small_2d, lambda t, d: [], 1e-2, 1e-2, 1e-5, _complex_types, _cpu_types, False), ('sum', 'complex_dim', _small_3d, lambda t, d: [1], 1e-2, 1e-2, 1e-5, _complex_types, _cpu_types, False), From adbb74ded9a2a15bff7c9cbca93cbbf930341354 Mon Sep 17 00:00:00 2001 From: Zachary DeVito Date: Mon, 7 Dec 2020 17:10:12 -0800 Subject: [PATCH 020/250] [package] pre-emptively install submodules (#48799) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48799 Python's IMPORT_FROM bytecode will bypass the import infrastructure when a packaging being loaded as part of a cirular dependency is being accessed from the module _before_ that package has finished loading and is installed on the module. Since we cannot override the lookup on sys.modules, this PR pre-emptively does the module assignment before running the submodules initialization code. Note: this appears to work, but it is not clear to me why python doesn't do this by default. It is possible that the logic for creating modules is flexible enough in generic python that this interception between creating the module and running its code is not always possible. 
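As an illustration only, here is a minimal sketch of the idea; the helper below is hypothetical and stands in for the real PackageImporter, with `modules` playing the role of the importer's own module table rather than sys.modules:

    import types

    def make_module(modules, name, source):
        # Create and register the module first...
        module = types.ModuleType(name)
        modules[name] = module
        # ...then attach it to its parent *before* executing its body, so that a
        # circular `from parent import child` (IMPORT_FROM, i.e. getattr on the
        # parent module) already sees the attribute instead of falling back to a
        # sys.modules lookup that this importer cannot intercept.
        parent, _, child = name.rpartition('.')
        if parent and parent in modules:
            setattr(modules[parent], child, module)
        exec(compile(source, '<' + name + '>', 'exec'), module.__dict__)
        return module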
Test Plan: Imported from OSS Reviewed By: suo Differential Revision: D25312467 Pulled By: zdevito fbshipit-source-id: 6fe3132af29364ccb2b3cabdd2b847d0a09eb515 --- torch/package/importer.py | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/torch/package/importer.py b/torch/package/importer.py index 455119d18a1e..ffd474733021 100644 --- a/torch/package/importer.py +++ b/torch/package/importer.py @@ -140,7 +140,7 @@ def load_pickle(self, package: str, resource: str, map_location=None) -> Any: def _read_extern(self): return self.zip_reader.get_record('extern_modules').decode('utf-8').splitlines(keepends=False) - def _make_module(self, name: str, filename: Optional[str], is_package: bool): + def _make_module(self, name: str, filename: Optional[str], is_package: bool, parent: str): spec = importlib.machinery.ModuleSpec(name, self, is_package=is_package) # type: ignore module = importlib.util.module_from_spec(spec) self.modules[name] = module @@ -150,12 +150,18 @@ def _make_module(self, name: str, filename: Optional[str], is_package: bool): ns['__file__'] = filename ns['__cached__'] = None ns['__builtins__'] = self.patched_builtins + + # pre-emptively install on the parent to prevent IMPORT_FROM from trying to + # access sys.modules + self._install_on_parent(parent, name, module) + if filename is not None: code = self._compile_source(filename) exec(code, ns) + return module - def _load_module(self, name: str): + def _load_module(self, name: str, parent: str): cur : _PathNode = self.root for atom in name.split('.'): if not isinstance(cur, _PackageNode) or atom not in cur.children: @@ -166,7 +172,7 @@ def _load_module(self, name: str): if isinstance(cur, _ExternNode): module = self.modules[name] = importlib.import_module(name) return module - return self._make_module(name, cur.source_file, isinstance(cur, _PackageNode)) # type: ignore + return self._make_module(name, cur.source_file, isinstance(cur, _PackageNode), parent) # type: ignore def _compile_source(self, fullpath): source = self.zip_reader.get_record(fullpath) @@ -179,6 +185,14 @@ def get_source(self, module_name) -> str: module = self.import_module(module_name) return self.zip_reader.get_record(module.__file__).decode('utf-8') + def _install_on_parent(self, parent: str, name: str, module: types.ModuleType): + if not parent: + return + # Set the module as an attribute on its parent. + parent_module = self.modules[parent] + if parent_module.__loader__ is self: # type: ignore + setattr(parent_module, name.rpartition('.')[2], module) + # note: copied from cpython's import code, with call to create module replaced with _make_module def _do_find_and_load(self, name): path = None @@ -196,13 +210,10 @@ def _do_find_and_load(self, name): msg = (_ERR_MSG + '; {!r} is not a package').format(name, parent) raise ModuleNotFoundError(msg, name=name) from None - module = self._load_module(name) + module = self._load_module(name, parent) + + self._install_on_parent(parent, name, module) - if parent: - # Set the module as an attribute on its parent. 
- parent_module = self.modules[parent] - if parent_module.__loader__ is self: # type: ignore - setattr(parent_module, name.rpartition('.')[2], module) return module # note: copied from cpython's import code From 533c837833dd5cec712e1c32dff5c389ed9465cf Mon Sep 17 00:00:00 2001 From: Peter Bell Date: Mon, 7 Dec 2020 17:16:41 -0800 Subject: [PATCH 021/250] Register OpInfos for torch.fft transforms (#48427) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48427 Test Plan: Imported from OSS Reviewed By: ngimel Differential Revision: D25266218 Pulled By: mruberry fbshipit-source-id: 406e7ed5956bc7445daf8c027c9b4d2c8ff88fa1 --- test/test_jit.py | 2 +- test/test_ops.py | 56 +++-- torch/testing/_internal/common_device_type.py | 2 +- .../_internal/common_methods_invocations.py | 191 ++++++++++++++++-- .../_internal/jit_metaprogramming_utils.py | 15 +- 5 files changed, 216 insertions(+), 50 deletions(-) diff --git a/test/test_jit.py b/test/test_jit.py index c85fcbd19747..65b9c110f64f 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -15733,7 +15733,7 @@ def fn(*inputs, **kwargs): # alias annotation testing if not is_magic_method and test_name not in EXCLUDE_SCRIPT and not exclude_tensor_method(name, test_name): - check_alias_annotation(name, (self_variable,) + args_variable, kwargs_variable) + check_alias_annotation(name, (self_variable,) + args_variable, kwargs_variable, aten_name=name) check(name) inplace_name = name + '_' diff --git a/test/test_ops.py b/test/test_ops.py index 1be90f2555f8..090232360309 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -231,69 +231,63 @@ def test_variant_consistency_jit(self, device, dtype, op): for sample in samples: # Acquires variants to test + func = op.get_op() method = op.get_method() inplace = op.get_inplace() - variants = (v for v in (method, inplace) if v is not None) - - # Adds function variant to variant list - # TODO: inplace tests currently fail - # variants = (v for v in (op, method, inplace) if v is not None) - variants = (v for v in (op, method) if v is not None) + variants = { + 'function': func, 'method': method, + # TODO: inplace tests currently fail + # 'inplace': inplace, + } # Test traced and scripted consistency - for variant in variants: + for func_type, variant in variants.items(): + if variant is None: + continue + # Create accessor for script function variant - if variant is op: - name = op.name - func_type = 'function' - elif variant is method: - name = op.name - func_type = 'method' - else: # variant is inplace - assert variant is inplace - name = op.name + "_" - func_type = 'inplace' + name = op.name + '_' if func_type == 'inplace' else op.name # run with disable_autodiff_subgraph_inlining(True) to test # autodiff support. 
Context manager forces the graph to contain # DifferentiableGraph nodes if they are present with disable_autodiff_subgraph_inlining(): def fn(*inputs, **kwargs): - attr = getattr(inputs[0], name) - output = attr(*inputs[1:], **kwargs) + output = func(*inputs, **kwargs) return op.output_func(output) # bfloat16 grad doesn't work for some operators dtypes_to_grad_check = floating_and_complex_types_and(torch.half) \ - if op.skip_bfloat16_grad else floating_and_complex_types_and(torch.half, torch.bfloat16) + if op.skip_bfloat16_grad else floating_and_complex_types_and(torch.half, torch.bfloat16) # Check scripted forward, grad, and grad grad script_fn = create_script_fn(self, name, func_type, op.output_func) - check_against_reference(self, + check_against_reference(self, script_fn, - fn, - (*sample.input,) + sample.args, - sample.kwargs, + fn, + (*sample.input,) + sample.args, + sample.kwargs, no_grad=(dtype not in dtypes_to_grad_check)) # Check traced forward, grad, and grad grad traced_fn = create_traced_fn(self, variant) - check_against_reference(self, + check_against_reference(self, traced_fn, - fn, - (*sample.input,) + sample.args, - sample.kwargs, + fn, + (*sample.input,) + sample.args, + sample.kwargs, no_grad=(dtype not in dtypes_to_grad_check)) # Check alias annotation schema for correctness (make # sure inputs that aren't supposed to be modified aren't) - # Note: only runs in float32 and int64 because schema isn't affected by dtype, + # Note: only runs in float32 and int64 because schema isn't affected by dtype, # so running it on all dtypes is would be excessive if dtype in [torch.float32, torch.int32]: - check_alias_annotation(name, (*sample.input,) + sample.args, sample.kwargs) + check_alias_annotation(name, (*sample.input,) + sample.args, sample.kwargs, + func_type=func_type, aten_name=op.aten_name) - # Check autodifferentiation of nodes for traced and scripted graphs, only need to check once per sample + # Check autodifferentiation of nodes for traced and scripted graphs, only need to check once per sample if dtype is torch.float32: # Sandcastle doesn't fuse nodes if IS_SANDCASTLE: diff --git a/torch/testing/_internal/common_device_type.py b/torch/testing/_internal/common_device_type.py index 0126b1dd0a93..36f02eff0c0f 100644 --- a/torch/testing/_internal/common_device_type.py +++ b/torch/testing/_internal/common_device_type.py @@ -171,7 +171,7 @@ def _construct_test_name(test_name, op, device_type, dtype): if op is not None: - test_name += "_" + op.name + test_name += "_" + op.name.replace('.', '_') test_name += "_" + device_type diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 96d1cd03557e..26be9c9fde3a 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -10,11 +10,11 @@ from typing import List, Tuple, Dict, Any from torch.testing import \ - (make_non_contiguous, _dispatch_dtypes, - floating_types, floating_types_and, floating_and_complex_types, - floating_and_complex_types_and, all_types_and_complex_and, all_types_and) + (make_non_contiguous, _dispatch_dtypes, floating_types, floating_types_and, + floating_and_complex_types, floating_and_complex_types_and, + all_types_and_complex_and, all_types_and) from torch.testing._internal.common_device_type import \ - (skipCUDAIfNoMagma, skipCPUIfNoLapack, + (skipCUDAIfNoMagma, skipCPUIfNoLapack, skipCPUIfNoMkl, skipCUDAIfRocm, expectedAlertNondeterministic, precisionOverride) from 
torch.testing._internal.common_utils import \ (prod_single_zero, random_square_matrix_of_rank, @@ -22,7 +22,7 @@ random_symmetric_pd_matrix, make_nonzero_det, random_fullrank_matrix_distinct_singular_value, set_rng_seed, TEST_WITH_ROCM, IS_WINDOWS, IS_MACOS, make_tensor, TEST_SCIPY, - torch_to_numpy_dtype_dict) + torch_to_numpy_dtype_dict, TEST_WITH_SLOW) if TEST_SCIPY: import scipy.special @@ -54,6 +54,23 @@ def __init__(self, input, *, args=tuple(), kwargs=None): self.kwargs = kwargs if kwargs is not None else {} +_NOTHING = object() # Unique value to distinguish default from anything else + + +# Extension of getattr to support qualified names +# e.g. _getattr_qual(torch, 'linalg.norm') -> torch.linalg.norm +def _getattr_qual(obj, name, default=_NOTHING): + try: + for path in name.split('.'): + obj = getattr(obj, path) + return obj + except AttributeError: + if default is not _NOTHING: + return default + else: + raise + + # Classes and methods for the operator database class OpInfo(object): """Operator information and helper functions for acquiring it.""" @@ -84,13 +101,16 @@ def __init__(self, skips=tuple(), # information about which tests to skip decorators=None, # decorators to apply to generated tests promotes_integers_to_float=False, # whether op promotes unary output to float or not - sample_inputs_func=None): # function to generate sample inputs + sample_inputs_func=None, # function to generate sample inputs + aten_name=None, # name of the corresponding aten:: operator + ): # Validates the dtypes are generated from the dispatch-related functions for dtype_list in (dtypes, dtypesIfCPU, dtypesIfCUDA, dtypesIfROCM): assert isinstance(dtype_list, (_dispatch_dtypes, type(None))) self.name = name + self.aten_name = aten_name if aten_name is not None else name self.dtypes = set(dtypes) self.dtypesIfCPU = set(dtypesIfCPU) if dtypesIfCPU is not None else self.dtypes @@ -99,12 +119,10 @@ def __init__(self, self._default_test_dtypes = set(default_test_dtypes) if default_test_dtypes is not None else None # NOTE: if the op is unspecified it is assumed to be under the torch namespace - if op is None: - assert hasattr(torch, self.name), f"Can't find torch.{self.name}" - self.op = op if op else getattr(torch, self.name) - self.method_variant = getattr(torch.Tensor, name) if hasattr(torch.Tensor, name) else None + self.op = op if op else _getattr_qual(torch, self.name) + self.method_variant = getattr(torch.Tensor, name, None) inplace_name = name + "_" - self.inplace_variant = getattr(torch.Tensor, inplace_name) if hasattr(torch.Tensor, name) else None + self.inplace_variant = getattr(torch.Tensor, inplace_name, None) self.skip_bfloat16_grad = skip_bfloat16_grad self.test_inplace_grad = test_inplace_grad @@ -289,8 +307,71 @@ def wrapped_fn(x): return wrapped_fn + +# Metadata class for Fast Fourier Transforms in torch.fft. +class SpectralFuncInfo(OpInfo): + """Operator information for torch.fft transforms. 
""" + + def __init__(self, + name, # the string name of the function + *, + ref=None, # Reference implementation (probably in np.fft namespace) + dtypes=floating_and_complex_types(), + dtypesIfCPU=None, + dtypesIfCUDA=None, + dtypesIfROCM=None, + ndimensional: bool, # Whether dim argument can be a tuple + skips=None, + decorators=None, + **kwargs): + dtypesIfCPU = dtypesIfCPU if dtypesIfCPU is not None else dtypes + dtypesIfCUDA = dtypesIfCUDA if dtypesIfCUDA is not None else dtypes + dtypesIfROCM = dtypesIfROCM if dtypesIfROCM is not None else dtypes + + # gradgrad is quite slow + if not TEST_WITH_SLOW: + skips = skips if skips is not None else [] + skips.append(SkipInfo('TestGradients', 'test_fn_gradgrad')) + + decorators = decorators if decorators is not None else [] + decorators += [skipCPUIfNoMkl, skipCUDAIfRocm] + + super().__init__(name=name, + dtypes=dtypes, + dtypesIfCPU=dtypesIfCPU, + dtypesIfCUDA=dtypesIfCUDA, + dtypesIfROCM=dtypesIfROCM, + skips=skips, + decorators=decorators, + **kwargs) + self.ref = ref if ref is not None else _getattr_qual(np, name) + self.ndimensional = ndimensional + + + def sample_inputs(self, device, dtype, requires_grad=False): + tensor = make_tensor((L, M), device, dtype, + low=None, high=None, + requires_grad=requires_grad) + if self.ndimensional: + return [ + SampleInput(tensor), + SampleInput(tensor, kwargs=dict(dim=(-2,))), + SampleInput(tensor, kwargs=dict(norm='ortho')), + SampleInput(tensor, kwargs=dict(s=(10, 15))), + SampleInput(tensor, kwargs=dict(s=10, dim=1, norm='ortho')), + ] + else: + return [ + SampleInput(tensor), + SampleInput(tensor, kwargs=dict(dim=-2)), + SampleInput(tensor, kwargs=dict(norm='ortho')), + SampleInput(tensor, kwargs=dict(n=15)), + SampleInput(tensor, kwargs=dict(n=10, dim=1, norm='ortho')), + ] + + # Operator database (sorted alphabetically) -op_db: List[Any] = [ +op_db: List[OpInfo] = [ # NOTE: CPU complex acos produces incorrect outputs (https://github.com/pytorch/pytorch/issues/42952) UnaryUfuncInfo('acos', ref=np.arccos, @@ -448,6 +529,89 @@ def wrapped_fn(x): SkipInfo('TestCommon', 'test_variant_consistency_jit', device_type='cuda', dtypes=[torch.float16]), )), + SpectralFuncInfo('fft.fft', + aten_name='fft_fft', + ref=np.fft.fft, + ndimensional=False, + dtypes=all_types_and_complex_and(torch.bool), + default_test_dtypes=floating_and_complex_types(), + supports_tensor_out=False, + test_inplace_grad=False,), + SpectralFuncInfo('fft.fftn', + aten_name='fft_fftn', + ref=np.fft.fftn, + ndimensional=True, + dtypes=all_types_and_complex_and(torch.bool), + default_test_dtypes=floating_and_complex_types(), + supports_tensor_out=False, + test_inplace_grad=False, + decorators=[precisionOverride( + {torch.float: 1e-4, torch.cfloat: 1e-4})],), + SpectralFuncInfo('fft.hfft', + aten_name='fft_hfft', + ref=np.fft.hfft, + ndimensional=False, + dtypes=all_types_and_complex_and(torch.bool), + default_test_dtypes=floating_and_complex_types(), + supports_tensor_out=False, + test_inplace_grad=False,), + SpectralFuncInfo('fft.rfft', + aten_name='fft_rfft', + ref=np.fft.rfft, + ndimensional=False, + dtypes=all_types_and(torch.bool), + default_test_dtypes=floating_and_complex_types(), + supports_tensor_out=False, + test_inplace_grad=False,), + SpectralFuncInfo('fft.rfftn', + aten_name='fft_rfftn', + ref=np.fft.rfftn, + ndimensional=True, + dtypes=all_types_and(torch.bool), + default_test_dtypes=floating_and_complex_types(), + supports_tensor_out=False, + test_inplace_grad=False, + decorators=[precisionOverride({torch.float: 1e-4})],), 
+ SpectralFuncInfo('fft.ifft', + aten_name='fft_ifft', + ref=np.fft.ifft, + ndimensional=False, + dtypes=all_types_and_complex_and(torch.bool), + default_test_dtypes=floating_and_complex_types(), + supports_tensor_out=False, + test_inplace_grad=False,), + SpectralFuncInfo('fft.ifftn', + aten_name='fft_ifftn', + ref=np.fft.ifftn, + ndimensional=True, + dtypes=all_types_and_complex_and(torch.bool), + default_test_dtypes=floating_and_complex_types(), + supports_tensor_out=False, + test_inplace_grad=False,), + SpectralFuncInfo('fft.ihfft', + aten_name='fft_ihfft', + ref=np.fft.ihfft, + ndimensional=False, + dtypes=all_types_and(torch.bool), + default_test_dtypes=floating_types(), + supports_tensor_out=False, + test_inplace_grad=False,), + SpectralFuncInfo('fft.irfft', + aten_name='fft_irfft', + ref=np.fft.irfft, + ndimensional=False, + dtypes=all_types_and_complex_and(torch.bool), + default_test_dtypes=floating_and_complex_types(), + supports_tensor_out=False, + test_inplace_grad=False,), + SpectralFuncInfo('fft.irfftn', + aten_name='fft_irfftn', + ref=np.fft.irfftn, + ndimensional=True, + dtypes=all_types_and_complex_and(torch.bool), + default_test_dtypes=floating_and_complex_types(), + supports_tensor_out=False, + test_inplace_grad=False,), UnaryUfuncInfo('log', ref=np.log, domain=(0, float('inf')), @@ -644,7 +808,7 @@ def reference_sigmoid(x): return (1 / (1 + np.exp(-x))) return scipy.special.expit(x) - op_db_scipy_reference = [ + op_db_scipy_reference: List[OpInfo] = [ UnaryUfuncInfo('sigmoid', ref=reference_sigmoid, decorators=(precisionOverride({torch.float16: 1e-2, @@ -695,6 +859,7 @@ def reference_sigmoid(x): # Common operator groupings unary_ufuncs = [op for op in op_db if isinstance(op, UnaryUfuncInfo)] +spectral_funcs = [op for op in op_db if isinstance(op, SpectralFuncInfo)] def index_variable(shape, max_indices): if not isinstance(shape, tuple): diff --git a/torch/testing/_internal/jit_metaprogramming_utils.py b/torch/testing/_internal/jit_metaprogramming_utils.py index 8036c73a6330..4a91394d53c5 100644 --- a/torch/testing/_internal/jit_metaprogramming_utils.py +++ b/torch/testing/_internal/jit_metaprogramming_utils.py @@ -229,8 +229,15 @@ def the_method({}): return {} ''' +def value_to_literal(value): + if isinstance(value, str): + # Quotes string and escapes special characters + return ascii(value) + else: + return str(value) + def get_call(method_name, func_type, args, kwargs): - kwargs_str = ', '.join([k + '=' + str(v) for k, v in kwargs.items()]) + kwargs_str = ', '.join([k + '=' + value_to_literal(v) for k, v in kwargs.items()]) self_arg = args[0] if(func_type == 'method'): args = args[1:] @@ -461,12 +468,12 @@ def make_module(script): return module return script_module -def check_alias_annotation(method_name, args, kwargs): +def check_alias_annotation(method_name, args, kwargs, *, aten_name, func_type='method'): formals, tensors, actuals = get_script_args(args) - call = get_call(method_name, 'method', actuals, kwargs) + call = get_call(method_name, func_type, actuals, kwargs) script = script_template.format(', '.join(formals), call) CU = torch.jit.CompilationUnit(script) - torch._C._jit_check_alias_annotation(CU.the_method.graph, tuple(tensors), method_name) + torch._C._jit_check_alias_annotation(CU.the_method.graph, tuple(tensors), aten_name) def get_nn_module_name_from_kwargs(**kwargs): if 'module_name' in kwargs: From c876d4f477fdebfa8acbc3ebd8042ea8f5ed36dc Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Mon, 7 Dec 2020 17:37:59 -0800 Subject: [PATCH 022/250] [Gradient 
Compression] Let the dtype of created low-rank tensors P and Q be the same type as the input tensor (#48902) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48902 Previously if the dtype of input gradients is FP16, matrix multiplications will fail, because the created low-rank tensors P and Q use FP32 dtype. Now let the dtype of P and Q be the same as the input tensor. Original PR issue: Investigate Applying PowerSGD to Communication Hook for Gradient Compression #47202 ghstack-source-id: 117962078 Test Plan: buck test mode/dev-nosan caffe2/test/distributed:c10d -- test_powerSGD_ddp_comm_hook_nccl Reviewed By: rohan-varma Differential Revision: D25362071 fbshipit-source-id: e68753ff23bb480605b02891e128202ed0f8a587 --- .../algorithms/ddp_comm_hooks/powerSGD_hook.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py b/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py index e1d475a34425..81b876685a3c 100644 --- a/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py +++ b/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py @@ -162,11 +162,17 @@ def create_low_rank_tensor(fill_random_values, rng): # only fork on CPU and then move the generated tensor to the CUDA device. torch.manual_seed(rng.randint(1_000_000_000)) return torch.randn( - square_side_length, state.matrix_approximation_rank, device="cpu" + square_side_length, + state.matrix_approximation_rank, + device="cpu", + dtype=input_tensor.dtype, ).to(device) else: return torch.empty( - square_side_length, state.matrix_approximation_rank, device=device + square_side_length, + state.matrix_approximation_rank, + device=device, + dtype=input_tensor.dtype, ) p = create_low_rank_tensor(fill_random_values=False, rng=state.rng) From bea88ee1d0179e9cc3c29d105cc009e2027ee0d7 Mon Sep 17 00:00:00 2001 From: Ivan Yashchuk Date: Mon, 7 Dec 2020 18:59:38 -0800 Subject: [PATCH 023/250] Added entry for torch.linalg.cond to linalg.rst (#48941) Summary: This PR makes documentation for `cond` available at https://pytorch.org/docs/master/linalg.html I forgot to include this change in https://github.com/pytorch/pytorch/issues/45832. Pull Request resolved: https://github.com/pytorch/pytorch/pull/48941 Reviewed By: ngimel Differential Revision: D25379244 Pulled By: mruberry fbshipit-source-id: c8c0a0b8a05c17025d6c3cea405b2add369e2019 --- docs/source/linalg.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/linalg.rst b/docs/source/linalg.rst index b5d78572c06b..a3bee886f062 100644 --- a/docs/source/linalg.rst +++ b/docs/source/linalg.rst @@ -13,6 +13,7 @@ Functions --------- .. autofunction:: cholesky +.. autofunction:: cond .. autofunction:: det .. autofunction:: eigh .. 
autofunction:: eigvalsh From 3aeb9cc85ddbed7516e34fa71475995af5b31812 Mon Sep 17 00:00:00 2001 From: Bharat123rox Date: Mon, 7 Dec 2020 19:33:47 -0800 Subject: [PATCH 024/250] [DOCS]Correct docs for torch.lu_solve (#47762) Summary: Fixes https://github.com/pytorch/pytorch/issues/43498 by correcting the function signature of `torch.lu_solve` Pull Request resolved: https://github.com/pytorch/pytorch/pull/47762 Reviewed By: ljk53 Differential Revision: D24900259 Pulled By: ailzhang fbshipit-source-id: 2a43170bde57e03d44025b23e3abcda169cfc9e2 --- torch/_torch_docs.py | 2 +- torch/overrides.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index 51200dc6b406..d9f7e8018264 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -4638,7 +4638,7 @@ def merge_dicts(*dicts): add_docstr(torch.lu_solve, r""" -lu_solve(input, LU_data, LU_pivots, *, out=None) -> Tensor +lu_solve(b, LU_data, LU_pivots, *, out=None) -> Tensor Returns the LU solve of the linear system :math:`Ax = b` using the partially pivoted LU factorization of A from :meth:`torch.lu`. diff --git a/torch/overrides.py b/torch/overrides.py index e8a3933a1954..2af6e36ea914 100644 --- a/torch/overrides.py +++ b/torch/overrides.py @@ -505,7 +505,7 @@ def get_testing_overrides() -> Dict[Callable, Callable]: torch.lt: lambda input, other, out=None: -1, torch.less: lambda input, other, out=None: -1, torch.lu: lambda A, pivot=True, get_infos=False, out=None: -1, - torch.lu_solve: lambda input, LU_data, LU_pivots, out=None: -1, + torch.lu_solve: lambda b, LU_data, LU_pivots, out=None: -1, torch.margin_ranking_loss: lambda input1, input2, target, margin=0, size_average=None, reduce=None, reduction='mean': -1, torch.masked_fill: lambda input, mask, value: -1, torch.masked_scatter: lambda input, mask, source: -1, From 5533be5170c37561c486ade50c1697a2be50bbe0 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Mon, 7 Dec 2020 19:47:27 -0800 Subject: [PATCH 025/250] CUDA BF16 backwards (#48809) Summary: Looks like there's no test? Pull Request resolved: https://github.com/pytorch/pytorch/pull/48809 Reviewed By: mruberry Differential Revision: D25378998 Pulled By: ngimel fbshipit-source-id: d16789892902b5a20828e8c7b414b478de33c4a5 --- .../cuda/BinaryMiscBackwardOpsKernels.cu | 60 +++++++++---------- 1 file changed, 27 insertions(+), 33 deletions(-) diff --git a/aten/src/ATen/native/cuda/BinaryMiscBackwardOpsKernels.cu b/aten/src/ATen/native/cuda/BinaryMiscBackwardOpsKernels.cu index ed7e2190f75e..a385aa721522 100644 --- a/aten/src/ATen/native/cuda/BinaryMiscBackwardOpsKernels.cu +++ b/aten/src/ATen/native/cuda/BinaryMiscBackwardOpsKernels.cu @@ -16,10 +16,8 @@ namespace native { void sigmoid_backward_kernel_cuda(TensorIterator& iter) { AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "sigmoid_backward_cuda", [&]() { - AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "sigmoid_backward_cuda", [&] { - gpu_kernel(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { - return a * (scalar_t(1.) - b) * b; - }); + gpu_kernel(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { + return a * (scalar_t(1.) 
- b) * b; }); }); } @@ -31,31 +29,29 @@ void logit_backward_kernel_cuda(TensorIterator& iter, Scalar eps_scalar) { iter.dtype(), "logit_cuda", [&]() { - AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "logit_cuda", [&] { - using T_ACC = acc_type; - const T_ACC eps = eps_scalar.to(); - if (eps < T_ACC(0)) { - gpu_kernel( - iter, [] GPU_LAMBDA(scalar_t dy, scalar_t x) -> scalar_t { - const T_ACC dy_acc = static_cast(dy); - const T_ACC x_acc = static_cast(x); - return (x_acc < T_ACC(0) || x_acc > T_ACC(1)) - ? std::numeric_limits::quiet_NaN() - : dy_acc / (x_acc * (T_ACC(1) - x_acc)); - }); - } else { - const T_ACC lo = eps; - const T_ACC hi = T_ACC(1) - eps; - gpu_kernel( - iter, [lo, hi] GPU_LAMBDA(scalar_t dy, scalar_t x) -> scalar_t { - const T_ACC dy_acc = static_cast(dy); - const T_ACC x_acc = static_cast(x); - return (x_acc < lo || x_acc > hi) - ? T_ACC(0) - : dy_acc / (x_acc * (T_ACC(1) - x_acc)); - }); - } - }); + using T_ACC = acc_type; + const T_ACC eps = eps_scalar.to(); + if (eps < T_ACC(0)) { + gpu_kernel( + iter, [] GPU_LAMBDA(scalar_t dy, scalar_t x) -> scalar_t { + const T_ACC dy_acc = static_cast(dy); + const T_ACC x_acc = static_cast(x); + return (x_acc < T_ACC(0) || x_acc > T_ACC(1)) + ? std::numeric_limits::quiet_NaN() + : dy_acc / (x_acc * (T_ACC(1) - x_acc)); + }); + } else { + const T_ACC lo = eps; + const T_ACC hi = T_ACC(1) - eps; + gpu_kernel( + iter, [lo, hi] GPU_LAMBDA(scalar_t dy, scalar_t x) -> scalar_t { + const T_ACC dy_acc = static_cast(dy); + const T_ACC x_acc = static_cast(x); + return (x_acc < lo || x_acc > hi) + ? T_ACC(0) + : dy_acc / (x_acc * (T_ACC(1) - x_acc)); + }); + } }); } @@ -68,10 +64,8 @@ void tanh_backward_kernel_cuda(TensorIterator& iter) { }); } else { AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "tanh_backward_cuda", [&]() { - AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "tanh_backward_cuda", [&] { - gpu_kernel(iter, [] GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { - return a * (scalar_t{1.} - b * b); - }); + gpu_kernel(iter, [] GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { + return a * (scalar_t{1.} - b * b); }); }); } From 881e9583b22891147560004b5c4ff594f7319291 Mon Sep 17 00:00:00 2001 From: Eli Uriegas Date: Mon, 7 Dec 2020 20:14:02 -0800 Subject: [PATCH 026/250] docker: Add make variable to add docker build args (#48942) Summary: Adds an extra make variable 'EXTRA_DOCKER_BUILD_FLAGS' that allows us to add extra docker build flags to the docker build command. 
Example: make -f docker.Makefile EXTRA_DOCKER_BUILD_FLAGS=--no-cache devel-image Signed-off-by: Eli Uriegas Pull Request resolved: https://github.com/pytorch/pytorch/pull/48942 Reviewed By: walterddr Differential Revision: D25376288 Pulled By: seemethere fbshipit-source-id: 9cf2c2a5e01d505fa54447604ecd653dcbdd42e1 --- docker.Makefile | 45 ++++++++++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 19 deletions(-) diff --git a/docker.Makefile b/docker.Makefile index 3cd59f146e38..3af77ab9c7d1 100644 --- a/docker.Makefile +++ b/docker.Makefile @@ -1,31 +1,38 @@ -DOCKER_REGISTRY = docker.io -DOCKER_ORG = $(shell docker info 2>/dev/null | sed '/Username:/!d;s/.* //') -DOCKER_IMAGE = pytorch -DOCKER_FULL_NAME = $(DOCKER_REGISTRY)/$(DOCKER_ORG)/$(DOCKER_IMAGE) +DOCKER_REGISTRY = docker.io +DOCKER_ORG = $(shell docker info 2>/dev/null | sed '/Username:/!d;s/.* //') +DOCKER_IMAGE = pytorch +DOCKER_FULL_NAME = $(DOCKER_REGISTRY)/$(DOCKER_ORG)/$(DOCKER_IMAGE) ifeq ("$(DOCKER_ORG)","") $(warning WARNING: No docker user found using results from whoami) -DOCKER_ORG = $(shell whoami) +DOCKER_ORG = $(shell whoami) endif -CUDA_VERSION = 11.0 -CUDNN_VERSION = 8 -BASE_RUNTIME = ubuntu:18.04 -BASE_DEVEL = nvidia/cuda:$(CUDA_VERSION)-cudnn$(CUDNN_VERSION)-devel-ubuntu18.04 +CUDA_VERSION = 11.0 +CUDNN_VERSION = 8 +BASE_RUNTIME = ubuntu:18.04 +BASE_DEVEL = nvidia/cuda:$(CUDA_VERSION)-cudnn$(CUDNN_VERSION)-devel-ubuntu18.04 # The conda channel to use to install pytorch / torchvision -INSTALL_CHANNEL = pytorch +INSTALL_CHANNEL = pytorch -PYTHON_VERSION = 3.7 +PYTHON_VERSION = 3.7 # Can be either official / dev -BUILD_TYPE = dev -BUILD_PROGRESS = auto -BUILD_ARGS = --build-arg BASE_IMAGE=$(BASE_IMAGE) \ - --build-arg PYTHON_VERSION=$(PYTHON_VERSION) \ - --build-arg CUDA_VERSION=$(CUDA_VERSION) \ - --build-arg INSTALL_CHANNEL=$(INSTALL_CHANNEL) -DOCKER_BUILD = DOCKER_BUILDKIT=1 docker build --progress=$(BUILD_PROGRESS) --target $(BUILD_TYPE) -t $(DOCKER_FULL_NAME):$(DOCKER_TAG) $(BUILD_ARGS) . -DOCKER_PUSH = docker push $(DOCKER_FULL_NAME):$(DOCKER_TAG) +BUILD_TYPE = dev +BUILD_PROGRESS = auto +BUILD_ARGS = --build-arg BASE_IMAGE=$(BASE_IMAGE) \ + --build-arg PYTHON_VERSION=$(PYTHON_VERSION) \ + --build-arg CUDA_VERSION=$(CUDA_VERSION) \ + --build-arg INSTALL_CHANNEL=$(INSTALL_CHANNEL) +EXTRA_DOCKER_BUILD_FLAGS ?= +DOCKER_BUILD = DOCKER_BUILDKIT=1 \ + docker build \ + --progress=$(BUILD_PROGRESS) \ + $(EXTRA_DOCKER_BUILD_FLAGS) \ + --target $(BUILD_TYPE) \ + -t $(DOCKER_FULL_NAME):$(DOCKER_TAG) \ + $(BUILD_ARGS) . 
+DOCKER_PUSH = docker push $(DOCKER_FULL_NAME):$(DOCKER_TAG) .PHONY: all all: devel-image From c3a90bedd4312ee8a9ec673ee4fbe0ffca7fa28b Mon Sep 17 00:00:00 2001 From: Jiatong Zhou Date: Mon, 7 Dec 2020 21:25:11 -0800 Subject: [PATCH 027/250] Move aten::__contains__.int_list for lite jit (#48950) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48950 Needed by noise suppression model Test Plan: build Reviewed By: linbinyu Differential Revision: D25321582 fbshipit-source-id: fbc67fc35087c5f44b7ab68d1485b2b916747723 --- torch/csrc/jit/runtime/register_prim_ops.cpp | 5 +++++ torch/csrc/jit/runtime/register_prim_ops_fulljit.cpp | 4 ---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/torch/csrc/jit/runtime/register_prim_ops.cpp b/torch/csrc/jit/runtime/register_prim_ops.cpp index f031d957449b..d9bffa7e4644 100644 --- a/torch/csrc/jit/runtime/register_prim_ops.cpp +++ b/torch/csrc/jit/runtime/register_prim_ops.cpp @@ -822,6 +822,11 @@ RegisterOperators reg( return 0; }, aliasAnalysisFromSchema()), + OperatorGenerator( + TORCH_SELECTIVE_SCHEMA( + "aten::__contains__.int_list(int[] l, int item) -> bool"), + listContains, + aliasAnalysisFromSchema()), OperatorGenerator( TORCH_SELECTIVE_SCHEMA( "aten::__contains__.str_list(str[] l, str item) -> bool"), diff --git a/torch/csrc/jit/runtime/register_prim_ops_fulljit.cpp b/torch/csrc/jit/runtime/register_prim_ops_fulljit.cpp index 0be346246656..b63a2a228508 100644 --- a/torch/csrc/jit/runtime/register_prim_ops_fulljit.cpp +++ b/torch/csrc/jit/runtime/register_prim_ops_fulljit.cpp @@ -711,10 +711,6 @@ RegisterOperators reg2({ // `listContains` is not implemented for non-primitive types // TODO: Add List[bool] once .to> doesn't throw an error - Operator( - "aten::__contains__.int_list(int[] l, int item) -> bool", - listContains, - aliasAnalysisFromSchema()), Operator( "aten::__contains__.float_list(float[] l, float item) -> bool", listContains, From cb6233aa538114fce55380a79978f3e576eb7cfe Mon Sep 17 00:00:00 2001 From: Richard Barnes Date: Mon, 7 Dec 2020 22:46:56 -0800 Subject: [PATCH 028/250] Fix some convoluted(?) code (#48893) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48893 This simplifies some code which is written in an interesting way. It may be that this was intentional, but I don't recognize the pattern being used. 
Test Plan: N/A - Sandcastle Reviewed By: igorsugak Differential Revision: D25358283 fbshipit-source-id: 19bcf01cbb117843e08df0237e6a03ea77958078 --- aten/src/THC/THCAsmUtils.cuh | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/aten/src/THC/THCAsmUtils.cuh b/aten/src/THC/THCAsmUtils.cuh index 6375891bd7f2..be0bf6ffa1ba 100644 --- a/aten/src/THC/THCAsmUtils.cuh +++ b/aten/src/THC/THCAsmUtils.cuh @@ -94,15 +94,16 @@ __device__ __forceinline__ int getLaneId() { #if defined(__HIP_PLATFORM_HCC__) __device__ __forceinline__ unsigned long long int getLaneMaskLt() { - std::uint64_t m = (1ull << getLaneId()) - 1ull; + const std::uint64_t m = (1ull << getLaneId()) - 1ull; return m; +} #else __device__ __forceinline__ unsigned getLaneMaskLt() { unsigned mask; asm("mov.u32 %0, %%lanemask_lt;" : "=r"(mask)); return mask; -#endif } +#endif #if defined (__HIP_PLATFORM_HCC__) __device__ __forceinline__ unsigned long long int getLaneMaskLe() { @@ -119,27 +120,28 @@ __device__ __forceinline__ unsigned getLaneMaskLe() { #if defined(__HIP_PLATFORM_HCC__) __device__ __forceinline__ unsigned long long int getLaneMaskGt() { - std::uint64_t m = getLaneMaskLe(); + const std::uint64_t m = getLaneMaskLe(); return m ? ~m : m; +} #else __device__ __forceinline__ unsigned getLaneMaskGt() { unsigned mask; asm("mov.u32 %0, %%lanemask_gt;" : "=r"(mask)); return mask; -#endif } +#endif #if defined(__HIP_PLATFORM_HCC__) __device__ __forceinline__ unsigned long long int getLaneMaskGe() { - std::uint64_t m = getLaneMaskLt(); + const std::uint64_t m = getLaneMaskLt(); return ~m; +} #else __device__ __forceinline__ unsigned getLaneMaskGe() { unsigned mask; asm("mov.u32 %0, %%lanemask_ge;" : "=r"(mask)); return mask; -#endif } - +#endif #endif // THC_ASM_UTILS_INC From 32b098baf936b63ee23017f6bba4f3e4c56f22a6 Mon Sep 17 00:00:00 2001 From: Richard Barnes Date: Tue, 8 Dec 2020 00:35:23 -0800 Subject: [PATCH 029/250] Add and adjust kernel launch checks (#46727) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46727 This adds kernel launch safety checks to a number of kernels. See D24309971 (https://github.com/pytorch/pytorch/commit/353e7f940f548e0a0cb3b420b4190b4624ae9b41) for context. Test Plan: The existing pre-commit test rigs are used. 
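For reference, the shape of the change in each file is roughly the following; the kernel, its launch configuration, and the surrounding function are illustrative, and only the check placed immediately after the launch is the point:

    #include <cstdint>
    #include <ATen/cuda/CUDAContext.h>
    #include <c10/cuda/CUDAException.h>

    __global__ void fill_kernel(float* out, float value, int64_t n) {
      const int64_t i = blockIdx.x * static_cast<int64_t>(blockDim.x) + threadIdx.x;
      if (i < n) {
        out[i] = value;
      }
    }

    void launch_fill(float* out, float value, int64_t n) {
      const dim3 block(256);
      const dim3 grid(static_cast<unsigned int>((n + block.x - 1) / block.x));
      auto stream = at::cuda::getCurrentCUDAStream();
      fill_kernel<<<grid, block, 0, stream>>>(out, value, n);
      // Surfaces bad launch configurations immediately, replacing the older
      // AT_CUDA_CHECK(cudaGetLastError()) calls that sometimes ran much later
      // (or not at all).
      C10_CUDA_KERNEL_LAUNCH_CHECK();
    }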
Reviewed By: ngimel Differential Revision: D24334303 fbshipit-source-id: b6433f6be109fc8dbe789e91f3cbfbc31fd15951 --- aten/src/ATen/native/cuda/AveragePool3d.cu | 19 ++- aten/src/ATen/native/cuda/CUDALoops.cuh | 6 +- .../ATen/native/cuda/DistributionTemplates.h | 5 +- aten/src/ATen/native/cuda/EmbeddingBag.cu | 2 +- .../ATen/native/cuda/FractionalMaxPool2d.cu | 4 +- aten/src/ATen/native/cuda/Indexing.cu | 72 ++++----- .../src/ATen/native/cuda/MultinomialKernel.cu | 5 +- aten/src/ATen/native/cuda/Normalization.cuh | 18 ++- aten/src/ATen/native/cuda/ROCmLoops.cuh | 4 +- aten/src/ATen/native/cuda/RangeFactories.cu | 6 +- aten/src/ATen/native/cuda/Reduce.cuh | 7 +- aten/src/ATen/native/cuda/ReflectionPad.cu | 18 +-- .../ATen/native/cuda/ReplicationPadding.cu | 54 ++++--- aten/src/ATen/native/cuda/ScanKernels.cu | 9 +- .../ATen/native/cuda/ScatterGatherKernel.cu | 10 +- aten/src/ATen/native/cuda/Shape.cu | 8 +- aten/src/ATen/native/cuda/SoftMax.cu | 143 +++++++++--------- aten/src/ATen/native/cuda/Sorting.cu | 6 +- aten/src/ATen/native/cuda/WeightNorm.cu | 8 +- 19 files changed, 205 insertions(+), 199 deletions(-) diff --git a/aten/src/ATen/native/cuda/AveragePool3d.cu b/aten/src/ATen/native/cuda/AveragePool3d.cu index 4214b4dace19..388b04dba76a 100644 --- a/aten/src/ATen/native/cuda/AveragePool3d.cu +++ b/aten/src/ATen/native/cuda/AveragePool3d.cu @@ -317,16 +317,17 @@ __global__ void avg_pool3d_cuda_update_grad_input( } } -#define LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(KW) case KW: \ +#define LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(KW) case KW: \ avg_pool3d_cuda_update_output \ <<>>( \ - work_input.packed_accessor64(), \ - work_output.packed_accessor64(), \ + work_input.packed_accessor64(), \ + work_output.packed_accessor64(), \ kT, kH, \ dT, dH, dW, \ padT, padH, padW, \ count_include_pad, \ offsetZ, divisor); \ + C10_CUDA_KERNEL_LAUNCH_CHECK(); \ break void avg_pool3d_out_cuda_template( @@ -443,11 +444,10 @@ void avg_pool3d_out_cuda_template( padT, padH, padW, count_include_pad, offsetZ, divisor); - break; + C10_CUDA_KERNEL_LAUNCH_CHECK(); + break; } - AT_CUDA_CHECK(cudaGetLastError()); - totalZ -= 65535; offsetZ += 65535; } @@ -581,8 +581,7 @@ void avg_pool3d_backward_out_cuda_template( kT, kH, kW, 1.0f/divide_factor, offsetZ); - - AT_CUDA_CHECK(cudaGetLastError()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); totalZ -= 65535; offsetZ += 65535; @@ -614,6 +613,7 @@ void avg_pool3d_backward_out_cuda_template( padT, padH, padW, count_include_pad, offsetZ, divisor); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { avg_pool3d_cuda_update_grad_input @@ -625,10 +625,9 @@ void avg_pool3d_backward_out_cuda_template( padT, padH, padW, count_include_pad, offsetZ, divisor); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } - AT_CUDA_CHECK(cudaGetLastError()); - totalZ -= 65535; offsetZ += 65535; } diff --git a/aten/src/ATen/native/cuda/CUDALoops.cuh b/aten/src/ATen/native/cuda/CUDALoops.cuh index 91401e994ebd..d11a5bb074c5 100644 --- a/aten/src/ATen/native/cuda/CUDALoops.cuh +++ b/aten/src/ATen/native/cuda/CUDALoops.cuh @@ -101,9 +101,11 @@ static inline void launch_vectorized_kernel(int64_t N, const func_t& f, array_t switch (vec_size) { case 4: vectorized_elementwise_kernel<4, func_t, array_t><<>>(N, f, data); + C10_CUDA_KERNEL_LAUNCH_CHECK(); break; case 2: vectorized_elementwise_kernel<2, func_t, array_t><<>>(N, f, data); + C10_CUDA_KERNEL_LAUNCH_CHECK(); break; case 1: { auto input_calc = TrivialOffsetCalculator(); @@ -111,12 +113,12 @@ static inline void launch_vectorized_kernel(int64_t N, const func_t& f, array_t auto loader = 
memory::LoadWithoutCast(); auto storer = memory::StoreWithoutCast(); unrolled_elementwise_kernel<<>>(N, f, data, input_calc, output_calc, loader, storer); + C10_CUDA_KERNEL_LAUNCH_CHECK(); break; } default: TORCH_INTERNAL_ASSERT(false, "Unexpected vectorization size"); } - AT_CUDA_CHECK(cudaGetLastError()); } template @@ -127,7 +129,7 @@ static inline void launch_unrolled_kernel(int64_t N, const func_t& f, array_t da int64_t grid = (N + block_work_size - 1) / block_work_size; auto stream = at::cuda::getCurrentCUDAStream(); unrolled_elementwise_kernel<<>>(N, f, data, ic, oc, l, s); - AT_CUDA_CHECK(cudaGetLastError()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } template diff --git a/aten/src/ATen/native/cuda/DistributionTemplates.h b/aten/src/ATen/native/cuda/DistributionTemplates.h index 8cfc6c10f1ba..1b4f228bf229 100644 --- a/aten/src/ATen/native/cuda/DistributionTemplates.h +++ b/aten/src/ATen/native/cuda/DistributionTemplates.h @@ -155,6 +155,7 @@ void distribution_nullary_kernel(at::TensorIterator& iter, *out = transform_func(rand); } ); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { auto offset_calc = make_offset_calculator<1>(iter); distribution_elementwise_grid_stride_kernel<<>>( @@ -167,8 +168,8 @@ void distribution_nullary_kernel(at::TensorIterator& iter, *out = transform_func(rand); } ); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } - AT_CUDA_CHECK(cudaGetLastError()); } // Binary kernel @@ -260,10 +261,12 @@ void distribution_binary_kernel(TensorIterator &iter, PhiloxCudaState philox_arg distribution_binary_elementwise_kernel<<>>( numel, f, philox_args, output_data, input_data_1, input_data_2, TrivialOffsetCalculator<2>(), TrivialOffsetCalculator<1>()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { distribution_binary_elementwise_kernel<<>>( numel, f, philox_args, output_data, input_data_1, input_data_2, make_input_offset_calculator<2>(iter), make_output_offset_calculator(iter)); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } } diff --git a/aten/src/ATen/native/cuda/EmbeddingBag.cu b/aten/src/ATen/native/cuda/EmbeddingBag.cu index 5bed5532baee..651261cf6408 100644 --- a/aten/src/ATen/native/cuda/EmbeddingBag.cu +++ b/aten/src/ATen/native/cuda/EmbeddingBag.cu @@ -245,8 +245,8 @@ Tensor embedding_bag_backward_cuda_max(const Tensor &grad, max_indices.data_ptr(), grad.data_ptr(), grad_weight.data_ptr(), stride, numBags); C10_CUDA_KERNEL_LAUNCH_CHECK(); - }); }); + }); return grad_weight; } diff --git a/aten/src/ATen/native/cuda/FractionalMaxPool2d.cu b/aten/src/ATen/native/cuda/FractionalMaxPool2d.cu index bee3cfa4d436..41fc2dea5856 100644 --- a/aten/src/ATen/native/cuda/FractionalMaxPool2d.cu +++ b/aten/src/ATen/native/cuda/FractionalMaxPool2d.cu @@ -273,8 +273,8 @@ void fractional_max_pool2d_backward_out_cuda_template( <<>>( devGradInput, devGradOutput, devIndices); C10_CUDA_KERNEL_LAUNCH_CHECK(); - } - ); + } + ); } }// namespace diff --git a/aten/src/ATen/native/cuda/Indexing.cu b/aten/src/ATen/native/cuda/Indexing.cu index 47527935fe73..4e88ee34a9b4 100644 --- a/aten/src/ATen/native/cuda/Indexing.cu +++ b/aten/src/ATen/native/cuda/Indexing.cu @@ -233,18 +233,18 @@ void index_put_accum_kernel(Tensor & self, TensorList indices, const Tensor & va AT_DISPATCH_ALL_TYPES_AND3(at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16, value_.scalar_type(), "indexing_backward", [&] { AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "indexing_backward", [&] { - indexing_backward_kernel<<>>( - sorted_indices.data_ptr(), - orig_indices.data_ptr(), - value_.data_ptr(), - src_.data_ptr(), - num_indices, - sliceSize, - 
strideBefore, - nElemBefore); - }); + indexing_backward_kernel<<>>( + sorted_indices.data_ptr(), + orig_indices.data_ptr(), + value_.data_ptr(), + src_.data_ptr(), + num_indices, + sliceSize, + strideBefore, + nElemBefore); + }); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); - AT_CUDA_CHECK(cudaGetLastError()); if (permuted) self.copy_(src_.permute(inversePerm)); } @@ -476,21 +476,23 @@ Tensor& index_add_cuda_(Tensor & self, int64_t dim, const Tensor & index, const int mpc = at::cuda::getCurrentDeviceProperties()->multiProcessorCount; -#define SMALL_INDEX(TENSOR_TYPE, INDICES_TYPE, TYPE, SELF_DIM, SOURCE_DIM, IDX_DIM) \ +#define SMALL_INDEX(TENSOR_TYPE, INDICES_TYPE, TYPE, SELF_DIM, SOURCE_DIM, IDX_DIM) \ indexAddSmallIndex \ - <<>>( \ - selfInfo, sourceInfo, indexInfo, \ - selfAddDim, sourceAddDim, sliceSize, selfAddDimSize); + <<>>( \ + selfInfo, sourceInfo, indexInfo, \ + selfAddDim, sourceAddDim, sliceSize, selfAddDimSize); \ + C10_CUDA_KERNEL_LAUNCH_CHECK(); #define LARGE_INDEX(TENSOR_TYPE, INDICES_TYPE, TYPE, \ - SELF_DIM, SOURCE_DIM, IDX_DIM, IDX_IS_MAJOR) \ + SELF_DIM, SOURCE_DIM, IDX_DIM, IDX_IS_MAJOR) \ indexAddLargeIndex \ - <<>>( \ - selfInfo, sourceInfo, indexInfo, \ - selfAddDim, sourceAddDim, sourceTotalSize, \ - (IDX_IS_MAJOR) ? sliceSize : numIndex, \ - selfAddDimSize); + SELF_DIM, SOURCE_DIM, IDX_DIM, IDX_IS_MAJOR> \ + <<>>( \ + selfInfo, sourceInfo, indexInfo, \ + selfAddDim, sourceAddDim, sourceTotalSize, \ + (IDX_IS_MAJOR) ? sliceSize : numIndex, \ + selfAddDimSize); \ + C10_CUDA_KERNEL_LAUNCH_CHECK(); dim3 smallIndexGrid(std::min(THCCeilDiv(sliceSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8))); dim3 smallIndexBlock(std::min(sliceSize, (ptrdiff_t)128)); @@ -725,22 +727,24 @@ void index_select_out_cuda_impl(Tensor& out, const Tensor& self, long dim, int mpc = at::cuda::getCurrentDeviceProperties()->multiProcessorCount; -#define SMALL_INDEX(TENSOR_TYPE, INDICES_TYPE, TYPE, DST_DIM, SRC_DIM, IDX_DIM) \ +#define SMALL_INDEX(TENSOR_TYPE, INDICES_TYPE, TYPE, DST_DIM, SRC_DIM, IDX_DIM) \ indexSelectSmallIndex \ - <<>>( \ - outInfo, selfInfo, indicesInfo, \ - outSelectDim, selfSelectDim, static_cast(sliceSize), \ - selfSelectDimSize); + <<>>( \ + outInfo, selfInfo, indicesInfo, \ + outSelectDim, selfSelectDim, static_cast(sliceSize), \ + selfSelectDimSize); \ + C10_CUDA_KERNEL_LAUNCH_CHECK(); #define LARGE_INDEX(TENSOR_TYPE, INDICES_TYPE, TYPE, \ - DST_DIM, SRC_DIM, IDX_DIM, IDX_IS_MAJOR) \ + DST_DIM, SRC_DIM, IDX_DIM, IDX_IS_MAJOR) \ indexSelectLargeIndex \ - <<>>( \ - outInfo, selfInfo, indicesInfo, \ - outSelectDim, selfSelectDim, static_cast(outTotalSize), \ - static_cast((IDX_IS_MAJOR) ? sliceSize : numIndices), \ - selfSelectDimSize); + DST_DIM, SRC_DIM, IDX_DIM, IDX_IS_MAJOR> \ + <<>>( \ + outInfo, selfInfo, indicesInfo, \ + outSelectDim, selfSelectDim, static_cast(outTotalSize), \ + static_cast((IDX_IS_MAJOR) ? 
sliceSize : numIndices), \ + selfSelectDimSize); \ + C10_CUDA_KERNEL_LAUNCH_CHECK(); dim3 smallIndexGrid(std::min(THCCeilDiv(sliceSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8))); dim3 smallIndexBlock(std::min(sliceSize, (ptrdiff_t)128)); diff --git a/aten/src/ATen/native/cuda/MultinomialKernel.cu b/aten/src/ATen/native/cuda/MultinomialKernel.cu index a8779d3d97af..3d59617903b4 100644 --- a/aten/src/ATen/native/cuda/MultinomialKernel.cu +++ b/aten/src/ATen/native/cuda/MultinomialKernel.cu @@ -74,6 +74,7 @@ void renormRows(Tensor& t) { <<>>(t.data_ptr(), rows, cols); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); } @@ -348,6 +349,7 @@ void multinomial_kernel_impl(Tensor& result, const Tensor& self, const int64_t n self_v.stride(0), self_v.stride(1) ); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { // Generic, slow implementation with memory allocations @@ -399,12 +401,11 @@ void multinomial_kernel_impl(Tensor& result, const Tensor& self, const int64_t n numDist, numCategories, prefixSum.data_ptr(), normDist.data_ptr()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } } }); - AT_CUDA_CHECK(cudaGetLastError()); - if (inputSize == 1) { result.resize_({n_sample}); } diff --git a/aten/src/ATen/native/cuda/Normalization.cuh b/aten/src/ATen/native/cuda/Normalization.cuh index a0d37dd44be1..8355ac004308 100644 --- a/aten/src/ATen/native/cuda/Normalization.cuh +++ b/aten/src/ATen/native/cuda/Normalization.cuh @@ -558,6 +558,7 @@ void batch_norm_cuda_template(Tensor& output_, Tensor& save_mean_, Tensor& save_ if (!train) { batch_norm_transform_input_kernel <<>> (input, output, running_mean, running_var, weight, bias, epsilon); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { // for the reduction, we cannot use blocks for the batch dim, but if we have few threads in // the feature dimension, we'll use some threads for blocks @@ -566,10 +567,11 @@ void batch_norm_cuda_template(Tensor& output_, Tensor& save_mean_, Tensor& save_ dim3 threads(tf, std::max(1, MAX_BLOCK_SIZE/tf)); batch_norm_collect_statistics_kernel <<>> (input, epsilon, momentum, running_mean, running_var, save_mean, save_invstd); + C10_CUDA_KERNEL_LAUNCH_CHECK(); batch_norm_transform_input_kernel <<>> (input, output, save_mean, save_invstd, weight, bias, epsilon); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } - AT_CUDA_CHECK(cudaGetLastError()); } template @@ -615,7 +617,7 @@ std::tuple batch_norm_backward_cuda_template(const Tenso batch_norm_backward_kernel <<>> (input, grad_output, grad_input, grad_weight, grad_bias, weight, running_mean, running_var, save_mean, save_invstd, train, epsilon); - AT_CUDA_CHECK(cudaGetLastError()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); return std::make_tuple(grad_input_, grad_weight_, grad_bias_); } @@ -654,7 +656,7 @@ std::tuple batch_norm_stats_cuda_template(const Tensor& input_, dim3 threads(tf, std::max(1, MAX_BLOCK_SIZE/tf)); batch_norm_collect_statistics_kernel <<>> (input, epsilon, 0.0, dummy_mean, dummy_invstd, mean, invstd); - AT_CUDA_CHECK(cudaGetLastError()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); return std::make_tuple(mean_, invstd_); } @@ -694,7 +696,7 @@ void batch_norm_elemt_cuda_template(Tensor& output_, const Tensor& input_, const dim3 threads_trans(tf, tb); batch_norm_transform_input_kernel <<>> (input, output, mean, invstd, weight, bias, epsilon); - AT_CUDA_CHECK(cudaGetLastError()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } template @@ -727,7 +729,7 @@ std::tuple batch_norm_gather_stats_cuda_template(const Tensor& m int grid = std::max(1, features/block); batch_norm_reduce_statistics_kernel <<>> (mean, invstd, save_mean, save_invstd, 
running_mean, running_var, epsilon, momentum, counts); - AT_CUDA_CHECK(cudaGetLastError()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); return std::make_tuple(save_mean_, save_invstd_); } @@ -777,7 +779,7 @@ std::tuple batch_norm_backward_reduce_cuda_templ batch_norm_backward_reduce_kernel <<>> (input, grad_output, mean, invstd, sum_dy, sum_dy_xmu, grad_weight, grad_bias); - AT_CUDA_CHECK(cudaGetLastError()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); return std::make_tuple(sum_dy_, sum_dy_xmu_, grad_weight_, grad_bias_); } @@ -819,7 +821,7 @@ Tensor batch_norm_backward_elemt_cuda_template(const Tensor& grad_out_, const Te dim3 threads_trans(tf, tb); batch_norm_backward_elemt_kernel <<>> (input, grad_output, mean, invstd, weight, mean_dy, mean_dy_xmu, grad_input); - AT_CUDA_CHECK(cudaGetLastError()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); return grad_input_reshaped.view(input_.sizes()); } @@ -853,7 +855,7 @@ std::tuple batch_norm_update_stats_cuda_template( // NB: epsilon is unused by the Var transform, so we set it to 0 batch_norm_collect_statistics_kernel <<>> (input, 0., momentum, running_mean, running_var, save_mean, save_var); - AT_CUDA_CHECK(cudaGetLastError()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); return std::make_tuple(save_mean_, save_var_); } diff --git a/aten/src/ATen/native/cuda/ROCmLoops.cuh b/aten/src/ATen/native/cuda/ROCmLoops.cuh index b5115c6dcdfb..c339364b5a02 100644 --- a/aten/src/ATen/native/cuda/ROCmLoops.cuh +++ b/aten/src/ATen/native/cuda/ROCmLoops.cuh @@ -134,7 +134,7 @@ static void launch_kernel(int64_t N, const func_t& f) { dim3 grid((N + block.x * vt - 1) / (block.x * vt)); auto stream = at::cuda::getCurrentCUDAStream(); elementwise_kernel<<>>(N, f); - AT_CUDA_CHECK(cudaGetLastError()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } template @@ -296,7 +296,7 @@ static void launch_kernel(int64_t N, const func_t& f, array_t data) { int64_t grid = (N + block_work_size - 1) / block_work_size; auto stream = at::cuda::getCurrentCUDAStream(); elementwise_kernel<<>>(N, f, data); - AT_CUDA_CHECK(cudaGetLastError()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } template::value, int> = 0> diff --git a/aten/src/ATen/native/cuda/RangeFactories.cu b/aten/src/ATen/native/cuda/RangeFactories.cu index 4286f05111b6..107c3c28fdac 100644 --- a/aten/src/ATen/native/cuda/RangeFactories.cu +++ b/aten/src/ATen/native/cuda/RangeFactories.cu @@ -39,8 +39,10 @@ void gpu_kernel_with_index(at::Tensor &output, func_t f) { using scalar_t = typename function_traits::result_type; if (N <= std::numeric_limits::max()) { elementwise_kernel_with_index<<>>(N, f, output.data_ptr()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { elementwise_kernel_with_index<<>>(N, f, output.data_ptr()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } } @@ -105,7 +107,6 @@ Tensor& linspace_cuda_out(Tensor& result, Scalar start, Scalar end, c10::optiona result.copy_(r); } - AT_CUDA_CHECK(cudaGetLastError()); return result; } @@ -164,7 +165,6 @@ Tensor& logspace_cuda_out(Tensor& result, Scalar start, Scalar end, c10::optiona result.copy_(r); } - AT_CUDA_CHECK(cudaGetLastError()); return result; } @@ -201,7 +201,6 @@ Tensor& range_cuda_out(Tensor& result, Scalar start, Scalar end, Scalar step) { }); - AT_CUDA_CHECK(cudaGetLastError()); return result; } @@ -263,7 +262,6 @@ Tensor& arange_cuda_out(Tensor& result, Scalar start, Scalar end, Scalar step) { } }); - AT_CUDA_CHECK(cudaGetLastError()); return result; } diff --git a/aten/src/ATen/native/cuda/Reduce.cuh b/aten/src/ATen/native/cuda/Reduce.cuh index 618088cefb3a..ea797e6011af 100644 --- a/aten/src/ATen/native/cuda/Reduce.cuh +++ 
b/aten/src/ATen/native/cuda/Reduce.cuh @@ -817,15 +817,16 @@ static void launch_reduce_kernel(const ReduceConfig& config, const R& reduction) switch(config.output_vec_size) { case 4: reduce_kernel<<>>(reduction); + C10_CUDA_KERNEL_LAUNCH_CHECK(); break; case 2: reduce_kernel<<>>(reduction); + C10_CUDA_KERNEL_LAUNCH_CHECK(); break; default: reduce_kernel<<>>(reduction); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } - - AT_CUDA_CHECK(cudaGetLastError()); } class AccumulationBuffer { @@ -872,7 +873,7 @@ int get_output_vec_size(TensorIterator &iter) { vec_size /= 2; } }; - + uint64_t base_address = reinterpret_cast(iter.data_ptr(iter.noutputs())) / sizeof(scalar_t); update_vec_size(base_address); diff --git a/aten/src/ATen/native/cuda/ReflectionPad.cu b/aten/src/ATen/native/cuda/ReflectionPad.cu index 2b182f32b5e7..95a6825d507f 100644 --- a/aten/src/ATen/native/cuda/ReflectionPad.cu +++ b/aten/src/ATen/native/cuda/ReflectionPad.cu @@ -200,10 +200,9 @@ void reflection_pad1d_out_template( grid_size, block_size, 0, at::cuda::getCurrentCUDAStream()>>>( input.data_ptr(), output.data_ptr(), input_w, pad_l, pad_r); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } ); - - AT_CUDA_CHECK(cudaGetLastError()); } void reflection_pad1d_backward_out_template( @@ -213,7 +212,7 @@ void reflection_pad1d_backward_out_template( if (grad_input.numel() == 0) { return; } - + TORCH_CHECK(canUse32BitIndexMath(input), "input tensor must fit into 32-bit index math"); @@ -252,15 +251,14 @@ void reflection_pad1d_backward_out_template( grid_size, block_size, 0, at::cuda::getCurrentCUDAStream()>>>( grad_input.data_ptr(), grad_output.data_ptr(), input_w, pad_l, pad_r); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } ); - - AT_CUDA_CHECK(cudaGetLastError()); } void reflection_pad2d_out_template( Tensor &output, const Tensor &input_, IntArrayRef padding) { - + TORCH_CHECK(canUse32BitIndexMath(input_), "input tensor must fit into 32-bit index math"); @@ -331,10 +329,9 @@ void reflection_pad2d_out_template( input.data_ptr(), output.data_ptr(), input_w, input_h, pad_t, pad_b, pad_l, pad_r); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } ); - - AT_CUDA_CHECK(cudaGetLastError()); } void reflection_pad2d_backward_out_template( @@ -344,7 +341,7 @@ void reflection_pad2d_backward_out_template( if (grad_input.numel() == 0) { return; } - + TORCH_CHECK(canUse32BitIndexMath(input), "input tensor must fit into 32-bit index math"); TORCH_CHECK(canUse32BitIndexMath(grad_output_), @@ -393,10 +390,9 @@ void reflection_pad2d_backward_out_template( grad_input.data_ptr(), grad_output.data_ptr(), input_w, input_h, pad_t, pad_b, pad_l, pad_r); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } ); - - AT_CUDA_CHECK(cudaGetLastError()); } } // namespace diff --git a/aten/src/ATen/native/cuda/ReplicationPadding.cu b/aten/src/ATen/native/cuda/ReplicationPadding.cu index b896a47afed9..c80a98ddf13b 100644 --- a/aten/src/ATen/native/cuda/ReplicationPadding.cu +++ b/aten/src/ATen/native/cuda/ReplicationPadding.cu @@ -222,7 +222,7 @@ void replication_pad1d_out_cuda_template( (numInputDims == 3 && input.size(1) != 0 && input.size(2) != 0), "Expected 2D or 3D (batch mode) tensor with possibly 0 batch size and other non-zero dimensions for input, but got: ", input.sizes()); - + if (numInputDims == 3) { numBatch = input.size(0); planeDim++; @@ -238,17 +238,17 @@ void replication_pad1d_out_cuda_template( " Calculated output W: ", outputW); if (numInputDims == 2) { - output.resize_({numPlanes, outputW}); + output.resize_({numPlanes, outputW}); } else { output.resize_({numBatch, numPlanes, outputW}); } - + if (input.numel() 
== 0) { return; } AT_DISPATCH_FLOATING_TYPES_AND_HALF( - input.scalar_type(), "replication_pad1d_cuda", [&] { + input.scalar_type(), "replication_pad1d_cuda", [&] { if (numInputDims == 2) { auto input_ = input.unsqueeze(0); auto output_ = output.unsqueeze(0); @@ -263,6 +263,7 @@ void replication_pad1d_out_cuda_template( replication_pad_forward_kernel1d <<>>(devInput, devOutput, padL, padR); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { auto devInput = input.packed_accessor64(); auto devOutput = output.packed_accessor64(); @@ -275,10 +276,10 @@ void replication_pad1d_out_cuda_template( replication_pad_forward_kernel1d <<>>(devInput, devOutput, padL, padR); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } - } + } ); - AT_CUDA_CHECK(cudaGetLastError()); } void replication_pad1d_backward_out_cuda_template( @@ -323,8 +324,8 @@ void replication_pad1d_backward_out_cuda_template( auto gradInput_ = gradInput; auto gradOutput_ = gradOutput; if (numInputDims == 2) { - gradInput_ = gradInput.unsqueeze(0); - gradOutput_ = gradOutput.unsqueeze(0); + gradInput_ = gradInput.unsqueeze(0); + gradOutput_ = gradOutput.unsqueeze(0); } auto devGradInput = gradInput_.packed_accessor64(); auto devGradOutput = gradOutput_.packed_accessor64(); @@ -338,9 +339,8 @@ void replication_pad1d_backward_out_cuda_template( replication_pad_backward_kernel <<>>(devGradInput, devGradOutput, padL, padR); - } - ); - AT_CUDA_CHECK(cudaGetLastError()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + }); } void replication_pad2d_out_cuda_template( @@ -387,19 +387,17 @@ void replication_pad2d_out_cuda_template( " Calculated output H: ", outputH, " W: ", outputW); if (numInputDims == 3) { - output.resize_({numPlanes, outputH, outputW}); + output.resize_({numPlanes, outputH, outputW}); } else { output.resize_({numBatch, numPlanes, outputH, outputW}); } - + if (input.numel() == 0) { return; } AT_DISPATCH_FLOATING_TYPES_AND_HALF( - input.scalar_type(), "replication_pad2d_cuda", [&] { - - + input.scalar_type(), "replication_pad2d_cuda", [&] { if (numInputDims == 3) { auto input_ = input.unsqueeze(0); auto output_ = output.unsqueeze(0); @@ -415,6 +413,7 @@ void replication_pad2d_out_cuda_template( replication_pad_forward_kernel2d <<>>( devInput, devOutput, padT, padB, padL, padR); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { auto devInput = input.packed_accessor64(); auto devOutput = output.packed_accessor64(); @@ -428,10 +427,10 @@ void replication_pad2d_out_cuda_template( replication_pad_forward_kernel2d <<>>(devInput, devOutput, padT, padB, padL, padR); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } - } + } ); - AT_CUDA_CHECK(cudaGetLastError()); } void replication_pad2d_backward_out_cuda_template( @@ -499,9 +498,9 @@ void replication_pad2d_backward_out_cuda_template( replication_pad_backward_kernel <<>>(devGradInput, devGradOutput, padT, padB, padL, padR); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } ); - AT_CUDA_CHECK(cudaGetLastError()); } static inline void shapeCheck3d( @@ -650,10 +649,9 @@ void replication_pad3d_out_cuda_template( if (input.numel() == 0) { return; } - - AT_DISPATCH_FLOATING_TYPES_AND_HALF( - input.scalar_type(), "replication_pad3d_cuda", [&] { + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + input.scalar_type(), "replication_pad3d_cuda", [&] { if (numInputDims == 4) { auto input_ = input.unsqueeze(0); auto output_ = output.unsqueeze(0); @@ -670,6 +668,7 @@ void replication_pad3d_out_cuda_template( replication_pad_forward_kernel3d <<>>( devInput, devOutput, pfront, pback, ptop, pbottom, pleft, pright); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { auto devInput = 
input.packed_accessor64(); auto devOutput = output.packed_accessor64(); @@ -684,10 +683,10 @@ void replication_pad3d_out_cuda_template( replication_pad_forward_kernel3d <<>>( devInput, devOutput, pfront, pback, ptop, pbottom, pleft, pright); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } - } + } ); - AT_CUDA_CHECK(cudaGetLastError()); } void replication_pad3d_backward_out_cuda_template( @@ -726,8 +725,7 @@ void replication_pad3d_backward_out_cuda_template( gradInput.zero_(); AT_DISPATCH_FLOATING_TYPES_AND_HALF( - input.scalar_type(), "replication_pad3d_backward_cuda", [&] { - + input.scalar_type(), "replication_pad3d_backward_cuda", [&] { auto gradInput_ = gradInput; auto gradOutput_ = gradOutput; if (numInputDims == 4) { @@ -747,9 +745,9 @@ void replication_pad3d_backward_out_cuda_template( replication_pad_backward_kernel <<>>( devGradInput, devGradOutput, pfront, pback, ptop, pbottom, pleft, pright); - } + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } ); - AT_CUDA_CHECK(cudaGetLastError()); } } // namespace diff --git a/aten/src/ATen/native/cuda/ScanKernels.cu b/aten/src/ATen/native/cuda/ScanKernels.cu index 099512912203..384854505054 100644 --- a/aten/src/ATen/native/cuda/ScanKernels.cu +++ b/aten/src/ATen/native/cuda/ScanKernels.cu @@ -183,7 +183,7 @@ __host__ void scan_outer_dim_with_indices(const Tensor& self, Tensor& values, Te tensor_kernel_scan_outer_dim_with_indices<<>>( self.data_ptr(), values.data_ptr(), indices.data_ptr(), num_orows, num_irows, row_size, init, binary_op); - AT_CUDA_CHECK(cudaGetLastError()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } template @@ -199,7 +199,7 @@ __host__ void scan_innermost_dim_with_indices(const Tensor& self, Tensor& values tensor_kernel_scan_innermost_dim_with_indices<<>>( self.data_ptr(), values.data_ptr(), indices.data_ptr(), num_rows, row_size, init, binary_op); - AT_CUDA_CHECK(cudaGetLastError()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } template @@ -436,7 +436,7 @@ __host__ void scan_outer_dim(const Tensor& self, Tensor& result, tensor_kernel_scan_outer_dim<<>>( result.data_ptr(), self.data_ptr(), num_orows, num_irows, row_size, init, binary_op); - AT_CUDA_CHECK(cudaGetLastError()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } template @@ -456,7 +456,7 @@ void scan_innermost_dim(const Tensor& self, Tensor& result, scalar_t init, Binar tensor_kernel_scan_innermost_dim<<>>( result.data_ptr(), self.data_ptr(), num_rows, row_size, init, binary_op); - AT_CUDA_CHECK(cudaGetLastError()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } template @@ -485,6 +485,7 @@ void scan_cub(const Tensor& self, Tensor& result, scalar_t init, BinaryFunction result.data_ptr() + i - 1, self.data_ptr() + i, binary_op); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } size_t temp_storage_bytes = 0; AT_CUDA_CHECK(cub::DeviceScan::InclusiveScan( diff --git a/aten/src/ATen/native/cuda/ScatterGatherKernel.cu b/aten/src/ATen/native/cuda/ScatterGatherKernel.cu index 552384b45945..66ac81f5ecbf 100644 --- a/aten/src/ATen/native/cuda/ScatterGatherKernel.cu +++ b/aten/src/ATen/native/cuda/ScatterGatherKernel.cu @@ -72,11 +72,11 @@ static void _launch_scatter_gather_kernel(int64_t N, const func_t& f) { return; } - dim3 block(nt); - dim3 grid((N + block.x * vt - 1) / (block.x * vt)); - auto stream = at::cuda::getCurrentCUDAStream(); + const dim3 block(nt); + const dim3 grid((N + block.x * vt - 1) / (block.x * vt)); + const auto stream = at::cuda::getCurrentCUDAStream(); _scatter_gather_elementwise_kernel<<>>(N, f); - AT_CUDA_CHECK(cudaGetLastError()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } @@ -494,5 +494,5 @@ 
REGISTER_DISPATCH(scatter_fill_stub, &scatter_fill_cuda_kernel); REGISTER_DISPATCH(scatter_add_stub, &scatter_add_cuda_kernel); REGISTER_DISPATCH(scatter_reduce_stub, &scatter_reduce_cuda_kernel); REGISTER_DISPATCH(scatter_scalar_reduce_stub, &scatter_scalar_reduce_cuda_kernel); - + }} // namespace at::native diff --git a/aten/src/ATen/native/cuda/Shape.cu b/aten/src/ATen/native/cuda/Shape.cu index 7632438ba523..2831292845ec 100644 --- a/aten/src/ATen/native/cuda/Shape.cu +++ b/aten/src/ATen/native/cuda/Shape.cu @@ -294,7 +294,8 @@ void hip_parallel_cat(Tensor &out, const TensorList &inputs, int64_t dimension, #define HANDLE_CASE(DIMS) \ HIP_CatArrayBatchedCopy<<<\ catGrid, applyBlock, 0, stream.stream()>>>(\ - data, d_inputs, outputParam, dimension, outputParam.tensorStride[dimension]); + data, d_inputs, outputParam, dimension, outputParam.tensorStride[dimension]); \ + C10_CUDA_KERNEL_LAUNCH_CHECK(); switch (nDims) { case 1: HANDLE_CASE(1); @@ -310,7 +311,6 @@ void hip_parallel_cat(Tensor &out, const TensorList &inputs, int64_t dimension, break; } #undef HANDLE_CASE - AT_CUDA_CHECK(cudaGetLastError()); } } @@ -404,7 +404,8 @@ void parallel_cat(Tensor &out, const TensorList &inputs, int64_t dimension, #define HANDLE_CASE(DIMS) \ CatArrayBatchedCopy<<<\ catGrid, applyBlock, 0, stream.stream()>>>(\ - data, catMetaData, outputParam, dimension, outputParam.tensorStride[dimension]); + data, catMetaData, outputParam, dimension, outputParam.tensorStride[dimension]); \ + C10_CUDA_KERNEL_LAUNCH_CHECK(); switch (nDims) { case 1: HANDLE_CASE(1); @@ -420,7 +421,6 @@ void parallel_cat(Tensor &out, const TensorList &inputs, int64_t dimension, break; } #undef HANDLE_CASE - AT_CUDA_CHECK(cudaGetLastError()); } } } // namespace diff --git a/aten/src/ATen/native/cuda/SoftMax.cu b/aten/src/ATen/native/cuda/SoftMax.cu index ca00a3520f29..fb43dcb4c3c3 100644 --- a/aten/src/ATen/native/cuda/SoftMax.cu +++ b/aten/src/ATen/native/cuda/SoftMax.cu @@ -709,32 +709,32 @@ Tensor host_softmax(const Tensor & input_, const int64_t dim_, const bool half_t if (inner_size == 1) { dim3 grid(outer_size); AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, input.scalar_type(), "host_softmax", [&] { - using accscalar_t = acc_type; - if (!half_to_float) { - if (dim_size <= 1024 && dim_size*sizeof(scalar_t) <= 4096) { - dispatch_softmax_forward( - output.data_ptr(), input.data_ptr(), dim_size, dim_size, outer_size); - } else { - constexpr int ILP = sizeof(float4) / sizeof(scalar_t); - dim3 block = SoftMax_getBlockSize(ILP, dim_size); - cunn_SoftMaxForward - <<>>( - output.data_ptr(), input.data_ptr(), dim_size - ); - } - } else { - if (dim_size <= 1024 && dim_size*sizeof(scalar_t) <= 4096) { - dispatch_softmax_forward( - output.data_ptr(), input.data_ptr(), dim_size, dim_size, outer_size); + using accscalar_t = acc_type; + if (!half_to_float) { + if (dim_size <= 1024 && dim_size*sizeof(scalar_t) <= 4096) { + dispatch_softmax_forward( + output.data_ptr(), input.data_ptr(), dim_size, dim_size, outer_size); + } else { + constexpr int ILP = sizeof(float4) / sizeof(scalar_t); + dim3 block = SoftMax_getBlockSize(ILP, dim_size); + cunn_SoftMaxForward + <<>>( + output.data_ptr(), input.data_ptr(), dim_size); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } } else { - constexpr int ILP = sizeof(float4) / sizeof(accscalar_t); - dim3 block = SoftMax_getBlockSize(ILP, dim_size); - cunn_SoftMaxForward - <<>>( - output.data_ptr(), input.data_ptr(), dim_size - ); + if (dim_size <= 1024 && dim_size*sizeof(scalar_t) <= 4096) { + 
dispatch_softmax_forward( + output.data_ptr(), input.data_ptr(), dim_size, dim_size, outer_size); + } else { + constexpr int ILP = sizeof(float4) / sizeof(accscalar_t); + dim3 block = SoftMax_getBlockSize(ILP, dim_size); + cunn_SoftMaxForward + <<>>( + output.data_ptr(), input.data_ptr(), dim_size); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } } - } }); // This kernel runs in a 2D grid, where each application along y dimension has a fixed // outer_size, and runs in parallel over inner_size. Dimension x is parallel over outer_size. @@ -743,29 +743,28 @@ Tensor host_softmax(const Tensor & input_, const int64_t dim_, const bool half_t uint32_t smem_size; dim3 grid, block; AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, input.scalar_type(), "host_softmax", [&] { - using accscalar_t = acc_type; - if (!half_to_float) { - SpatialSoftMax_getLaunchSizes( - &cunn_SpatialSoftMaxForward, - outer_size, dim_size, inner_size, - grid, block, smem_size); - cunn_SpatialSoftMaxForward - <<>>( - output.data_ptr(), input.data_ptr(), outer_size, dim_size, inner_size - ); - } else { - SpatialSoftMax_getLaunchSizes( - &cunn_SpatialSoftMaxForward, - outer_size, dim_size, inner_size, - grid, block, smem_size); - cunn_SpatialSoftMaxForward - <<>>( - output.data_ptr(), input.data_ptr(), outer_size, dim_size, inner_size - ); - } + using accscalar_t = acc_type; + if (!half_to_float) { + SpatialSoftMax_getLaunchSizes( + &cunn_SpatialSoftMaxForward, + outer_size, dim_size, inner_size, + grid, block, smem_size); + cunn_SpatialSoftMaxForward + <<>>( + output.data_ptr(), input.data_ptr(), outer_size, dim_size, inner_size); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } else { + SpatialSoftMax_getLaunchSizes( + &cunn_SpatialSoftMaxForward, + outer_size, dim_size, inner_size, + grid, block, smem_size); + cunn_SpatialSoftMaxForward + <<>>( + output.data_ptr(), input.data_ptr(), outer_size, dim_size, inner_size); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } }); } - AT_CUDA_CHECK(cudaGetLastError()); } return output; } @@ -807,6 +806,7 @@ Tensor host_softmax_backward(const Tensor &grad_, const Tensor &output_, int64_t <<>>( gI.data_ptr(), output.data_ptr(), grad.data_ptr(), dim_size ); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } } else { if (dim_size <= 1024 && dim_size*sizeof(scalar_t) <= 4096) { @@ -819,6 +819,7 @@ Tensor host_softmax_backward(const Tensor &grad_, const Tensor &output_, int64_t <<>>( gI.data_ptr(), output.data_ptr(), grad.data_ptr(), dim_size ); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } } }); @@ -826,33 +827,35 @@ Tensor host_softmax_backward(const Tensor &grad_, const Tensor &output_, int64_t uint32_t smem_size; dim3 grid, block; AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, gI.scalar_type(), "host_softmax_backward", [&] { - using accscalar_t = acc_type; - if (!half_to_float) { - SpatialSoftMax_getLaunchSizes( - &cunn_SpatialSoftMaxBackward, - outer_size, dim_size, inner_size, - grid, block, smem_size); - - cunn_SpatialSoftMaxBackward - <<>>( - gI.data_ptr(), output.data_ptr(), grad.data_ptr(), - outer_size, dim_size, inner_size - ); - } else { - SpatialSoftMax_getLaunchSizes( - &cunn_SpatialSoftMaxBackward, - outer_size, dim_size, inner_size, - grid, block, smem_size); - - cunn_SpatialSoftMaxBackward - <<>>( - gI.data_ptr(), output.data_ptr(), grad.data_ptr(), - outer_size, dim_size, inner_size - ); - } + using accscalar_t = acc_type; + if (!half_to_float) { + SpatialSoftMax_getLaunchSizes( + &cunn_SpatialSoftMaxBackward, + outer_size, dim_size, inner_size, + grid, block, 
smem_size); + + cunn_SpatialSoftMaxBackward + <<>>( + gI.data_ptr(), output.data_ptr(), grad.data_ptr(), + outer_size, dim_size, inner_size + ); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } else { + SpatialSoftMax_getLaunchSizes( + &cunn_SpatialSoftMaxBackward, + outer_size, dim_size, inner_size, + grid, block, smem_size); + + cunn_SpatialSoftMaxBackward + <<>>( + gI.data_ptr(), output.data_ptr(), grad.data_ptr(), + outer_size, dim_size, inner_size + ); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } }); } - AT_CUDA_CHECK(cudaGetLastError()); + return gI; } } diff --git a/aten/src/ATen/native/cuda/Sorting.cu b/aten/src/ATen/native/cuda/Sorting.cu index 59b07653593e..33fc4a18bffa 100644 --- a/aten/src/ATen/native/cuda/Sorting.cu +++ b/aten/src/ATen/native/cuda/Sorting.cu @@ -204,6 +204,7 @@ struct KthValueLauncher { self_info.strides[collapse_self_dim], values_info, indices_info); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } }; @@ -238,6 +239,7 @@ struct MedianLauncher { num_slices, self_info.strides[collapse_self_dim], ignore_nan); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } }; @@ -290,8 +292,6 @@ void kthvalue_cuda_template( values.squeeze_(dim); indices.squeeze_(dim); } - - AT_CUDA_CHECK(cudaGetLastError()); } std::tuple kthvalue_out_impl_cuda( @@ -371,8 +371,6 @@ std::tuple median_with_indices_impl( vals, inds, in, dim, MedianLauncher(ignore_nan)); } }); - - AT_CUDA_CHECK(cudaGetLastError()); } guard.reset(); diff --git a/aten/src/ATen/native/cuda/WeightNorm.cu b/aten/src/ATen/native/cuda/WeightNorm.cu index d90dc03007fd..8261eda01a3c 100644 --- a/aten/src/ATen/native/cuda/WeightNorm.cu +++ b/aten/src/ATen/native/cuda/WeightNorm.cu @@ -394,14 +394,14 @@ std::tuple weight_norm_cuda g.data_ptr(), fast_dim_size, slower_dims_size); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); } // The kernel execution is asynchronous, so this will only catch errors on the kernel launch, // not the kernel's execution. Errors in kernel execution aren't guaranteed to be caught // until a later error check on a synchronizing CUDA call. Unfortunately, without manually - // synchronizing here, this is the best we can do. - AT_CUDA_CHECK(cudaGetLastError()); + // synchronizing here, the foregoing is the best we can do. return std::tuple{w, norms}; } @@ -486,14 +486,14 @@ std::tuple weight_norm_cuda_backward saved_norms.data_ptr(), fast_dim_size, slower_dims_size); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); } // The kernel execution is asynchronous, so this will only catch errors on the kernel launch, // not the kernel's execution. Errors in kernel execution aren't guaranteed to be caught // until a later error check on a synchronizing CUDA call. Unfortunately, without manually - // synchronizing here, this is the best we can do. - AT_CUDA_CHECK(cudaGetLastError()); + // synchronizing here, the foregoing is the best we can do. return std::tuple{grad_v, grad_g}; } From 046ea6696dde24253bea07a53c757738d4f96b43 Mon Sep 17 00:00:00 2001 From: Sebastian Messmer Date: Tue, 8 Dec 2020 03:41:19 -0800 Subject: [PATCH 030/250] Enable faithful API for all ops (#47711) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/47711 Seems we generated the declaration but the definition only for c10-full ops. We should also generate the definition for non-c10-full ops. This makes future migrations of ops from non-c10-full to c10-full have a lower impact on the C++ API. 
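As a rough illustration only (the op name `my_example_op` is invented and the exact generated code differs), the two overloads in question for an op taking TensorOptions look roughly like the sketch below; the faithful one was previously declared for every op but defined only for c10-full ops:

```cpp
#include <ATen/ATen.h>

namespace at {

// Convenience overload: TensorOptions stays grouped.
Tensor my_example_op(const Tensor& self, TensorOptions options = {});

// Faithful overload: arguments follow the JIT schema, with TensorOptions
// scattered into individual optionals. Its declaration was already generated
// for every op; this change also generates its definition (a thin forward to
// the dispatcher) for non-c10-full ops.
Tensor my_example_op(
    const Tensor& self,
    c10::optional<ScalarType> dtype,
    c10::optional<Layout> layout,
    c10::optional<Device> device,
    c10::optional<bool> pin_memory);

} // namespace at
```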
ghstack-source-id: 118064755 Test Plan: waitforsandcastle Reviewed By: bhosmer Differential Revision: D24835006 fbshipit-source-id: 8f5c3c0ffcdc9b479ca3785d57da16db508795f5 --- tools/codegen/gen.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/codegen/gen.py b/tools/codegen/gen.py index 4db060acd401..bd76fc11f670 100644 --- a/tools/codegen/gen.py +++ b/tools/codegen/gen.py @@ -480,8 +480,7 @@ def generate_defn(sig: CppSignature) -> str: result = generate_defn(sig_group.signature) if sig_group.faithful_signature is not None: - if local.use_c10_dispatcher().dispatcher_uses_new_style(): - result += generate_defn(sig_group.faithful_signature) + result += generate_defn(sig_group.faithful_signature) return result From 3ef36dca8ec11540e35876c5e04c4f3ed63e585a Mon Sep 17 00:00:00 2001 From: Sebastian Messmer Date: Tue, 8 Dec 2020 03:41:19 -0800 Subject: [PATCH 031/250] Faithful out arguments (#47712) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/47712 This adds a faithful API for ops with out arguments, as described in https://docs.google.com/document/d/1h7nBibRwkRLQ8rsPhfALlwWR0QbkdQm30u4ZBwmaps8/edit# . After this, an op will generate the following overloads for the C++ API: ```cpp // Generated from the aten::abs operator (NOT from aten::abs.out) Tensor at::abs(Tensor& self) // Generated from the aten::abs.out operator Tensor& at::abs(Tensor& self, Tensor& out) Tensor& at::abs_out(Tensor& out, Tensor& self) ``` This is an important step towards making those ops c10-full (it allows VariableType, XLA and other backends to ignore reordering and just call through with the same argument order), but this does not make any of those ops c10-full yet. It enables the faithful API independent from c10-fullness. That means the API is more consistent with the same API for all ops and making an op c10-full in the future will not trigger future C++ API changes. ghstack-source-id: 118068091 Test Plan: waitforsandcastle Reviewed By: ezyang Differential Revision: D24835252 fbshipit-source-id: dedfabd07140fc8347bbf16ff219aad3b20f2870 --- tools/codegen/api/cpp.py | 15 ++++-- tools/codegen/api/dispatcher.py | 23 ++++++++-- tools/codegen/api/native.py | 3 +- tools/codegen/api/python.py | 4 +- tools/codegen/api/types.py | 81 ++++++++++++++------------------- tools/codegen/gen.py | 32 +++++++++---- 6 files changed, 92 insertions(+), 66 deletions(-) diff --git a/tools/codegen/api/cpp.py b/tools/codegen/api/cpp.py index b20497b5a82c..ea7179fdc599 100644 --- a/tools/codegen/api/cpp.py +++ b/tools/codegen/api/cpp.py @@ -23,10 +23,14 @@ # BTW: policy on name collisions: we try not to have types with # collisions, but functions are fair game to collide -def name(func: FunctionSchema) -> str: +def name(func: FunctionSchema, *, faithful_name_for_out_overloads: bool = False) -> str: name = str(func.name.name) if func.is_out_fn(): - name += '_out' + if faithful_name_for_out_overloads: + name += '_outf' + else: + name += '_out' + return name # Translation of "value types" in JIT schema to C++ API type. 
Value @@ -273,10 +277,11 @@ def argument_faithful( return argument(a) def group_arguments( - func: FunctionSchema, *, method: bool + func: FunctionSchema, *, method: bool, faithful: bool, ) -> Sequence[Union[Argument, TensorOptionsArguments, SelfArgument]]: args: List[Union[Argument, SelfArgument, TensorOptionsArguments]] = [] - args.extend(func.arguments.out) + if not faithful: + args.extend(func.arguments.out) args.extend(func.arguments.pre_self_positional) if func.arguments.self_arg is not None: if method: @@ -288,4 +293,6 @@ def group_arguments( if func.arguments.tensor_options is not None: args.append(func.arguments.tensor_options) args.extend(func.arguments.post_tensor_options_kwarg_only) + if faithful: + args.extend(func.arguments.out) return args diff --git a/tools/codegen/api/dispatcher.py b/tools/codegen/api/dispatcher.py index 8f3925de0041..b95803ca4e81 100644 --- a/tools/codegen/api/dispatcher.py +++ b/tools/codegen/api/dispatcher.py @@ -68,7 +68,7 @@ def name(func: FunctionSchema) -> str: def arguments(func: FunctionSchema) -> Tuple[DispatcherArgument, ...]: if local.use_c10_dispatcher().dispatcher_uses_new_style(): - return tuple(map(argument, itertools.chain(func.arguments.out, func.arguments.positional, func.arguments.kwarg_only))) + return tuple(map(argument, itertools.chain(func.arguments.positional, func.arguments.kwarg_only, func.arguments.out))) else: return tuple( DispatcherArgument(type=la.type, name=la.name, argument=la.argument) @@ -137,7 +137,22 @@ def cppargument_exprs( else: assert_never(a) -def cpparguments_exprs(args: Sequence[CppArgumentPack]) -> Sequence[DispatcherExpr]: +def cpparguments_exprs(func: FunctionSchema, * , method: bool, api_is_faithful: bool) -> Sequence[DispatcherExpr]: + dispatcher_calling_convention_is_faithful = local.use_c10_dispatcher().dispatcher_uses_new_style() + arguments = cpp.group_arguments(func, method=method, faithful=dispatcher_calling_convention_is_faithful) + + if api_is_faithful: + argument_packs = tuple( + cpp.argument_faithful(a) for a in arguments + ) + else: + argument_packs = tuple( + cpp.argument(a) for a in arguments + ) + + return _cpparguments_exprs(argument_packs) + +def _cpparguments_exprs(args: Sequence[CppArgumentPack]) -> Sequence[DispatcherExpr]: tensor_options = next( (a.this for a in args if isinstance(a, CppSingleArgumentPack) and isinstance(a.this.argument, TensorOptionsArguments)), @@ -148,13 +163,13 @@ def cpparguments_exprs(args: Sequence[CppArgumentPack]) -> Sequence[DispatcherEx # I don't think this is entirely sound, but it should be reasonably # close def nativearguments_exprs(args: Sequence[NativeArgument]) -> Sequence[DispatcherExpr]: - return cpparguments_exprs([ + return _cpparguments_exprs([ CppSingleArgumentPack(CppArgument(type=a.type, name=a.name, default=None, argument=a.argument)) for a in args ]) def exprs(args: Sequence[DispatcherArgument]) -> Sequence[DispatcherExpr]: - return cpparguments_exprs([ + return _cpparguments_exprs([ CppSingleArgumentPack(CppArgument(type=a.type, name=a.name, default=None, argument=a.argument)) for a in args ]) diff --git a/tools/codegen/api/native.py b/tools/codegen/api/native.py index b9e5257aef85..620e1c8cbf8c 100644 --- a/tools/codegen/api/native.py +++ b/tools/codegen/api/native.py @@ -105,4 +105,5 @@ def argument(a: Union[Argument, SelfArgument, TensorOptionsArguments]) -> Sequen assert_never(a) def arguments(func: FunctionSchema) -> Tuple[NativeArgument, ...]: - return tuple(i for arg in cpp.group_arguments(func, method=False) for i in argument(arg)) + 
args = cpp.group_arguments(func, method=False, faithful=local.use_c10_dispatcher() is UseC10Dispatcher.full) + return tuple(i for arg in args for i in argument(arg)) diff --git a/tools/codegen/api/python.py b/tools/codegen/api/python.py index 26b0f8eb8076..e7383d7cf76b 100644 --- a/tools/codegen/api/python.py +++ b/tools/codegen/api/python.py @@ -583,7 +583,7 @@ def _cpp_signature(f: NativeFunction, *, method: bool = False) -> CppSignature: def has_tensor_options(f: NativeFunction) -> bool: return any(filter(lambda a: isinstance(a, TensorOptionsArguments), - cpp.group_arguments(f.func, method=False))) + cpp.group_arguments(f.func, method=False, faithful=True))) # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # # @@ -731,7 +731,7 @@ def signature(f: NativeFunction, *, method: bool = False, pyi: bool = False) -> # Skip SelfArgument if this is method. # Skip TensorOptionsArguments in C++ signature. Python side TensorOptions # arguments are created based on different rules - see below. - cpp_args = cpp.group_arguments(f.func, method=method) + cpp_args = cpp.group_arguments(f.func, method=method, faithful=True) args = tuple(a for a in cpp_args if isinstance(a, Argument)) input_arg_set = set(a.name for a in f.func.arguments.positional) diff --git a/tools/codegen/api/types.py b/tools/codegen/api/types.py index 32caf26f223f..796cd68f9233 100644 --- a/tools/codegen/api/types.py +++ b/tools/codegen/api/types.py @@ -68,7 +68,7 @@ class CppSingleArgumentPack(CppArgumentPackIface): this: CppArgument def no_default(self) -> 'CppSingleArgumentPack': - return CppSingleArgumentPack(self.this.no_default()) + return CppSingleArgumentPack(this=self.this.no_default()) @property def type(self) -> str: @@ -150,49 +150,25 @@ class CppSignature: # The schema this signature is derived from func: FunctionSchema - # Enough information about the C++ types to generate a full - # C++ type signature for this signature. I'm not too sure - # if these are the right representations, so for now this - # is intended to be more abstract. - _argument_packs: Tuple[CppArgumentPack, ...] - _returns_type: str + # Is this a C++ signature for a method, i.e. Tensor::my_op(...)? + method: bool + + # Is this a faithful C++ signature (i.e. following the JIT schema) or a convenience API + # (i.e. with a potential TensorOptions argument and out arguments in the front) + faithful: bool # Return the unpacked argument structure of this signature, # discarding information about which arguments are semantically # related to each other. def arguments(self) -> Sequence[CppArgument]: - return [sub_a for a in self._argument_packs for sub_a in a.explicit_arguments()] + return [sub_a for a in self.argument_packs() for sub_a in a.explicit_arguments()] # Return the packed argument structure of this signature. This preserves # high-level structure of the arguments so you may find it easier to do # translations working with this representation. 
def argument_packs(self) -> Sequence[CppArgumentPack]: - return self._argument_packs - - # Render the C++ declaration for this signature - def decl(self) -> str: - cpp_args_str = ', '.join(map(str, self.arguments())) - return f"{self._returns_type} {cpp.name(self.func)}({cpp_args_str})" - - # Render the C++ definition for this signature, not including - # the body (with curly braces) - def defn(self, name: Optional[str] = None, *, prefix: str = "") -> str: - cpp_args_str = ', '.join(a.str_no_default() for a in self.arguments()) - if name is None: - name = prefix + cpp.name(self.func) - return f"{self._returns_type} {name}({cpp_args_str})" - - # NB: This constructor knows how to disambiguate defaults when - # faithful is True. Ideally this would live as an external process - # see https://github.com/pytorch/pytorch/pull/45666 - @staticmethod - def _from_grouped_arguments( - func: FunctionSchema, - arguments: Sequence[Union[Argument, TensorOptionsArguments, SelfArgument]], - *, - faithful: bool - ) -> 'CppSignature': - if faithful: + grouped_args = cpp.group_arguments(self.func, method=self.method, faithful=self.faithful) + if self.faithful: # Faithful signatures will ungroup arguments into argument # packs. # @@ -201,17 +177,31 @@ def _from_grouped_arguments( # principle, we should be able to do this at some later # point in time with other overload disambiguation argument_packs = tuple( - cpp.argument_faithful(a).no_default() for a in arguments + cpp.argument_faithful(a).no_default() for a in grouped_args ) else: argument_packs = tuple( - cpp.argument(a) for a in arguments + cpp.argument(a) for a in grouped_args ) - return CppSignature( - func=func, - _argument_packs=argument_packs, - _returns_type=cpp.returns_type(func.returns), - ) + return argument_packs + + def name(self) -> str: + return cpp.name(self.func, faithful_name_for_out_overloads=self.faithful) + + # Render the C++ declaration for this signature + def decl(self) -> str: + returns_type = cpp.returns_type(self.func.returns) + cpp_args_str = ', '.join(map(str, self.arguments())) + return f"{returns_type} {self.name()}({cpp_args_str})" + + # Render the C++ definition for this signature, not including + # the body (with curly braces) + def defn(self, *, prefix: str = "") -> str: + returns_type = cpp.returns_type(self.func.returns) + cpp_args_str = ', '.join(a.str_no_default() for a in self.arguments()) + name = prefix + self.name() + return f"{returns_type} {name}({cpp_args_str})" + # Represents group of all CppSignatures associated with a # FunctionSchema. 
Right now, that's the regular, user-visible @@ -225,13 +215,12 @@ class CppSignatureGroup: @staticmethod def from_schema(func: FunctionSchema, *, method: bool) -> 'CppSignatureGroup': - grouped_arguments = cpp.group_arguments(func, method=method) faithful_signature: Optional[CppSignature] - if any(isinstance(a, TensorOptionsArguments) for a in grouped_arguments): - faithful_signature = CppSignature._from_grouped_arguments(func, grouped_arguments, faithful=True) + if func.arguments.tensor_options is not None or len(func.arguments.out) > 0: + faithful_signature = CppSignature(func=func, faithful=True, method=method) else: faithful_signature = None - signature = CppSignature._from_grouped_arguments(func, grouped_arguments, faithful=False) + signature = CppSignature(func=func, faithful=False, method=method) return CppSignatureGroup( func=func, signature=signature, @@ -385,7 +374,7 @@ class MetaArgument: type: str name: str # structured kernels (for which MetaArgument matters) always will - # be use_c10_dispatcher full. That means JIT arguments and + # be use_c10_dispatcher full. That means JIT arguments and # meta arguments are always in 1:1 correspondence. If this is ever not true # we will have to do something more fancy here. argument: Argument diff --git a/tools/codegen/gen.py b/tools/codegen/gen.py index bd76fc11f670..3e5744207aff 100644 --- a/tools/codegen/gen.py +++ b/tools/codegen/gen.py @@ -462,10 +462,15 @@ def __call__(self, f: NativeFunction) -> Optional[str]: assert self.target is Target.DEFINITION - def generate_defn(sig: CppSignature) -> str: + def generate_defn(faithful: bool) -> str: dispatcher_sig = DispatcherSignature.from_schema(f.func) - dispatcher_exprs = dispatcher.cpparguments_exprs(sig.argument_packs()) + if faithful and sig_group.faithful_signature is not None: + sig = sig_group.faithful_signature + else: + sig = sig_group.signature + + dispatcher_exprs = dispatcher.cpparguments_exprs(f.func, method=False, api_is_faithful=faithful) dispatcher_exprs_str = ', '.join(a.expr for a in dispatcher_exprs) return f""" @@ -478,9 +483,9 @@ def generate_defn(sig: CppSignature) -> str: }} """ - result = generate_defn(sig_group.signature) + result = generate_defn(sig_group.faithful_signature is None) if sig_group.faithful_signature is not None: - result += generate_defn(sig_group.faithful_signature) + result += generate_defn(True) return result @@ -512,10 +517,16 @@ def __call__(self, f: NativeFunction) -> Optional[str]: assert self.target is Target.DEFINITION - def generate_defn(sig: CppSignature) -> str: + def generate_defn(faithful: bool) -> str: dispatcher_sig = DispatcherSignature.from_schema(f.func) - dispatcher_exprs = dispatcher.cpparguments_exprs(sig.argument_packs()) + if faithful: + sig = sig_group.faithful_signature + assert sig is not None + else: + sig = sig_group.signature + + dispatcher_exprs = dispatcher.cpparguments_exprs(f.func, method=True, api_is_faithful=faithful) dispatcher_exprs_str = ', '.join(a.expr for a in dispatcher_exprs) return f""" @@ -528,9 +539,9 @@ def generate_defn(sig: CppSignature) -> str: }} """ - result = generate_defn(sig_group.signature) + result = generate_defn(faithful=False) if sig_group.faithful_signature is not None: - result += generate_defn(sig_group.faithful_signature) + result += generate_defn(faithful=True) return result @@ -848,7 +859,10 @@ def compute_declaration_yaml(f: NativeFunction) -> object: for a in schema_order_jit_arguments ] - cpp_schema_order_types = [cpp.argument(a).type for a in schema_order_jit_arguments] + 
cpp_schema_order_types = [ + cpp.argument(a).type for a in schema_order_jit_arguments + ] + cpp_returns = cpp.returns_type(f.func.returns) schema_order_cpp_signature = f"{cpp_returns} ({', '.join(cpp_schema_order_types)})" From b643dbb8a4bf9850171b2de848b6b89206973972 Mon Sep 17 00:00:00 2001 From: Sebastian Messmer Date: Tue, 8 Dec 2020 03:41:19 -0800 Subject: [PATCH 032/250] VariableType calls faithful C++ API for c10-full out ops (#47792) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/47792 For operators with out arguments, VariableType previously called the out overload of the C++ API because that's all we had. We introduced a faithful C++ API that takes out arguments in schema-order in D24835252 and this PR changes VariableType to use that API instead. Note that this only applies to c10-full ops. Non-c10-full ops still call the unfaithful API. There aren't any c10-full out ops at the moment. So this PR can only be tested and evaluated together with PRs on top that make ops with out arguments c10-full. ghstack-source-id: 118068088 Test Plan: waitforsandcastle Reviewed By: ezyang Differential Revision: D24901945 fbshipit-source-id: a99db7e4d96fcc421f9664504f87df68fe1c482f --- tools/autograd/gen_variable_type.py | 9 +++++++-- tools/autograd/utils.py | 8 +++++++- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index f0ecd55a9b66..4948ac3af0dc 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -23,7 +23,7 @@ # differentiable subcomponents. # -from .utils import CodeTemplate, nested_dict, write +from .utils import CodeTemplate, nested_dict, write, make_out_api_name_faithful from .gen_autograd import VIEW_FUNCTIONS, VIEW_FUNCTIONS_WITH_METADATA_CHANGE, \ MULTI_OUTPUT_SAFE_FUNCTIONS, RETURNS_VIEWS_OF_INPUT from .gen_autograd_functions import uses_single_grad @@ -614,8 +614,13 @@ def save_variables( def emit_dispatch_call(api_name, input_base, unpacked_args): """ Dispatch call via function in a namespace or method on Tensor.""" if 'namespace' in declaration['method_of']: + if declaration['use_c10_dispatcher'] in ['hacky_wrapper_for_legacy_signatures', 'full']: + dispatcher_api_name = make_out_api_name_faithful(api_name) + else: + assert declaration['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper' + dispatcher_api_name = api_name call = CALL_DISPATCH_VIA_NAMESPACE.substitute( - api_name=api_name, + api_name=dispatcher_api_name, unpacked_args=unpacked_args) else: call = CALL_DISPATCH_VIA_METHOD.substitute( diff --git a/tools/autograd/utils.py b/tools/autograd/utils.py index b4889d219e9c..86758b5b3ff3 100644 --- a/tools/autograd/utils.py +++ b/tools/autograd/utils.py @@ -47,8 +47,14 @@ def split_name_params(prototype): def uninplace_api_name(api_name): if api_name.endswith('_') and not api_name.endswith('__'): api_name = api_name[:-1] + return unout_api_name(api_name) + +def make_out_api_name_faithful(api_name): + # Variable kernel needs to call the _outf overload instead of the _out overload + # because the _outf overload matches the argument order as it's passed into + # the variable kernel if api_name.endswith('_out'): - api_name = api_name[:-4] + api_name = api_name + 'f' return api_name From 07978bd62e0c59b75bdcbf993ccdf9e127d7bf9a Mon Sep 17 00:00:00 2001 From: Ansha Yu Date: Tue, 8 Dec 2020 05:52:48 -0800 Subject: [PATCH 033/250] [static runtime] fuse inference ops (1) (#48948) Summary: Pull Request resolved: 
https://github.com/pytorch/pytorch/pull/48948 Fuse inference ops for the following inside static runtime: ConcatAddMulReplaceNaNClip CastedBatchOneHotLengths ConcatBatchMatMulBatchGather TODO: 1. add unit tests 2. add more restrictions on the graph transform (e.g. check inputs, check outputs not used elsewhere) Test Plan: Run adindexer model with static runtime and fusion; check ops ``` MKL_NUM_THREADS=1 OMP_NUM_THREADS=1 numactl -m 0 -C 3 ./buck-out/opt/gen/caffe2/caffe2/fb/predictor/ptvsc2_predictor_bench --scripted_model=/data/users/ansha/tmp/adindexer/traced_precomputation2.pt --pt_inputs=/data/users/ansha/tmp/adindexer/merge/container_precomputation_bs1.pt --iters=3000 --warmup_iters=10000 --num_threads=1 --pred_net=/data/users/ansha/tmp/adindexer/precomputation_merge_net.pb --c2_inputs=/data/users/ansha/tmp/adindexer/merge/c2_inputs_precomputation_bs1.pb --c2_sigrid_transforms_opt=1 --c2_use_memonger=1 --c2_weights=/data/users/ansha/tmp/adindexer/merge/c2_weights_precomputation.pb --pt_enable_static_runtime ``` transformed model graph contains the fused ops: P151559641 Results before fusion: P151567611 Results after fusion: P151566783 (8% speedup for bs=20, 14% speedup for bs=1) Reviewed By: hlu1 Differential Revision: D25224107 fbshipit-source-id: c8442e8ceb018879c61ce564367b1c1b9412601b --- tools/build_variables.bzl | 1 + torch/csrc/jit/runtime/static/impl.cpp | 3 + torch/csrc/jit/runtime/static/passes.cpp | 83 ++++++++++++++++++++++++ torch/csrc/jit/runtime/static/passes.h | 9 +++ 4 files changed, 96 insertions(+) create mode 100644 torch/csrc/jit/runtime/static/passes.cpp create mode 100644 torch/csrc/jit/runtime/static/passes.h diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index 7e5a5e4e7f8a..146abca386eb 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -268,6 +268,7 @@ core_sources_full_mobile = [ core_sources_full = core_sources_full_mobile + [ "torch/csrc/jit/runtime/static/impl.cpp", "torch/csrc/jit/runtime/static/ops.cpp", + "torch/csrc/jit/runtime/static/passes.cpp", ] libtorch_core_sources = sorted(core_sources_common + core_sources_full + core_trainer_sources) diff --git a/torch/csrc/jit/runtime/static/impl.cpp b/torch/csrc/jit/runtime/static/impl.cpp index 07d41fb1f642..dabc19dfc4fe 100644 --- a/torch/csrc/jit/runtime/static/impl.cpp +++ b/torch/csrc/jit/runtime/static/impl.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include namespace torch { @@ -21,6 +22,8 @@ void OptimizeGraph(std::shared_ptr& graph) { ConstantPropagation(graph); RemoveTensorMutation(graph); ConstantPropagation(graph); + FuseInferenceOpsForSparseNN(graph); + ConstantPropagation(graph); } void CheckGraphEligibility(const std::shared_ptr& graph) { diff --git a/torch/csrc/jit/runtime/static/passes.cpp b/torch/csrc/jit/runtime/static/passes.cpp new file mode 100644 index 000000000000..a75d187b2a49 --- /dev/null +++ b/torch/csrc/jit/runtime/static/passes.cpp @@ -0,0 +1,83 @@ +#include +#include + +namespace torch { +namespace jit { + +void ConcatAddMulReplaceNaNClip(std::shared_ptr& graph) { + // TODO:: check restrictions for inputs; outputs not used elsewhere + std::string pattern = R"IR( + graph(%a, %b, %c, %d, %e, %f, %g, %h, %i, %j): + %y0 = aten::cat(%a, %b) + %y1 = aten::add(%y0, %c, %d) + %y2 = aten::mul(%y1, %e) + %y3 = aten::nan_to_num(%y2, %f, %g, %h) + %res = aten::clamp(%y3, %i, %j) + return (%res))IR"; + std::string pattern2 = R"IR( + graph(%a, %b, %c, %d, %e, %f, %g, %h, %i, %j): + %y0 = aten::cat(%a, %b) + %y1 = aten::add(%y0, %c, 
%d) + %y2 = aten::mul(%y1, %e) + %y3 = aten::nan_to_num_(%y2, %f, %g, %h) + %res = aten::clamp(%y3, %i, %j) + return (%res))IR"; + std::string fused_pattern = R"IR( + graph(%a, %b, %c, %d, %e, %f, %g, %h, %i, %j): + %res = fb::concat_add_mul_replacenan_clip(%c, %e, %a, %i, %j) + return (%res))IR"; + + SubgraphRewriter fuse; + fuse.RegisterRewritePattern(pattern, fused_pattern); + fuse.runOnGraph(graph); + + fuse.RegisterRewritePattern(pattern2, fused_pattern); + fuse.runOnGraph(graph); +} + +void CastedBatchOneHotLengths(std::shared_ptr& graph) { + // TODO:: check restrictions for inputs; outputs not used elsewhere + std::string pattern = R"IR( + graph(%a, %b, %c, %d, %e, %f, %g): + %y0 : Tensor = aten::to(%a, %b, %c, %c, %d) + %y1 : Tensor = fb::batch_one_hot_lengths(%y0, %e, %f) + %res : Tensor = aten::to(%y1, %g, %c, %c, %d) + return (%res))IR"; + std::string fused_pattern = R"IR( + graph(%a, %b, %c, %d, %e, %f, %g): + %res : Tensor = fb::casted_batch_one_hot_lengths(%a, %e, %f) + return (%res))IR"; + SubgraphRewriter fuse; + fuse.RegisterRewritePattern(pattern, fused_pattern); + fuse.runOnGraph(graph); +} + +void ConcatBatchMatMulBatchGather(std::shared_ptr& graph) { + // TODO:: check restrictions for inputs; outputs not used elsewhere + std::string pattern = R"IR( + graph(%a, %b, %c, %d, %e, %f): + %y0 : Tensor = aten::stack(%a, %b) + %y1 : Tensor = aten::transpose(%y0, %b, %c) + %y2 : Tensor = aten::bmm(%y0, %y1) + %y3 : Tensor = aten::flatten(%y2, %d, %e) + %res : Tensor = aten::index_select(%y3, %b, %f) + return (%res))IR"; + std::string fused_pattern = R"IR( + graph(%a, %b, %c, %d, %e, %f): + %res : Tensor = fb::concat_batch_matmul_batch_gather(%f, %a) + return (%res))IR"; + SubgraphRewriter fuse; + fuse.RegisterRewritePattern(pattern, fused_pattern); + fuse.runOnGraph(graph); +} + +void FuseInferenceOpsForSparseNN(std::shared_ptr& graph) { +#ifdef FBCODE_CAFFE2 + ConcatAddMulReplaceNaNClip(graph); + CastedBatchOneHotLengths(graph); + ConcatBatchMatMulBatchGather(graph); +#endif +} + +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/runtime/static/passes.h b/torch/csrc/jit/runtime/static/passes.h new file mode 100644 index 000000000000..7cc9c52f7696 --- /dev/null +++ b/torch/csrc/jit/runtime/static/passes.h @@ -0,0 +1,9 @@ +#include + +namespace torch { +namespace jit { + +void FuseInferenceOpsForSparseNN(std::shared_ptr& graph); + +} // namespace jit +} // namespace torch From 39445f718c7190ee1d703be8028c8bd5c7d80f85 Mon Sep 17 00:00:00 2001 From: Rong Rong Date: Tue, 8 Dec 2020 07:03:44 -0800 Subject: [PATCH 034/250] Revert D25375885: [pytorch][PR] Reenable some BF16 tests on CUDA Test Plan: revert-hammer Differential Revision: D25375885 (https://github.com/pytorch/pytorch/commit/e3893b867fd39cf4f10a129ba9f689eebf10f82b) Original commit changeset: 2e19fe725ae9 fbshipit-source-id: 69829f3fff4d4a2d1a71bb52e90d3c7f16b27fa3 --- test/test_tensor_creation_ops.py | 3 ++- test/test_torch.py | 44 +++++++++++++------------------- 2 files changed, 20 insertions(+), 27 deletions(-) diff --git a/test/test_tensor_creation_ops.py b/test/test_tensor_creation_ops.py index 9be3e6db5bf0..b355005b1c69 100644 --- a/test/test_tensor_creation_ops.py +++ b/test/test_tensor_creation_ops.py @@ -14,7 +14,7 @@ IS_WINDOWS) from torch.testing._internal.common_device_type import ( instantiate_device_type_tests, deviceCountAtLeast, onlyOnCPUAndCUDA, - onlyCPU, largeTensorTest, precisionOverride, dtypes, + onlyCPU, skipCUDAIfNotRocm, largeTensorTest, precisionOverride, dtypes, onlyCUDA, 
skipCPUIf, dtypesIfCUDA, dtypesIfCPU) # TODO: refactor tri_tests_args, _compare_trilu_indices, run_additional_tri_tests @@ -2581,6 +2581,7 @@ def test_arange_device_vs_cpu(self, device, dtype): self.assertEqual(cpu_tensor, device_tensor) @onlyCUDA + @skipCUDAIfNotRocm def test_arange_bfloat16(self, device): ref_tensor = torch.tensor([0, 1, 2, 3], dtype=torch.bfloat16, device=device) bfloat16_tensor = torch.arange(0, 4, dtype=torch.bfloat16, device=device) diff --git a/test/test_torch.py b/test/test_torch.py index ad88128617c9..2d181c3b9400 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -6316,6 +6316,10 @@ def test_copy_broadcast(self, device) -> None: torch.uint8 ] +# _types2 adds bfloat16 type to _types only on ROCm. Should eventually be unified +# with _types when bfloat16 bringup is complete on all platforms. +_types2 = _types + [torch.bfloat16] if TEST_WITH_ROCM else _types + _float_types = [torch.half, torch.float, torch.double] _complex_types = [torch.cfloat, torch.cdouble] @@ -6597,14 +6601,10 @@ def inner(self, device, dtype): ('dot', '', _medium_1d, lambda t, d: [_medium_1d(t, d)], 1e-2, 1e-5, 1e-5, _float_types + _complex_types, _cpu_types, False), ('element_size', '', _medium_1d, lambda t, d: [], 1e-5, 1e-5, 1e-5, _float_types_no_half, _cpu_types, False), - ('eq', '', _small_3d_ones, lambda t, d: [_small_3d(t, d)], 1e-5, 1e-5, 1e-5, - torch.testing.get_all_dtypes(include_complex=False, include_bool=False)), - ('eq', 'equal', _small_3d_ones, lambda t, d: [_small_3d_ones(t, d)], 1e-5, 1e-5, 1e-5, - torch.testing.get_all_dtypes(include_complex=False, include_bool=False)), - ('ne', '', _small_3d_ones, lambda t, d: [_small_3d(t, d)], 1e-5, 1e-5, 1e-5, - torch.testing.get_all_dtypes(include_complex=False, include_bool=False)), - ('ne', 'equal', _small_3d_ones, lambda t, d: [_small_3d_ones(t, d)], 1e-5, 1e-5, 1e-5, - torch.testing.get_all_dtypes(include_complex=False, include_bool=False)), + ('eq', '', _small_3d_ones, lambda t, d: [_small_3d(t, d)], 1e-5, 1e-5, 1e-5, _types2), + ('eq', 'equal', _small_3d_ones, lambda t, d: [_small_3d_ones(t, d)], 1e-5, 1e-5, 1e-5, _types2), + ('ne', '', _small_3d_ones, lambda t, d: [_small_3d(t, d)], 1e-5, 1e-5, 1e-5, _types2), + ('ne', 'equal', _small_3d_ones, lambda t, d: [_small_3d_ones(t, d)], 1e-5, 1e-5, 1e-5, _types2), ('equal', 'equal', _small_3d_ones, lambda t, d: [_small_3d_ones(t, d)], 1e-5, 1e-5, 1e-5, _types, _cpu_types, False), ('equal', '', _small_3d_ones, lambda t, d: [_small_3d(t, d)], 1e-5, 1e-5, 1e-5, _types, _cpu_types, False), @@ -6618,14 +6618,10 @@ def inner(self, device, dtype): ('lcm', '', _small_3d, lambda t, d: [_small_3d(t, d)], 0, 0, 0, [torch.int16, torch.int32, torch.int64], [torch.int16, torch.int32, torch.int64], True, [onlyOnCPUAndCUDA]), - ('ge', '', _medium_2d, lambda t, d: [_medium_2d(t, d)], 1e-5, 1e-5, 1e-5, - torch.testing.get_all_dtypes(include_complex=False, include_bool=False)), - ('le', '', _medium_2d, lambda t, d: [_medium_2d(t, d)], 1e-5, 1e-5, 1e-5, - torch.testing.get_all_dtypes(include_complex=False, include_bool=False)), - ('gt', '', _medium_2d, lambda t, d: [_medium_2d(t, d)], 1e-5, 1e-5, 1e-5, - torch.testing.get_all_dtypes(include_complex=False, include_bool=False)), - ('lt', '', _medium_2d, lambda t, d: [_medium_2d(t, d)], 1e-5, 1e-5, 1e-5, - torch.testing.get_all_dtypes(include_complex=False, include_bool=False)), + ('ge', '', _medium_2d, lambda t, d: [_medium_2d(t, d)], 1e-5, 1e-5, 1e-5, _types2), + ('le', '', _medium_2d, lambda t, d: [_medium_2d(t, d)], 1e-5, 1e-5, 1e-5, _types2), + 
('gt', '', _medium_2d, lambda t, d: [_medium_2d(t, d)], 1e-5, 1e-5, 1e-5, _types2), + ('lt', '', _medium_2d, lambda t, d: [_medium_2d(t, d)], 1e-5, 1e-5, 1e-5, _types2), ('is_contiguous', '', _medium_2d, lambda t, d: [], 1e-5, 1e-5, 1e-5, _types, _cpu_types, False), # TODO: can't check negative case - cross-device copy is contiguous ('is_same_size', 'negative', _medium_2d, lambda t, d: [_small_3d(t, d)], @@ -6709,16 +6705,12 @@ def inner(self, device, dtype): torch.LongTensor([[1], [2]]).to(dtype=_convert_t(t, d), device=d), True], 1e-5, 1e-5, 1e-5, _types, _cpu_types, False), - ('prod', '', lambda t, d: _small_2d(t, d, oneish=True), lambda t, d: [], 1e-2, 1e-1, 1e-5, - torch.testing.get_all_dtypes(include_complex=False, include_bool=False), _cpu_types, False), - ('prod', 'dim', _small_3d, lambda t, d: [1], 1e-3, 1e-1, 1e-5, - torch.testing.get_all_dtypes(include_complex=False, include_bool=False), _cpu_types, False), - ('prod', 'neg_dim', _small_3d, lambda t, d: [-1], 1e-3, 1e-1, 1e-5, - torch.testing.get_all_dtypes(include_complex=False, include_bool=False), _cpu_types, False), - ('sum', '', _small_2d, lambda t, d: [], 1e-2, 1e-2, 1e-5, - torch.testing.get_all_dtypes(include_complex=False, include_bool=False), _cpu_types, False), - ('sum', 'dim', _small_3d, lambda t, d: [1], 1e-2, 1e-2, 1e-5, - torch.testing.get_all_dtypes(include_complex=False, include_bool=False), _cpu_types, False), + ('prod', '', lambda t, d: _small_2d(t, d, oneish=True), + lambda t, d: [], 1e-2, 1e-1, 1e-5, _types2, _cpu_types, False), + ('prod', 'dim', _small_3d, lambda t, d: [1], 1e-3, 1e-1, 1e-5, _types2, _cpu_types, False), + ('prod', 'neg_dim', _small_3d, lambda t, d: [-1], 1e-3, 1e-1, 1e-5, _types2, _cpu_types, False), + ('sum', '', _small_2d, lambda t, d: [], 1e-2, 1e-2, 1e-5, _types2, _cpu_types, False), + ('sum', 'dim', _small_3d, lambda t, d: [1], 1e-2, 1e-2, 1e-5, _types2, _cpu_types, False), ('sum', 'neg_dim', _small_3d, lambda t, d: [-1], 1e-2, 1e-5, 1e-5, _types, _cpu_types, False), ('sum', 'complex', _small_2d, lambda t, d: [], 1e-2, 1e-2, 1e-5, _complex_types, _cpu_types, False), ('sum', 'complex_dim', _small_3d, lambda t, d: [1], 1e-2, 1e-2, 1e-5, _complex_types, _cpu_types, False), From e2befb84bc8a832ddd4ed4aff623862e5c396e5e Mon Sep 17 00:00:00 2001 From: Jane Xu Date: Tue, 8 Dec 2020 07:47:01 -0800 Subject: [PATCH 035/250] minor README change to fix #25464 (#48970) Summary: Fixes https://github.com/pytorch/pytorch/issues/25464 Pull Request resolved: https://github.com/pytorch/pytorch/pull/48970 Reviewed By: walterddr Differential Revision: D25396284 Pulled By: janeyx99 fbshipit-source-id: 8355c417b5c8b8865f208d7d8e8154048423afd9 --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 195dffc09058..d29eacc28664 100644 --- a/README.md +++ b/README.md @@ -176,7 +176,7 @@ conda install numpy ninja pyyaml mkl mkl-include setuptools cmake cffi typing_ex On Linux ```bash # Add LAPACK support for the GPU if needed -conda install -c pytorch magma-cuda102 # or [ magma-cuda101 | magma-cuda100 | magma-cuda92 ] depending on your cuda version +conda install -c pytorch magma-cuda110 # or the magma-cuda* that matches your CUDA version from https://anaconda.org/pytorch/repo ``` On MacOS From 58c13cf685969bec4d49848399cd72c8f6834857 Mon Sep 17 00:00:00 2001 From: Rong Rong Date: Tue, 8 Dec 2020 07:51:35 -0800 Subject: [PATCH 036/250] Back out "Revert D25375885: [pytorch][PR] Reenable some BF16 tests on CUDA" Summary: Revert D25397144 
69829f3fff4d4a2d1a71bb52e90d3c7f16b27fa3 Test Plan: Revert Hammer Reviewed By: janeyx99 Differential Revision: D25397572 fbshipit-source-id: 625ca2a32e4558ae4582a15697b6e1cc57cc1573 --- test/test_tensor_creation_ops.py | 3 +-- test/test_torch.py | 44 +++++++++++++++++++------------- 2 files changed, 27 insertions(+), 20 deletions(-) diff --git a/test/test_tensor_creation_ops.py b/test/test_tensor_creation_ops.py index b355005b1c69..9be3e6db5bf0 100644 --- a/test/test_tensor_creation_ops.py +++ b/test/test_tensor_creation_ops.py @@ -14,7 +14,7 @@ IS_WINDOWS) from torch.testing._internal.common_device_type import ( instantiate_device_type_tests, deviceCountAtLeast, onlyOnCPUAndCUDA, - onlyCPU, skipCUDAIfNotRocm, largeTensorTest, precisionOverride, dtypes, + onlyCPU, largeTensorTest, precisionOverride, dtypes, onlyCUDA, skipCPUIf, dtypesIfCUDA, dtypesIfCPU) # TODO: refactor tri_tests_args, _compare_trilu_indices, run_additional_tri_tests @@ -2581,7 +2581,6 @@ def test_arange_device_vs_cpu(self, device, dtype): self.assertEqual(cpu_tensor, device_tensor) @onlyCUDA - @skipCUDAIfNotRocm def test_arange_bfloat16(self, device): ref_tensor = torch.tensor([0, 1, 2, 3], dtype=torch.bfloat16, device=device) bfloat16_tensor = torch.arange(0, 4, dtype=torch.bfloat16, device=device) diff --git a/test/test_torch.py b/test/test_torch.py index 2d181c3b9400..ad88128617c9 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -6316,10 +6316,6 @@ def test_copy_broadcast(self, device) -> None: torch.uint8 ] -# _types2 adds bfloat16 type to _types only on ROCm. Should eventually be unified -# with _types when bfloat16 bringup is complete on all platforms. -_types2 = _types + [torch.bfloat16] if TEST_WITH_ROCM else _types - _float_types = [torch.half, torch.float, torch.double] _complex_types = [torch.cfloat, torch.cdouble] @@ -6601,10 +6597,14 @@ def inner(self, device, dtype): ('dot', '', _medium_1d, lambda t, d: [_medium_1d(t, d)], 1e-2, 1e-5, 1e-5, _float_types + _complex_types, _cpu_types, False), ('element_size', '', _medium_1d, lambda t, d: [], 1e-5, 1e-5, 1e-5, _float_types_no_half, _cpu_types, False), - ('eq', '', _small_3d_ones, lambda t, d: [_small_3d(t, d)], 1e-5, 1e-5, 1e-5, _types2), - ('eq', 'equal', _small_3d_ones, lambda t, d: [_small_3d_ones(t, d)], 1e-5, 1e-5, 1e-5, _types2), - ('ne', '', _small_3d_ones, lambda t, d: [_small_3d(t, d)], 1e-5, 1e-5, 1e-5, _types2), - ('ne', 'equal', _small_3d_ones, lambda t, d: [_small_3d_ones(t, d)], 1e-5, 1e-5, 1e-5, _types2), + ('eq', '', _small_3d_ones, lambda t, d: [_small_3d(t, d)], 1e-5, 1e-5, 1e-5, + torch.testing.get_all_dtypes(include_complex=False, include_bool=False)), + ('eq', 'equal', _small_3d_ones, lambda t, d: [_small_3d_ones(t, d)], 1e-5, 1e-5, 1e-5, + torch.testing.get_all_dtypes(include_complex=False, include_bool=False)), + ('ne', '', _small_3d_ones, lambda t, d: [_small_3d(t, d)], 1e-5, 1e-5, 1e-5, + torch.testing.get_all_dtypes(include_complex=False, include_bool=False)), + ('ne', 'equal', _small_3d_ones, lambda t, d: [_small_3d_ones(t, d)], 1e-5, 1e-5, 1e-5, + torch.testing.get_all_dtypes(include_complex=False, include_bool=False)), ('equal', 'equal', _small_3d_ones, lambda t, d: [_small_3d_ones(t, d)], 1e-5, 1e-5, 1e-5, _types, _cpu_types, False), ('equal', '', _small_3d_ones, lambda t, d: [_small_3d(t, d)], 1e-5, 1e-5, 1e-5, _types, _cpu_types, False), @@ -6618,10 +6618,14 @@ def inner(self, device, dtype): ('lcm', '', _small_3d, lambda t, d: [_small_3d(t, d)], 0, 0, 0, [torch.int16, torch.int32, torch.int64], [torch.int16, 
torch.int32, torch.int64], True, [onlyOnCPUAndCUDA]), - ('ge', '', _medium_2d, lambda t, d: [_medium_2d(t, d)], 1e-5, 1e-5, 1e-5, _types2), - ('le', '', _medium_2d, lambda t, d: [_medium_2d(t, d)], 1e-5, 1e-5, 1e-5, _types2), - ('gt', '', _medium_2d, lambda t, d: [_medium_2d(t, d)], 1e-5, 1e-5, 1e-5, _types2), - ('lt', '', _medium_2d, lambda t, d: [_medium_2d(t, d)], 1e-5, 1e-5, 1e-5, _types2), + ('ge', '', _medium_2d, lambda t, d: [_medium_2d(t, d)], 1e-5, 1e-5, 1e-5, + torch.testing.get_all_dtypes(include_complex=False, include_bool=False)), + ('le', '', _medium_2d, lambda t, d: [_medium_2d(t, d)], 1e-5, 1e-5, 1e-5, + torch.testing.get_all_dtypes(include_complex=False, include_bool=False)), + ('gt', '', _medium_2d, lambda t, d: [_medium_2d(t, d)], 1e-5, 1e-5, 1e-5, + torch.testing.get_all_dtypes(include_complex=False, include_bool=False)), + ('lt', '', _medium_2d, lambda t, d: [_medium_2d(t, d)], 1e-5, 1e-5, 1e-5, + torch.testing.get_all_dtypes(include_complex=False, include_bool=False)), ('is_contiguous', '', _medium_2d, lambda t, d: [], 1e-5, 1e-5, 1e-5, _types, _cpu_types, False), # TODO: can't check negative case - cross-device copy is contiguous ('is_same_size', 'negative', _medium_2d, lambda t, d: [_small_3d(t, d)], @@ -6705,12 +6709,16 @@ def inner(self, device, dtype): torch.LongTensor([[1], [2]]).to(dtype=_convert_t(t, d), device=d), True], 1e-5, 1e-5, 1e-5, _types, _cpu_types, False), - ('prod', '', lambda t, d: _small_2d(t, d, oneish=True), - lambda t, d: [], 1e-2, 1e-1, 1e-5, _types2, _cpu_types, False), - ('prod', 'dim', _small_3d, lambda t, d: [1], 1e-3, 1e-1, 1e-5, _types2, _cpu_types, False), - ('prod', 'neg_dim', _small_3d, lambda t, d: [-1], 1e-3, 1e-1, 1e-5, _types2, _cpu_types, False), - ('sum', '', _small_2d, lambda t, d: [], 1e-2, 1e-2, 1e-5, _types2, _cpu_types, False), - ('sum', 'dim', _small_3d, lambda t, d: [1], 1e-2, 1e-2, 1e-5, _types2, _cpu_types, False), + ('prod', '', lambda t, d: _small_2d(t, d, oneish=True), lambda t, d: [], 1e-2, 1e-1, 1e-5, + torch.testing.get_all_dtypes(include_complex=False, include_bool=False), _cpu_types, False), + ('prod', 'dim', _small_3d, lambda t, d: [1], 1e-3, 1e-1, 1e-5, + torch.testing.get_all_dtypes(include_complex=False, include_bool=False), _cpu_types, False), + ('prod', 'neg_dim', _small_3d, lambda t, d: [-1], 1e-3, 1e-1, 1e-5, + torch.testing.get_all_dtypes(include_complex=False, include_bool=False), _cpu_types, False), + ('sum', '', _small_2d, lambda t, d: [], 1e-2, 1e-2, 1e-5, + torch.testing.get_all_dtypes(include_complex=False, include_bool=False), _cpu_types, False), + ('sum', 'dim', _small_3d, lambda t, d: [1], 1e-2, 1e-2, 1e-5, + torch.testing.get_all_dtypes(include_complex=False, include_bool=False), _cpu_types, False), ('sum', 'neg_dim', _small_3d, lambda t, d: [-1], 1e-2, 1e-5, 1e-5, _types, _cpu_types, False), ('sum', 'complex', _small_2d, lambda t, d: [], 1e-2, 1e-2, 1e-5, _complex_types, _cpu_types, False), ('sum', 'complex_dim', _small_3d, lambda t, d: [1], 1e-2, 1e-2, 1e-5, _complex_types, _cpu_types, False), From c29f51642ecc41da3a6a40e7685ef4decce8bbf3 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Tue, 8 Dec 2020 07:52:52 -0800 Subject: [PATCH 037/250] Modify NEON check for ARM64 on OS X (#48982) Summary: Use CMAKE_SYSTEM_PROCESSOR rather than run sysctl Fixes https://github.com/pytorch/pytorch/issues/48874 Pull Request resolved: https://github.com/pytorch/pytorch/pull/48982 Reviewed By: walterddr Differential Revision: D25385883 Pulled By: malfet fbshipit-source-id: 
47b6dc5be8d75f6d4a66a11c564abdfe31ac90b4 --- cmake/Modules/FindARM.cmake | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cmake/Modules/FindARM.cmake b/cmake/Modules/FindARM.cmake index bd68f5f36735..acd00cfa6772 100644 --- a/cmake/Modules/FindARM.cmake +++ b/cmake/Modules/FindARM.cmake @@ -41,9 +41,7 @@ IF(CMAKE_SYSTEM_NAME MATCHES "Linux") ENDIF (OMAP4_TRUE) ELSEIF(CMAKE_SYSTEM_NAME MATCHES "Darwin") - EXEC_PROGRAM("/usr/sbin/sysctl -n hw.optional.arm64" OUTPUT_VARIABLE - IS_ARM64) - IF(IS_ARM64 STREQUAL "1") + IF(CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64") set(NEON_FOUND true CACHE BOOL "NEON available on ARM64") ENDIF() EXEC_PROGRAM("/usr/sbin/sysctl -n machdep.cpu.features" OUTPUT_VARIABLE From ad3fed8b90a67e1180c84a1484ddf5c99fe9fce0 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Tue, 8 Dec 2020 08:56:53 -0800 Subject: [PATCH 038/250] [BE] Fix signed-unsigned warnings (#48848) Summary: Switch to range loops when possible Replace `ptrdiff_t`(signed type) with `size_t`(unsigned type) Pull Request resolved: https://github.com/pytorch/pytorch/pull/48848 Reviewed By: walterddr Differential Revision: D25376591 Pulled By: malfet fbshipit-source-id: 9835f83b7a17b6acc20731cc89c1c11c2aa01a78 --- aten/src/ATen/BatchingRegistrations.cpp | 4 ++-- aten/src/ATen/NamedTensorUtils.cpp | 4 ++-- aten/src/ATen/TensorIterator.cpp | 4 ++-- aten/src/ATen/TensorNames.cpp | 4 ++-- aten/src/ATen/native/Convolution.cpp | 8 ++------ aten/src/ATen/native/ForeachOpsKernels.cpp | 2 +- aten/src/TH/generic/THStorage.cpp | 3 +-- aten/src/TH/generic/THStorageCopy.cpp | 5 ++--- caffe2/serialize/crc_alt.h | 8 ++++---- 9 files changed, 18 insertions(+), 24 deletions(-) diff --git a/aten/src/ATen/BatchingRegistrations.cpp b/aten/src/ATen/BatchingRegistrations.cpp index 16470f39ad54..0f9b31efefb9 100644 --- a/aten/src/ATen/BatchingRegistrations.cpp +++ b/aten/src/ATen/BatchingRegistrations.cpp @@ -941,8 +941,8 @@ Tensor new_empty_strided_batching_rule( size.size(), ") must match dimensionality of strides (", stride.size(), ")"); auto storage_size = native::storage_size_for(size, stride); - for (int64_t idx = 0; idx < physical_strides.size(); ++idx) { - physical_strides[idx] *= storage_size; + for (auto& physical_stride : physical_strides) { + physical_stride *= storage_size; } // physical_strides = [B1 * B2 * S, B2 * S, S] + strides diff --git a/aten/src/ATen/NamedTensorUtils.cpp b/aten/src/ATen/NamedTensorUtils.cpp index 668838877123..5f8de486dc78 100644 --- a/aten/src/ATen/NamedTensorUtils.cpp +++ b/aten/src/ATen/NamedTensorUtils.cpp @@ -264,11 +264,11 @@ static std::vector compute_dot_product_outnames( } std::vector outnames(num_outnames, Dimname::wildcard()); int64_t index = 0; - for (int64_t j = 0; j < tensor_names.size(); ++j) { + for (size_t j = 0; j < tensor_names.size(); ++j) { if (j == tensor_dotted_dim) continue; outnames[index++] = tensor_names[j]; } - for (int64_t j = 0; j < other_names.size(); ++j) { + for (size_t j = 0; j < other_names.size(); ++j) { if (j == other_dotted_dim) continue; outnames[index++] = other_names[j]; } diff --git a/aten/src/ATen/TensorIterator.cpp b/aten/src/ATen/TensorIterator.cpp index 43acc9a070d5..0f18d941feff 100644 --- a/aten/src/ATen/TensorIterator.cpp +++ b/aten/src/ATen/TensorIterator.cpp @@ -939,8 +939,8 @@ TensorIterator TensorIterator::reduce_op(Tensor& out1, Tensor& out2, const Tenso } void TensorIteratorBase::populate_operands(TensorIteratorConfig& config) { - for (int i = 0; i < config.tensors_.size(); i++) { - 
operands_.emplace_back(std::move(config.tensors_[i])); + for (auto& tensor: config.tensors_) { + operands_.emplace_back(std::move(tensor)); } num_outputs_ = config.num_outputs_; } diff --git a/aten/src/ATen/TensorNames.cpp b/aten/src/ATen/TensorNames.cpp index 844ff4ba2bad..a7dc0bd68036 100644 --- a/aten/src/ATen/TensorNames.cpp +++ b/aten/src/ATen/TensorNames.cpp @@ -61,10 +61,10 @@ TensorNames::TensorNames(ArrayRef names, int64_t start, int64_t end) { } TensorNames& TensorNames::unifyFromRightInplace(const TensorNames& other, const char* op_name) { - int64_t size_diff = std::labs(names_.size() - other.names_.size()); + size_t size_diff = std::labs(names_.size() - other.names_.size()); if (names_.size() > other.names_.size()) { - for (int64_t idx = size_diff; idx < names_.size(); ++idx) { + for (size_t idx = size_diff; idx < names_.size(); ++idx) { names_[idx] = names_[idx].unify(other.names_[idx - size_diff], op_name); } } else { diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp index 6dbf1e5535ed..d55c4fca6027 100644 --- a/aten/src/ATen/native/Convolution.cpp +++ b/aten/src/ATen/native/Convolution.cpp @@ -177,14 +177,10 @@ auto ConvParams::needs_64bit_indexing_no_split(const at::Tensor& input, const at int64_t outsize = 1; if (transposed) { std::vector o = conv_input_size(input.sizes(), weight.sizes(), padding, output_padding, stride, dilation, groups); - for (int64_t i = 1; i < o.size(); i++) { - outsize *= o[i]; - } + outsize = prod_intlist(o.begin() + 1, o.end()); } else { std::vector o = conv_output_size(input.sizes(), weight.sizes(), padding, stride, dilation); - for (int64_t i = 1; i < o.size(); i++) { - outsize *= o[i]; - } + outsize = prod_intlist(o.begin() + 1, o.end()); } return outsize > int_max; } diff --git a/aten/src/ATen/native/ForeachOpsKernels.cpp b/aten/src/ATen/native/ForeachOpsKernels.cpp index 24ab10b25f84..90006c74346d 100644 --- a/aten/src/ATen/native/ForeachOpsKernels.cpp +++ b/aten/src/ATen/native/ForeachOpsKernels.cpp @@ -204,7 +204,7 @@ std::vector foreach_tensor_##NAME##_slow(TensorList tensors1, TensorList \ std::vector result; \ result.reserve(tensors1.size()); \ - for (int i = 0; i < tensors1.size(); i++) { \ + for (size_t i = 0; i < tensors1.size(); i++) { \ result.emplace_back(at::NAME(tensors1[i], tensors2[i])); \ } \ \ diff --git a/aten/src/TH/generic/THStorage.cpp b/aten/src/TH/generic/THStorage.cpp index 2db795719557..a085f31c740f 100644 --- a/aten/src/TH/generic/THStorage.cpp +++ b/aten/src/TH/generic/THStorage.cpp @@ -115,10 +115,9 @@ void THStorage_(resizeBytes)(THStorage* storage, ptrdiff_t size_bytes) { void THStorage_(fill)(THStorage *storage, scalar_t value) { - ptrdiff_t i; auto type_meta = caffe2::TypeMeta::Make(); size_t numel = storage->nbytes() / type_meta.itemsize(); - for (i = 0; i < numel; i++) + for (size_t i = 0; i < numel; i++) THStorage_(data)(storage)[i] = value; } diff --git a/aten/src/TH/generic/THStorageCopy.cpp b/aten/src/TH/generic/THStorageCopy.cpp index dc19deea7652..2d6ec8a05eb6 100644 --- a/aten/src/TH/generic/THStorageCopy.cpp +++ b/aten/src/TH/generic/THStorageCopy.cpp @@ -8,7 +8,7 @@ void THStorage_(copy)(THStorage *storage, THStorage *src) scalar_t *scalar_src = THStorage_(data)(src); scalar_t *data = THStorage_(data)(storage); uint64_t numel = storage->nbytes() / sizeof(scalar_t); - for (ptrdiff_t i = 0; i < numel; ++i) { + for (uint64_t i = 0; i < numel; ++i) { data[i] = scalar_src[i]; } } @@ -19,11 +19,10 @@ void THStorage_(copy)(THStorage *storage, THStorage *src) #define 
IMPLEMENT_THStorage_COPY(TYPENAMESRC) \ void THStorage_(copy##TYPENAMESRC)( \ THStorage * storage, TH##TYPENAMESRC##Storage * src) { \ - ptrdiff_t i; \ auto data = THStorage_(data)(storage); \ auto src_data = TH##TYPENAMESRC##Storage_data(src); \ uint64_t numel = storage->nbytes() / sizeof(scalar_t); \ - for (i = 0; i < numel; i++) \ + for (uint64_t i = 0; i < numel; i++) \ data[i] = static_cast(src_data[i]); \ } diff --git a/caffe2/serialize/crc_alt.h b/caffe2/serialize/crc_alt.h index be51083fec0e..e7c986ff89fb 100644 --- a/caffe2/serialize/crc_alt.h +++ b/caffe2/serialize/crc_alt.h @@ -680,12 +680,12 @@ uint32_t crc32_combine(uint32_t crcA, uint32_t crcB, size_t lengthB) // put operator for one zero bit in odd odd[0] = Polynomial; // CRC-32 polynomial - for (int i = 1; i < CrcBits; i++) + for (uint32_t i = 1; i < CrcBits; i++) odd[i] = 1 << (i - 1); // put operator for two zero bits in even // same as gf2_matrix_square(even, odd); - for (int i = 0; i < CrcBits; i++) + for (uint32_t i = 0; i < CrcBits; i++) { uint32_t vec = odd[i]; even[i] = 0; @@ -695,7 +695,7 @@ uint32_t crc32_combine(uint32_t crcA, uint32_t crcB, size_t lengthB) } // put operator for four zero bits in odd // same as gf2_matrix_square(odd, even); - for (int i = 0; i < CrcBits; i++) + for (uint32_t i = 0; i < CrcBits; i++) { uint32_t vec = even[i]; odd[i] = 0; @@ -711,7 +711,7 @@ uint32_t crc32_combine(uint32_t crcA, uint32_t crcB, size_t lengthB) for (; lengthB > 0; lengthB >>= 1) { // same as gf2_matrix_square(a, b); - for (int i = 0; i < CrcBits; i++) + for (uint32_t i = 0; i < CrcBits; i++) { uint32_t vec = b[i]; a[i] = 0; From 274ce26fd86b277f87ba4de4e397f398a7e715af Mon Sep 17 00:00:00 2001 From: Bram Wasti Date: Tue, 8 Dec 2020 09:30:32 -0800 Subject: [PATCH 039/250] [static runtime] Add Internal Ops to the registry (#48616) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48616 This adds a couple of _out variants and then registers them to the registry. I also added the concept of "canReuse{Input,Output}" so that we can annotate tensors that are not optimizable (specifically, non-float tensors). 
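To illustrate the new flags, a minimal sketch of a registration using the `REGISTER_OPERATOR_FUNCTOR_OPT` macro added in `ops.h` below. The operator name `aten::foo` and the empty kernel body are placeholders for illustration and are not part of this patch; only the macro shape and the `SROperator`/`Node*` types come from the change itself.

```cpp
// Sketch only, not from the patch: register a hypothetical out-variant kernel.
// reuse_inp / reuse_out feed CanReuseInput() / CanReuseOutput(), which
// GetOptimizableValues() consults when deciding which tensors the static
// runtime's memory planner may recycle.
REGISTER_OPERATOR_FUNCTOR_OPT(
    aten::foo,            // hypothetical operator name
    aten_foo,             // identifier used to name the generated functor
    /*reuse_inp=*/true,
    /*reuse_out=*/false,  // e.g. a non-float output that must not be reused
    [](Node* n) -> SROperator {
      // A real registration returns the lambda that invokes the _out variant
      // of the kernel; the body is elided in this sketch.
      return SROperator();
    });
```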
In the future we can change this (with this D25062301) after removing `RecordFunction`, we see these results ``` BS=20 --- caffe2: 0.651617 ~ 0.666354 static runtime: 0.753481 pytorch: 0.866658 BS=1 --- caffe2: 0.0858684 ~ 0.08633 static runtime: 0.209897 pytorch: 0.232694 ``` Test Plan: standard internal test of ads model against caffe2 reference (see the scripts in this quip: https://fb.quip.com/ztERAYjuzdlr) Reviewed By: hlu1 Differential Revision: D25066823 fbshipit-source-id: 25ca181c62209a4c4304f7fe73832b13e314df80 --- torch/csrc/jit/runtime/static/impl.cpp | 31 ++++++++++++------- torch/csrc/jit/runtime/static/ops.cpp | 18 +++++++---- torch/csrc/jit/runtime/static/ops.h | 43 +++++++++++++++++++++----- 3 files changed, 66 insertions(+), 26 deletions(-) diff --git a/torch/csrc/jit/runtime/static/impl.cpp b/torch/csrc/jit/runtime/static/impl.cpp index dabc19dfc4fe..35657b4ba791 100644 --- a/torch/csrc/jit/runtime/static/impl.cpp +++ b/torch/csrc/jit/runtime/static/impl.cpp @@ -163,23 +163,30 @@ LivenessMap(const std::shared_ptr& graph) { std::unordered_set GetOptimizableValues( const std::shared_ptr& graph) { - std::unordered_set is_out_of_place; - std::unordered_set is_not_out_of_place; + std::unordered_set can_reuse; + // values used by unsupported ops (as either inputs or outputs) + // these need to be removed from "can_reuse" after analyzing all nodes + std::unordered_set cannot_reuse; for (const auto& n : graph->nodes()) { - for (const auto& container : {n->inputs(), n->outputs()}) { - for (const auto& v : container) { - if (canRunOutOfPlace(n)) { - is_out_of_place.insert(v); - } else { - is_not_out_of_place.insert(v); - } + for (const auto& v : n->inputs()) { + if (canRunOutOfPlace(n) && canReuseInputs(n)) { + can_reuse.insert(v); + } else { + cannot_reuse.insert(v); + } + } + for (const auto& v : n->outputs()) { + if (canRunOutOfPlace(n) && canReuseOutputs(n)) { + can_reuse.insert(v); + } else { + cannot_reuse.insert(v); } } } - for (auto v : is_not_out_of_place) { - is_out_of_place.erase(v); + for (auto v : cannot_reuse) { + can_reuse.erase(v); } - return is_out_of_place; + return can_reuse; } size_t AssignRegisters( diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp index ab0640bf75f1..7ddc2f5106bc 100644 --- a/torch/csrc/jit/runtime/static/ops.cpp +++ b/torch/csrc/jit/runtime/static/ops.cpp @@ -6,12 +6,6 @@ namespace torch { namespace jit { -namespace { -inline at::Tensor create_empty_from(const at::Tensor& t) { - return at::empty({0}, t.options()); -} -} // namespace - C10_DEFINE_REGISTRY(SROperatorRegistry, SROperatorFunctor); bool canRunOutOfPlace(Node* n) { @@ -19,6 +13,18 @@ bool canRunOutOfPlace(Node* n) { return SROperatorRegistry()->Has(op_name); } +bool canReuseInputs(Node* n) { + auto op_name = std::string(n->kind().toQualString()); + DCHECK(SROperatorRegistry()->Has(op_name)); + return SROperatorRegistry()->Create(op_name)->CanReuseInput(); +} + +bool canReuseOutputs(Node* n) { + auto op_name = std::string(n->kind().toQualString()); + DCHECK(SROperatorRegistry()->Has(op_name)); + return SROperatorRegistry()->Create(op_name)->CanReuseOutput(); +} + // TODO: expand to include all view producing ops, mostly in // https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/TensorShape.cpp bool canRunNatively(Node* n) { diff --git a/torch/csrc/jit/runtime/static/ops.h b/torch/csrc/jit/runtime/static/ops.h index dabff008aa20..467dac282668 100644 --- a/torch/csrc/jit/runtime/static/ops.h +++ 
b/torch/csrc/jit/runtime/static/ops.h @@ -10,21 +10,48 @@ using SROperator = std::function&)>; using SROpFunctor = std::function; struct SROperatorFunctor { - virtual SROperator Generate(Node*) = 0; + virtual SROperator Generate(Node*) { + std::function&)> out; + return out; + } + virtual bool CanReuseInput() { + return false; + } + virtual bool CanReuseOutput() { + return false; + } virtual ~SROperatorFunctor() = default; }; C10_DECLARE_REGISTRY(SROperatorRegistry, SROperatorFunctor); -#define REGISTER_OPERATOR_FUNCTOR(name, id, ...) \ - struct SROperatorFunctor_##id : public SROperatorFunctor { \ - const SROpFunctor fn = __VA_ARGS__; \ - SROperator Generate(Node* n) override { \ - return fn(n); \ - } \ - }; \ + +// TODO: reuse_inp reuse_out can be deprecated with further analysis +// try to avoid this API. +#define REGISTER_OPERATOR_FUNCTOR_OPT(name, id, reuse_inp, reuse_out, ...) \ + struct SROperatorFunctor_##id : public SROperatorFunctor { \ + const SROpFunctor fn = __VA_ARGS__; \ + bool CanReuseInput() override { \ + return reuse_inp; \ + } \ + bool CanReuseOutput() override { \ + return reuse_out; \ + } \ + SROperator Generate(Node* n) override { \ + return fn(n); \ + } \ + }; \ C10_REGISTER_CLASS(SROperatorRegistry, name, SROperatorFunctor_##id); +#define REGISTER_OPERATOR_FUNCTOR(name, id, ...) \ + REGISTER_OPERATOR_FUNCTOR_OPT(name, id, true, true, __VA_ARGS__) + +inline at::Tensor create_empty_from(const at::Tensor& t) { + return at::empty({0}, t.options()); +} + bool canRunOutOfPlace(Node* n); +bool canReuseInputs(Node* n); +bool canReuseOutputs(Node* n); std::function&)> getOutOfPlaceOperation(Node* n); From b0e919cf60c5e94c38ce64c6308a6d58896d0289 Mon Sep 17 00:00:00 2001 From: Jokeren Date: Tue, 8 Dec 2020 10:13:05 -0800 Subject: [PATCH 040/250] Avoid initializing gradInput twice in the backward phase of replication (#48890) Summary: https://github.com/pytorch/pytorch/issues/48889 Pull Request resolved: https://github.com/pytorch/pytorch/pull/48890 Reviewed By: zhangguanheng66 Differential Revision: D25375697 Pulled By: ezyang fbshipit-source-id: fd6f6089be44e68c4557b923550c7cadb90d739a --- aten/src/ATen/native/cuda/ReplicationPadding.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/aten/src/ATen/native/cuda/ReplicationPadding.cu b/aten/src/ATen/native/cuda/ReplicationPadding.cu index c80a98ddf13b..8f164c8476f7 100644 --- a/aten/src/ATen/native/cuda/ReplicationPadding.cu +++ b/aten/src/ATen/native/cuda/ReplicationPadding.cu @@ -793,7 +793,7 @@ Tensor replication_pad1d_backward_cuda( // See Note [Writing Nondeterministic Operations] // Nondeterministic because of atomicAdd usage globalContext().alertNotDeterministic("replication_pad1d_backward_cuda"); - auto gradInput = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + auto gradInput = at::empty_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); replication_pad1d_backward_out_cuda_template( gradInput, gradOutput, input, paddingSize); return gradInput; @@ -841,7 +841,7 @@ Tensor replication_pad2d_backward_cuda( // See Note [Writing Nondeterministic Operations] // Nondeterministic because of atomicAdd usage globalContext().alertNotDeterministic("replication_pad2d_backward_cuda"); - auto gradInput = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + auto gradInput = at::empty_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); replication_pad2d_backward_out_cuda_template( gradInput, gradOutput, input, paddingSize); return gradInput; @@ -889,7 +889,7 @@ Tensor replication_pad3d_backward_cuda( // See 
Note [Writing Nondeterministic Operations] // Nondeterministic because of atomicAdd usage globalContext().alertNotDeterministic("replication_pad3d_backward_cuda"); - auto gradInput = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + auto gradInput = at::empty_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); replication_pad3d_backward_out_cuda_template( gradInput, gradOutput, input, paddingSize); return gradInput; From b89c32849352bf2fbb8f49749aa2fb0305a38c96 Mon Sep 17 00:00:00 2001 From: Rong Rong Date: Tue, 8 Dec 2020 10:34:02 -0800 Subject: [PATCH 041/250] Add fftw3 cmake as alternative for FFT/DFT (#48808) Summary: added cmake discovery in Dependencies.cmake for fftw3. Pull Request resolved: https://github.com/pytorch/pytorch/pull/48808 Reviewed By: janeyx99 Differential Revision: D25375320 Pulled By: walterddr fbshipit-source-id: cde3afc51eef9c621c7d19be7ad7573fc8b838c2 --- aten/src/ATen/Config.h.in | 1 + cmake/Dependencies.cmake | 15 +++++++++++++++ cmake/Summary.cmake | 1 + 3 files changed, 17 insertions(+) diff --git a/aten/src/ATen/Config.h.in b/aten/src/ATen/Config.h.in index 58c06c63535d..38326491bed8 100644 --- a/aten/src/ATen/Config.h.in +++ b/aten/src/ATen/Config.h.in @@ -8,6 +8,7 @@ #define AT_MKLDNN_ENABLED() @AT_MKLDNN_ENABLED@ #define AT_MKL_ENABLED() @AT_MKL_ENABLED@ +#define AT_FFTW_ENABLED() @AT_FFTW_ENABLED@ #define AT_NNPACK_ENABLED() @AT_NNPACK_ENABLED@ #define CAFFE2_STATIC_LINK_CUDA() @CAFFE2_STATIC_LINK_CUDA_INT@ #define AT_BUILD_WITH_BLAS() @USE_BLAS@ diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index c0e54450b409..2f9ff160763b 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -195,6 +195,21 @@ elseif(INTERN_USE_EIGEN_BLAS) list(APPEND Caffe2_DEPENDENCY_LIBS eigen_blas) endif() +# ---[ FFTW +set(AT_FFTW_ENABLED 0) +set(USE_FFTW OFF) +if(USE_FFTW OR NOT MKL_FOUND) + find_library(LIBFFTW3 fftw3) + if(LIBFFTW3) + find_path(FFTW3_INCLUDE_DIR NAMES fftw3.h ONLY_CMAKE_FIND_ROOT_PATH) + if(FFTW3_INCLUDE_DIR) + SET(AT_FFTW_ENABLED 1) + SET(USE_FFTW ON) + include_directories(${FFTW3_INCLUDE_DIR}) + endif() + endif() +endif() + # ---[ Dependencies # NNPACK and family (QNNPACK, PYTORCH_QNNPACK, and XNNPACK) can download and # compile their dependencies in isolation as part of their build. 
These dependencies diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake index 92015c269083..dd9523d1b3fb 100644 --- a/cmake/Summary.cmake +++ b/cmake/Summary.cmake @@ -117,6 +117,7 @@ function(caffe2_print_configuration_summary) endif() message(STATUS " USE_METAL : ${USE_METAL}") message(STATUS " USE_PYTORCH_METAL : ${USE_PYTORCH_METAL}") + message(STATUS " USE_FFTW : ${USE_FFTW}") message(STATUS " USE_MKL : ${CAFFE2_USE_MKL}") message(STATUS " USE_MKLDNN : ${USE_MKLDNN}") if(${CAFFE2_USE_MKLDNN}) From c92c8598a330014c94a4fad3785e08db3a67b239 Mon Sep 17 00:00:00 2001 From: James Reed Date: Tue, 8 Dec 2020 11:13:05 -0800 Subject: [PATCH 042/250] [FX][2/2] Make docstrings pretty when rendered (#48871) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48871 Test Plan: Imported from OSS Reviewed By: ansley Differential Revision: D25351588 Pulled By: jamesr66a fbshipit-source-id: 4c6fd341100594c204a35d6a3aab756e3e22297b --- torch/fx/graph.py | 21 +-------- torch/fx/node.py | 94 ++++++++++++++++++++++++++++++-------- torch/fx/symbolic_trace.py | 72 ++++++++++++++++++++++------- 3 files changed, 132 insertions(+), 55 deletions(-) diff --git a/torch/fx/graph.py b/torch/fx/graph.py index 072aef6e3b93..ca4b8d64bb0e 100644 --- a/torch/fx/graph.py +++ b/torch/fx/graph.py @@ -148,26 +148,7 @@ def forward(self, x): %topk_1 : [#users=1] = call_function[target=](args = (%sum_1, 3), kwargs = {}) # noqa: B950 return topk_1 - The Node semantics are as follows: - - - ``placeholder`` represents a function input. The ``name`` attribute specifies the name this value will take on. - ``target`` is similarly the name of the argument. ``args`` holds either: 1) nothing, or 2) a single argument - denoting the default parameter of the function input. ``kwargs`` is don't-care. Placeholders correspond to - the function parameters (e.g. ``x``) in the graph printout. - - ``get_attr`` retrieves a parameter from the module hierarchy. ``name`` is similarly the name the result of the - fetch is assigned to. ``target`` is the fully-qualified name of the parameter's position in the module hierarchy. - ``args`` and ``kwargs`` are don't-care - - ``call_function`` applies a free function to some values. ``name`` is similarly the name of the value to assign - to. ``target`` is the function to be applied. ``args`` and ``kwargs`` represent the arguments to the function, - following the Python calling convention - - ``call_module`` applies a module in the module hierarchy's ``forward()`` method to given arguments. ``name`` is - as previous. ``target`` is the fully-qualified name of the module in the module hierarchy to call. - ``args`` and ``kwargs`` represent the arguments to invoke the module on, *including the self argument*. - - ``call_method`` calls a method on a value. ``name`` is as similar. ``target`` is the string name of the method - to apply to the ``self`` argument. ``args`` and ``kwargs`` represent the arguments to invoke the module on, - *including the self argument* - - ``output`` contains the output of the traced function in its ``args[0]`` attribute. This corresponds to the "return" statement - in the Graph printout. + For the semantics of operations represented in the ``Graph``, please see :class:`Node`. 
""" def __init__(self): """ diff --git a/torch/fx/node.py b/torch/fx/node.py index 1cc94be83e7e..fd8a4bc1377c 100644 --- a/torch/fx/node.py +++ b/torch/fx/node.py @@ -21,8 +21,34 @@ ]] class Node: - def __init__(self, graph: 'Graph', name: str, op: str, target: Target, - args: Tuple[Argument, ...], kwargs: Dict[str, Argument], + """ + ``Node`` is the data structure that represents individual operations within + a ``Graph``. For the most part, Nodes represent callsites to various entities, + such as operators, methods, and Modules (some exceptions include nodes that + specify function inputs and outputs). Each ``Node`` has a function specified + by its ``op`` property. The ``Node`` semantics for each value of ``op`` are as follows: + + - ``placeholder`` represents a function input. The ``name`` attribute specifies the name this value will take on. + ``target`` is similarly the name of the argument. ``args`` holds either: 1) nothing, or 2) a single argument + denoting the default parameter of the function input. ``kwargs`` is don't-care. Placeholders correspond to + the function parameters (e.g. ``x``) in the graph printout. + - ``get_attr`` retrieves a parameter from the module hierarchy. ``name`` is similarly the name the result of the + fetch is assigned to. ``target`` is the fully-qualified name of the parameter's position in the module hierarchy. + ``args`` and ``kwargs`` are don't-care + - ``call_function`` applies a free function to some values. ``name`` is similarly the name of the value to assign + to. ``target`` is the function to be applied. ``args`` and ``kwargs`` represent the arguments to the function, + following the Python calling convention + - ``call_module`` applies a module in the module hierarchy's ``forward()`` method to given arguments. ``name`` is + as previous. ``target`` is the fully-qualified name of the module in the module hierarchy to call. + ``args`` and ``kwargs`` represent the arguments to invoke the module on, *including the self argument*. + - ``call_method`` calls a method on a value. ``name`` is as similar. ``target`` is the string name of the method + to apply to the ``self`` argument. ``args`` and ``kwargs`` represent the arguments to invoke the module on, + *including the self argument* + - ``output`` contains the output of the traced function in its ``args[0]`` attribute. This corresponds to the "return" statement + in the Graph printout. + """ + def __init__(self, graph: 'Graph', name: str, op: str, target: 'Target', + args: Tuple['Argument', ...], kwargs: Dict[str, 'Argument'], type : Optional[Any] = None) -> None: self.graph = graph self.name = name # unique name of value being created @@ -60,23 +86,33 @@ def __init__(self, graph: 'Graph', name: str, op: str, target: Target, @property def next(self) -> 'Node': """ - Get the next node in the linked list + Returns the next ``Node`` in the linked list of Nodes. + + Returns: + + The next ``Node`` in the linked list of Nodes. """ return self._next @property def prev(self) -> 'Node': """ - Get the previous node in the linked list + Returns the previous ``Node`` in the linked list of Nodes. + + Returns: + + The previous ``Node`` in the linked list of Nodes. """ return self._prev - def prepend(self, x: 'Node'): - """Insert x before this node in the list of nodes in the graph. - Before: p -> self - bx -> x -> ax - After: p -> x -> self - bx -> ax + def prepend(self, x: 'Node') -> None: + """ + Insert x before this node in the list of nodes in the graph. 
Example:: + + Before: p -> self + bx -> x -> ax + After: p -> x -> self + bx -> ax Args: x (Node): The node to put before this node. Must be a member of the same graph. @@ -87,8 +123,9 @@ def prepend(self, x: 'Node'): p._next, x._prev = x, p x._next, self._prev = self, x - def append(self, x: 'Node'): - """Insert x after this node in the list of nodes in the graph. + def append(self, x: 'Node') -> None: + """ + Insert x after this node in the list of nodes in the graph. Equvalent to ``self.next.prepend(x)`` Args: @@ -103,9 +140,12 @@ def _remove_from_list(self): @property def args(self) -> Tuple[Argument, ...]: """ - Return the tuple of arguments to this Node. The interpretation of arguments - depends on the node's opcode. See the ``fx.Graph`` docstring for more + The tuple of arguments to this ``Node``. The interpretation of arguments + depends on the node's opcode. See the :class:`Node` docstring for more information. + + Assignment to this property is allowed. All accounting of uses and users + is updated automatically on assignment. """ return self._args @@ -121,9 +161,12 @@ def args(self, a : Tuple[Argument, ...]): @property def kwargs(self) -> Dict[str, Argument]: """ - Return the dict of kwargs to this Node. The interpretation of arguments - depends on the node's opcode. See the ``fx.Graph`` docstring for more + The dict of keyword arguments to this ``Node``. The interpretation of arguments + depends on the node's opcode. See the :class:`Node` docstring for more information. + + Assignment to this property is allowed. All accounting of uses and users + is updated automatically on assignment. """ return self._kwargs @@ -141,7 +184,12 @@ def all_input_nodes(self) -> List['Node']: """ Return all Nodes that are inputs to this Node. This is equivalent to iterating over ``args`` and ``kwargs`` and only collecting the values that - are Nodes + are Nodes. + + Returns: + + List of ``Nodes`` that appear in the ``args`` and ``kwargs`` of this + ``Node``, in that order. """ all_nodes : List['Node'] = [] map_arg(self.args, lambda n: all_nodes.append(n)) @@ -149,6 +197,9 @@ def all_input_nodes(self) -> List['Node']: return all_nodes def _update_args_kwargs(self, new_args : Tuple[Argument, ...], new_kwargs : Dict[str, Argument]): + """ + This API is internal. Do *not* call it directly. + """ self._args = new_args self._kwargs = new_kwargs @@ -168,7 +219,14 @@ def __repr__(self) -> str: def replace_all_uses_with(self, replace_with : 'Node') -> List['Node']: """ Replace all uses of ``self`` in the Graph with the Node ``replace_with``. - Returns the list of nodes on which this change was made. + + Args: + + replace_with (Node): The node to replace all uses of ``self`` with. + + Returns: + + The list of Nodes on which this change was made. 
""" to_process = list(self.users) for use_node in to_process: diff --git a/torch/fx/symbolic_trace.py b/torch/fx/symbolic_trace.py index 6bdc8dd1070b..69e3c708dde3 100644 --- a/torch/fx/symbolic_trace.py +++ b/torch/fx/symbolic_trace.py @@ -1,6 +1,6 @@ import inspect from types import CodeType, FunctionType -from typing import Any, Dict, Optional, List, Callable, Union +from typing import Any, Dict, Optional, Tuple, List, Callable, Union import torch from torch._C import ScriptObject # type: ignore @@ -51,21 +51,31 @@ class Tracer(TracerBase): def __init__(self): super().__init__() - def create_arg(self, a: Any) -> Argument: + def create_arg(self, a: Any) -> 'Argument': """ A method to specify the behavior of tracing when preparing values to be used as arguments to nodes in the ``Graph``. By default, the behavior includes: - - Iterate through collection types (e.g. tuple, list, dict) and recursively - call ``create_args`` on the elements. - - Given a Proxy object, return a reference to the underlying IR ``Node`` - - Given a non-Proxy Tensor object, emit IR for various cases: - - For a Parameter, emit a ``get_attr`` node referring to that Parameter - - For a non-Parameter Tensor, store the Tensor away in a special - attribute referring to that attribute. + #. Iterate through collection types (e.g. tuple, list, dict) and recursively + call ``create_args`` on the elements. + #. Given a Proxy object, return a reference to the underlying IR ``Node`` + #. Given a non-Proxy Tensor object, emit IR for various cases: + + * For a Parameter, emit a ``get_attr`` node referring to that Parameter + * For a non-Parameter Tensor, store the Tensor away in a special + attribute referring to that attribute. This method can be overridden to support more types. + + Args: + + a (Any): The value to be emitted as an ``Argument`` in the ``Graph``. + + + Returns: + + The value ``a`` converted into the appropriate ``Argument`` """ # The base tracer is used to construct Graphs when there is no associated # module hierarchy, so it can never create parameter references. @@ -115,28 +125,32 @@ def is_leaf_module(self, m: torch.nn.Module, module_qualified_name : str) -> boo their constituent ops are recorded, unless specified otherwise via this parameter. - Args - m - The module itself - module_qualified_name - The path to root of this module. For example, - if you have a module hierarchy where submodule ``foo`` contains - submodule ``bar``, which contains submodule ``baz``, that module will - appear with the qualified name ``foo.bar.baz`` here. + Args: + m (Module): The module being queried about + module_qualified_name (str): The path to root of this module. For example, + if you have a module hierarchy where submodule ``foo`` contains + submodule ``bar``, which contains submodule ``baz``, that module will + appear with the qualified name ``foo.bar.baz`` here. """ return m.__module__.startswith('torch.nn') and not isinstance(m, torch.nn.Sequential) - def path_of_module(self, mod) -> str: + def path_of_module(self, mod : torch.nn.Module) -> str: """ Helper method to find the qualified name of ``mod`` in the Module hierarchy of ``root``. For example, if ``root`` has a submodule named ``foo``, which has a submodule named ``bar``, passing ``bar`` into this function will return the string "foo.bar". + + Args: + + mod (str): The ``Module`` to retrieve the qualified name for. 
""" for n, p in self.root.named_modules(): if mod is p: return n raise NameError('module is not installed as a submodule') - def call_module(self, m: torch.nn.Module, forward: Callable[..., Any], args, kwargs): + def call_module(self, m: torch.nn.Module, forward: Callable[..., Any], args : Tuple[Any, ...], kwargs : Dict[str, Any]) -> Any: """ Method that specifies the behavior of this ``Tracer`` when it encounters a call to an ``nn.Module`` instance. @@ -149,6 +163,20 @@ def call_module(self, m: torch.nn.Module, forward: Callable[..., Any], args, kwa This method can be overridden to--for example--create nested traced GraphModules, or any other behavior you would want while tracing across ``Module`` boundaries. + ``Module`` boundaries. + + Args: + + m (Module): The module for which a call is being emitted + forward (Callable): The forward() method of the ``Module`` to be invoked + args (Tuple): args of the module callsite + kwargs (Dict): kwargs of the module callsite + + Return: + + The return value from the Module call. In the case that a ``call_module`` + node was emitted, this is a ``Proxy`` value. Otherwise, it is whatever + value was returned from the ``Module`` invocation. """ module_qualified_name = self.path_of_module(m) if not self.is_leaf_module(m, module_qualified_name): @@ -205,6 +233,16 @@ def trace(self, root: Union[torch.nn.Module, Callable]) -> Graph: """ Trace ``root`` and return the corresponding FX ``Graph`` representation. ``root`` can either be an ``nn.Module`` instance or a Python callable. + + + Args: + + root (Union[Module, Callable]): Either a ``Module`` or a function to be + traced through. + + Returns: + + A ``Graph`` representing the semantics of the passed-in ``root``. """ if isinstance(root, torch.nn.Module): self.root = root From dee82ef3ea1ed2cae4e891f071adff03cb4a6e0d Mon Sep 17 00:00:00 2001 From: neerajprad Date: Tue, 8 Dec 2020 11:26:01 -0800 Subject: [PATCH 043/250] Add LKJCholesky distribution (#48798) Summary: As a follow up to https://github.com/pytorch/pytorch/issues/48041, this adds the `LKJCholesky` distribution that samples the Cholesky factor of positive definite correlation matrices. This also relaxes the check on `tril_matrix_to_vec` so that it works for 2x2 matrices with `diag=-2`. cc. 
fehiepsi Pull Request resolved: https://github.com/pytorch/pytorch/pull/48798 Reviewed By: zhangguanheng66 Differential Revision: D25364635 Pulled By: neerajprad fbshipit-source-id: 4abf8d83086b0ad45c5096760114a2c57e555602 --- test/distributions/test_distributions.py | 61 +++++++++-- test/distributions/test_utils.py | 2 +- torch/distributions/__init__.py | 1 + torch/distributions/lkj_cholesky.py | 126 +++++++++++++++++++++++ torch/distributions/utils.py | 4 +- 5 files changed, 180 insertions(+), 14 deletions(-) create mode 100644 torch/distributions/lkj_cholesky.py diff --git a/test/distributions/test_distributions.py b/test/distributions/test_distributions.py index abba69eb472f..67a66be19d84 100644 --- a/test/distributions/test_distributions.py +++ b/test/distributions/test_distributions.py @@ -39,12 +39,13 @@ from torch.testing._internal.common_utils import TestCase, run_tests, set_rng_seed, TEST_WITH_UBSAN, load_tests from torch.testing._internal.common_cuda import TEST_CUDA from torch.autograd import grad, gradcheck +from torch.autograd.functional import jacobian from torch.distributions import (Bernoulli, Beta, Binomial, Categorical, Cauchy, Chi2, ContinuousBernoulli, Dirichlet, Distribution, Exponential, ExponentialFamily, FisherSnedecor, Gamma, Geometric, Gumbel, - HalfCauchy, HalfNormal, - Independent, Kumaraswamy, Laplace, LogisticNormal, + HalfCauchy, HalfNormal, Independent, Kumaraswamy, + LKJCholesky, Laplace, LogisticNormal, LogNormal, LowRankMultivariateNormal, MixtureSameFamily, Multinomial, MultivariateNormal, NegativeBinomial, Normal, @@ -58,7 +59,8 @@ from torch.distributions.kl import _kl_expfamily_expfamily from torch.distributions.transforms import (AffineTransform, CatTransform, ExpTransform, StackTransform, identity_transform) -from torch.distributions.utils import probs_to_logits, lazy_property +from torch.distributions.utils import (probs_to_logits, lazy_property, tril_matrix_to_vec, + vec_to_tril_matrix) from torch.nn.functional import softmax # load_tests from torch.testing._internal.common_utils is used to automatically filter tests for @@ -246,6 +248,20 @@ def is_all_nan(tensor): 'concentration0': torch.rand(4).uniform_(1, 2).requires_grad_(), }, ]), + Example(LKJCholesky, [ + { + 'dim': 2, + 'concentration': 0.5 + }, + { + 'dim': 3, + 'concentration': torch.tensor([0.5, 1., 2.]), + }, + { + 'dim': 100, + 'concentration': 4. 
+ }, + ]), Example(Laplace, [ { 'loc': torch.randn(5, 5, requires_grad=True), @@ -2265,10 +2281,10 @@ def test_gumbel_sample(self): 'Gumbel(loc={}, scale={})'.format(loc, scale)) def test_kumaraswamy_shape(self): - concentration1 = torch.tensor(torch.randn(2, 3).abs(), requires_grad=True) - concentration0 = torch.tensor(torch.randn(2, 3).abs(), requires_grad=True) - concentration1_1d = torch.tensor(torch.randn(1).abs(), requires_grad=True) - concentration0_1d = torch.tensor(torch.randn(1).abs(), requires_grad=True) + concentration1 = torch.randn(2, 3).abs().requires_grad_() + concentration0 = torch.randn(2, 3).abs().requires_grad_() + concentration1_1d = torch.randn(1).abs().requires_grad_() + concentration0_1d = torch.randn(1).abs().requires_grad_() self.assertEqual(Kumaraswamy(concentration1, concentration0).sample().size(), (2, 3)) self.assertEqual(Kumaraswamy(concentration1, concentration0).sample((5,)).size(), (5, 2, 3)) self.assertEqual(Kumaraswamy(concentration1_1d, concentration0_1d).sample().size(), (1,)) @@ -2279,10 +2295,10 @@ def test_kumaraswamy_shape(self): # Kumaraswamy distribution is not implemented in SciPy # Hence these tests are explicit def test_kumaraswamy_mean_variance(self): - c1_1 = torch.tensor(torch.randn(2, 3).abs(), requires_grad=True) - c0_1 = torch.tensor(torch.randn(2, 3).abs(), requires_grad=True) - c1_2 = torch.tensor(torch.randn(4).abs(), requires_grad=True) - c0_2 = torch.tensor(torch.randn(4).abs(), requires_grad=True) + c1_1 = torch.randn(2, 3).abs().requires_grad_() + c0_1 = torch.randn(2, 3).abs().requires_grad_() + c1_2 = torch.randn(4).abs().requires_grad_() + c0_2 = torch.randn(4).abs().requires_grad_() cases = [(c1_1, c0_1), (c1_2, c0_2)] for i, (a, b) in enumerate(cases): m = Kumaraswamy(a, b) @@ -2534,6 +2550,29 @@ def test_continuous_bernoulli_3d(self): (2, 5, 2, 3, 5)) self.assertEqual(ContinuousBernoulli(p).sample((2,)).size(), (2, 2, 3, 5)) + def test_lkj_cholesky_log_prob(self): + def tril_cholesky_to_tril_corr(x): + x = vec_to_tril_matrix(x, -1) + diag = (1 - (x * x).sum(-1)).sqrt().diag_embed() + x = x + diag + return tril_matrix_to_vec(x @ x.T, -1) + + for dim in range(2, 5): + log_probs = [] + lkj = LKJCholesky(dim, concentration=1.) + for i in range(2): + sample = lkj.sample() + sample_tril = tril_matrix_to_vec(sample, diag=-1) + log_prob = lkj.log_prob(sample) + log_abs_det_jacobian = torch.slogdet(jacobian(tril_cholesky_to_tril_corr, sample_tril)).logabsdet + log_probs.append(log_prob - log_abs_det_jacobian) + # for concentration=1., the density is uniform over the space of all + # correlation matrices. + if dim == 2: + # for dim=2, pdf = 0.5 (jacobian adjustment factor is 0.) 
+ self.assertTrue(all([x == torch.tensor(0.5).log() for x in log_probs])) + self.assertEqual(log_probs[0], log_probs[1]) + def test_independent_shape(self): for Dist, params in EXAMPLES: for param in params: diff --git a/test/distributions/test_utils.py b/test/distributions/test_utils.py index b58cfe39fc1c..5751246eb10a 100644 --- a/test/distributions/test_utils.py +++ b/test/distributions/test_utils.py @@ -13,7 +13,7 @@ def test_tril_matrix_to_vec(shape): mat = torch.randn(shape) n = mat.shape[-1] - for diag in range(-n + 1, n): + for diag in range(-n, n): actual = mat.tril(diag) vec = tril_matrix_to_vec(actual, diag) tril_mat = vec_to_tril_matrix(vec, diag) diff --git a/torch/distributions/__init__.py b/torch/distributions/__init__.py index 51e48d58de3a..dd963ab6f7a4 100644 --- a/torch/distributions/__init__.py +++ b/torch/distributions/__init__.py @@ -93,6 +93,7 @@ from .kl import kl_divergence, register_kl from .kumaraswamy import Kumaraswamy from .laplace import Laplace +from .lkj_cholesky import LKJCholesky from .log_normal import LogNormal from .logistic_normal import LogisticNormal from .lowrank_multivariate_normal import LowRankMultivariateNormal diff --git a/torch/distributions/lkj_cholesky.py b/torch/distributions/lkj_cholesky.py new file mode 100644 index 000000000000..cdbfe5be55bb --- /dev/null +++ b/torch/distributions/lkj_cholesky.py @@ -0,0 +1,126 @@ +""" +This closely follows the implementation in NumPyro (https://github.com/pyro-ppl/numpyro). + +Original copyright notice: + +# Copyright: Contributors to the Pyro project. +# SPDX-License-Identifier: Apache-2.0 +""" + +import math + +import torch +from torch.distributions import constraints, Beta +from torch.distributions.distribution import Distribution +from torch.distributions.utils import broadcast_all + + +class LKJCholesky(Distribution): + r""" + LKJ distribution for lower Cholesky factor of correlation matrices. + The distribution is controlled by ``concentration`` parameter :math:`\eta` + to make the probability of the correlation matrix :math:`M` generated from + a Cholesky factor propotional to :math:`\det(M)^{\eta - 1}`. Because of that, + when ``concentration == 1``, we have a uniform distribution over Cholesky + factors of correlation matrices. Note that this distribution samples the + Cholesky factor of correlation matrices and not the correlation matrices + themselves and thereby differs slightly from the derivations in [1] for + the `LKJCorr` distribution. For sampling, this uses the Onion method from + [1] Section 3. + + L ~ LKJCholesky(dim, concentration) + X = L @ L' ~ LKJCorr(dim, concentration) + + Example:: + + >>> l = LKJCholesky(3, 0.5) + >>> l.sample() # l @ l.T is a sample of a correlation 3x3 matrix + tensor([[ 1.0000, 0.0000, 0.0000], + [ 0.3516, 0.9361, 0.0000], + [-0.1899, 0.4748, 0.8593]]) + + Args: + dimension (dim): dimension of the matrices + concentration (float or Tensor): concentration/shape parameter of the + distribution (often referred to as eta) + + **References** + + [1] `Generating random correlation matrices based on vines and extended onion method`, + Daniel Lewandowski, Dorota Kurowicka, Harry Joe. + """ + arg_constraints = {'concentration': constraints.positive} + support = constraints.corr_cholesky + + def __init__(self, dim, concentration=1., validate_args=None): + if dim < 2: + raise ValueError(f'Expected dim to be an integer greater than or equal to 2. 
Found dim={dim}.') + self.dim = dim + self.concentration, = broadcast_all(concentration) + batch_shape = self.concentration.size() + event_shape = torch.Size((dim, dim)) + # This is used to draw vectorized samples from the beta distribution in Sec. 3.2 of [1]. + marginal_conc = self.concentration + 0.5 * (self.dim - 2) + offset = torch.arange(self.dim - 1, dtype=self.concentration.dtype, device=self.concentration.device) + offset = torch.cat([offset.new_zeros((1,)), offset]) + beta_conc1 = offset + 0.5 + beta_conc0 = marginal_conc.unsqueeze(-1) - 0.5 * offset + self._beta = Beta(beta_conc1, beta_conc0) + super(LKJCholesky, self).__init__(batch_shape, event_shape, validate_args) + + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(LKJCholesky, _instance) + batch_shape = torch.Size(batch_shape) + new.dim = self.dim + new.concentration = self.concentration.expand(batch_shape) + new._beta = self._beta.expand(batch_shape + (self.dim,)) + super(LKJCholesky, new).__init__(batch_shape, self.event_shape, validate_args=False) + new._validate_args = self._validate_args + return new + + def sample(self, sample_shape=torch.Size()): + # This uses the Onion method, but there are a few differences from [1] Sec. 3.2: + # - This vectorizes the for loop and also works for heterogeneous eta. + # - Same algorithm generalizes to n=1. + # - The procedure is simplified since we are sampling the cholesky factor of + # the correlation matrix instead of the correlation matrix itself. As such, + # we only need to generate `w`. + y = self._beta.sample(sample_shape).unsqueeze(-1) + u_normal = torch.randn(self._extended_shape(sample_shape), + dtype=y.dtype, + device=y.device).tril(-1) + u_hypersphere = u_normal / u_normal.norm(dim=-1, keepdim=True) + # Replace NaNs in first row + u_hypersphere[..., 0, :].fill_(0.) 
+ w = torch.sqrt(y) * u_hypersphere + # Fill diagonal elements; clamp for numerical stability + eps = torch.finfo(w.dtype).tiny + diag_elems = torch.clamp(1 - torch.sum(w**2, dim=-1), min=eps).sqrt() + w += torch.diag_embed(diag_elems) + return w + + def log_prob(self, value): + # See: https://mc-stan.org/docs/2_25/functions-reference/cholesky-lkj-correlation-distribution.html + # The probability of a correlation matrix is proportional to + # determinant ** (concentration - 1) = prod(L_ii ^ 2(concentration - 1)) + # Additionally, the Jacobian of the transformation from Cholesky factor to + # correlation matrix is: + # prod(L_ii ^ (D - i)) + # So the probability of a Cholesky factor is propotional to + # prod(L_ii ^ (2 * concentration - 2 + D - i)) = prod(L_ii ^ order_i) + # with order_i = 2 * concentration - 2 + D - i + diag_elems = value.diagonal(dim1=-1, dim2=-2)[..., 1:] + order = torch.arange(2, self.dim + 1) + order = 2 * (self.concentration - 1).unsqueeze(-1) + self.dim - order + unnormalized_log_pdf = torch.sum(order * diag_elems.log(), dim=-1) + # Compute normalization constant (page 1999 of [1]) + dm1 = self.dim - 1 + alpha = self.concentration + 0.5 * dm1 + denominator = torch.lgamma(alpha) * dm1 + numerator = torch.mvlgamma(alpha - 0.5, dm1) + # pi_constant in [1] is D * (D - 1) / 4 * log(pi) + # pi_constant in multigammaln is (D - 1) * (D - 2) / 4 * log(pi) + # hence, we need to add a pi_constant = (D - 1) * log(pi) / 2 + pi_constant = 0.5 * dm1 * math.log(math.pi) + normalize_term = pi_constant + numerator - denominator + return unnormalized_log_pdf - normalize_term diff --git a/torch/distributions/utils.py b/torch/distributions/utils.py index 380b98785f6c..84f45f1d33cf 100644 --- a/torch/distributions/utils.py +++ b/torch/distributions/utils.py @@ -118,8 +118,8 @@ def tril_matrix_to_vec(mat, diag=0): which comprises of lower triangular elements from the matrix in row order. """ n = mat.shape[-1] - if not torch._C._get_tracing_state() and (diag <= -n or diag >= n): - raise ValueError(f'diag ({diag}) provided is outside [{-n+1}, {n-1}].') + if not torch._C._get_tracing_state() and (diag < -n or diag >= n): + raise ValueError(f'diag ({diag}) provided is outside [{-n}, {n-1}].') arange = torch.arange(n, device=mat.device) tril_mask = arange < arange.view(-1, 1) + (diag + 1) vec = mat[..., tril_mask] From 0fb9d36660906a5d40d8ea523e16f981f8c11399 Mon Sep 17 00:00:00 2001 From: Sam Estep Date: Tue, 8 Dec 2020 13:17:48 -0800 Subject: [PATCH 044/250] Delete ATen mirror stuff (#49028) Summary: These files refer to https://travis-ci.org/github/zdevito/ATen and https://github.com/zdevito/ATen which were last updated in 2018 and 2019 respectively. 
According to zdevito: > yeah, all of that stuff can be deleted > was from a time when ATen was a separate repo from pytorch Pull Request resolved: https://github.com/pytorch/pytorch/pull/49028 Reviewed By: zdevito Differential Revision: D25401810 Pulled By: samestep fbshipit-source-id: a8eea7382f91e1aee6f45552645e6d53825fe5a7 --- .travis.aten.yml | 31 ------------------------------- tools/README.md | 4 +--- tools/aten_mirror.sh | 33 --------------------------------- 3 files changed, 1 insertion(+), 67 deletions(-) delete mode 100644 .travis.aten.yml delete mode 100755 tools/aten_mirror.sh diff --git a/.travis.aten.yml b/.travis.aten.yml deleted file mode 100644 index 242584549625..000000000000 --- a/.travis.aten.yml +++ /dev/null @@ -1,31 +0,0 @@ -# https://travis-ci.org/zdevito/ATen -language: python -python: - - 2.7 - - 3.6 - -dist: trusty - -before_install: - - sudo apt-get install -qq valgrind - -install: - - travis_retry pip install pyyaml typing - -script: - - cd aten - - mkdir build install - - cd build - - cmake .. -DUSE_CUDA=OFF -DCMAKE_INSTALL_PREFIX=../install - - make install - - ../tools/run_tests.sh . - - cd .. - - tools/test_install.sh $(pwd)/install $(pwd) - -matrix: - fast_finish: true - include: - env: LINT_CHECK - python: "2.7" - install: pip install flake8-mypy - script: flake8 diff --git a/tools/README.md b/tools/README.md index 5f915d510f86..527351d1c84a 100644 --- a/tools/README.md +++ b/tools/README.md @@ -24,7 +24,7 @@ Build system pieces: * [setup_helpers](setup_helpers) - Helper code for searching for third-party dependencies on the user system. * [build_pytorch_libs.py](build_pytorch_libs.py) - cross-platform script that - builds all of the constituent libraries of PyTorch, + builds all of the constituent libraries of PyTorch, but not the PyTorch Python extension itself. * [build_libtorch.py](build_libtorch.py) - Script for building libtorch, a standalone C++ library without Python support. This @@ -52,8 +52,6 @@ Important if you want to run on AMD GPU: Tools which are only situationally useful: -* [aten_mirror.sh](aten_mirror.sh) - Mirroring script responsible - for keeping https://github.com/zdevito/ATen up-to-date. * [docker](docker) - Dockerfile for running (but not developing) PyTorch, using the official conda binary distribution. Context: https://github.com/pytorch/pytorch/issues/1619 diff --git a/tools/aten_mirror.sh b/tools/aten_mirror.sh deleted file mode 100755 index 6c787bbda568..000000000000 --- a/tools/aten_mirror.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/sh - -# This script is run by a cronjob managed by @zdevito -# which mirrors the ATen-specific directories of PyTorch -# to zdevito/ATen, for ease of use of projects that wish -# to depend solely on ATen. -# -# See also .travis.aten.yml, which is the Travis configuration -# for the ATen project (and ensures ATen is separately -# buildable.) 
- -if [[ -z "$EXTRACTED_REPO" ]]; then - echo "Need to set envvar EXTRACTED_REPO" - exit 1 -fi -if [[ -z "$FULL_REPO" ]]; then - echo "Need to set envvar FULL_REPO" - exit 1 -fi -rm -rf aten-export-repo -git clone $EXTRACTED_REPO aten-export-repo -cd aten-export-repo -git config user.name "Zach DeVito" -git config user.email "zdevito@fb.com" -git remote add fullrepo $FULL_REPO -git fetch fullrepo -git checkout -b temporary-split-branch fullrepo/master -# Cribbed from https://stackoverflow.com/questions/2982055/detach-many-subdirectories-into-a-new-separate-git-repository -# and https://stackoverflow.com/questions/42355621/git-filter-branch-moving-a-folder-with-index-filter-does-not-work -git filter-branch -f --index-filter 'git rm --cached -qr --ignore-unmatch -- . && git reset -q $GIT_COMMIT -- aten cmake third_party/tbb third_party/catch third_party/cpuinfo && (git ls-files -s | sed "s-.travis.aten.yml-.travis.yml-" | sed "s-.gitmodules.aten-.gitmodules-" | git update-index --index-info)' -git checkout master -git merge temporary-split-branch -git push From 2b70bcd01416e2e1992d6aacf321f5d7cb1b9c28 Mon Sep 17 00:00:00 2001 From: Mikhail Zolotukhin Date: Tue, 8 Dec 2020 13:19:10 -0800 Subject: [PATCH 045/250] [TensorExpr] Enable inlining for output tensors too. (#48967) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48967 We previously didn't inline output tensors which resulted in correctness issues like #48533. This PR allows inlining for output tensors too - this could result in duplicated computations, but we can address that later once correctness is ensured. Performance results on FastRNNS: Before the fix: ``` Benchmarking LSTMs... name avg_fwd std_fwd avg_bwd std_bwd cudnn 10.09 0.05431 17.55 0.2108 aten 21.52 0.1276 26.7 1.471 jit 13.25 0.8748 22.47 1.73 jit_premul 11.43 0.3226 19.43 2.231 jit_premul_bias 11.84 0.2245 20.33 2.205 jit_simple 13.27 0.9906 22.15 0.9724 jit_multilayer 13.38 0.8748 22.82 1.01 py 33.55 4.837 46.41 6.333 ``` After the fix: ``` Benchmarking LSTMs... 
name avg_fwd std_fwd avg_bwd std_bwd cudnn 10.09 0.05979 17.45 0.1987 aten 21.21 0.144 26.43 0.7356 jit 13.01 0.2925 23.21 0.8454 jit_premul 11.4 0.3905 19.62 2.448 jit_premul_bias 11.85 0.2461 20.29 0.6592 jit_simple 13.08 0.8533 22.81 1.315 jit_multilayer 12.93 0.1095 23.57 1.459 py 31.21 2.783 44.63 6.073 ``` Differential Revision: D25383949 Test Plan: Imported from OSS Reviewed By: SplitInfinity Pulled By: ZolotukhinM fbshipit-source-id: 16f5727475109a278499bef7905f6aad18c8527a --- test/cpp/tensorexpr/test_loopnest.cpp | 43 ++++++++++++++++++++++++++ test/test_tensorexpr.py | 20 ++++++++++++ torch/csrc/jit/tensorexpr/loopnest.cpp | 19 ++++++------ 3 files changed, 73 insertions(+), 9 deletions(-) diff --git a/test/cpp/tensorexpr/test_loopnest.cpp b/test/cpp/tensorexpr/test_loopnest.cpp index 0a8037f28db0..cc43c41bc180 100644 --- a/test/cpp/tensorexpr/test_loopnest.cpp +++ b/test/cpp/tensorexpr/test_loopnest.cpp @@ -1484,6 +1484,49 @@ TEST(LoopNest, ScheduleInlineThreeMixedSplit) { ASSERT_THROWS_WITH(l.computeInline(a->buf()), "compound indices"); } +// Check that inlining works for output tensors too +TEST(LoopNest, ScheduleInlineOutputTensors) { + KernelScope kernel_scope; + const int M = 4; + const int N = 5; + const int K = 6; + + Tensor* x = Compute( + "x", + {{M, "m1"}, {N, "n1"}, {K, "k1"}}, + [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { + return m * n * k; + }); + Tensor* y = Compute( + "y", + {{M, "m2"}, {N, "n2"}, {K, "k2"}}, + [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { + return x->call(m, n, k) + m; + }); + + LoopNest l1({x, y}); + l1.computeInline(x->buf()); + + // would normally compare results but Rand isn't implemented in the + // SimpleIREvaluator, even if we could seed it. + Stmt* stmt1 = IRSimplifier::simplify(l1.root_stmt()); + std::ostringstream oss; + oss << *stmt1; + + // Check the IR we produced + const std::string& verification_pattern = + R"IR( +# CHECK: for (int m1 = 0; m1 < 4; m1++) +# CHECK: for (int n1 = 0; n1 < 5; n1++) +# CHECK: for (int k1 = 0; k1 < 6; k1++) +# CHECK: x[m1, n1, k1] = (n1 * m1) * k1; +# CHECK: for (int m2 = 0; m2 < 4; m2++) +# CHECK: for (int n2 = 0; n2 < 5; n2++) +# CHECK: for (int k2 = 0; k2 < 6; k2++) +# CHECK: y[m2, n2, k2] = (n2 * m2) * k2 + m2;)IR"; + torch::jit::testing::FileCheck().run(verification_pattern, oss.str()); +} + TEST(LoopNest, ScheduleFuserStyle) { KernelScope kernel_scope; const int kVectorSize = 8; diff --git a/test/test_tensorexpr.py b/test/test_tensorexpr.py index 5c30c312534f..6ab7899025c7 100644 --- a/test/test_tensorexpr.py +++ b/test/test_tensorexpr.py @@ -1595,6 +1595,26 @@ def getModule(script): torch.testing.assert_allclose(ref, test) + @unittest.skipIf(not torch.cuda.is_available(), "requires CUDA") + def test_multiple_outputs(self): + # A bug reported internally similar to the one reported in #48533 + def foo(a, b, c): + t_next = c + 1 + t5 = t_next * b + t6 = torch.unsqueeze(t_next, 1) + t7 = a * t6 + return (t7, t5, t_next) + + a = torch.rand(20, 20, dtype=torch.float32, device='cuda') + b = torch.rand(20 * 29, dtype=torch.float32, device='cuda').as_strided([20], [29]) + c = torch.ones(20, dtype=torch.int64, device='cuda') + traced = torch.jit.trace(foo, (a, b, c)) + ref = foo(a, b, c) + exp = traced(a, b, c) + exp = traced(a, b, c) + for i in range(3): + assert(torch.allclose(ref[i], exp[i])) + if __name__ == '__main__': unittest.main() diff --git a/torch/csrc/jit/tensorexpr/loopnest.cpp b/torch/csrc/jit/tensorexpr/loopnest.cpp index 16eb1ec11299..c9e17c9fc896 
100644 --- a/torch/csrc/jit/tensorexpr/loopnest.cpp +++ b/torch/csrc/jit/tensorexpr/loopnest.cpp @@ -542,8 +542,10 @@ Stmt* LoopNest::lowerToStmt(Tensor* t) { class FunctionInliner : public IRMutator { public: - FunctionInliner(Store* producer) - : buf_(producer->buf()), producer_(producer) { + FunctionInliner(Store* producer, std::unordered_set outputs) + : buf_(producer->buf()), + producer_(producer), + outputs_(std::move(outputs)) { for (auto* i : producer->indices()) { const Var* index_var = dynamic_cast(i); if (index_var == nullptr) { @@ -631,7 +633,9 @@ class FunctionInliner : public IRMutator { // Remove the buffer write from the inlined function. Stmt* mutate(const Store* v) override { - if (v == producer_) { + // If the buf_ is in the outputs set, keep its statement intact. Otherwise, + // remove it. + if (v == producer_ && !outputs_.count(buf_)) { in_producer_ = true; producer_ = dynamic_cast(IRMutator::mutate(v)); TORCH_INTERNAL_ASSERT(producer_ != nullptr); @@ -696,6 +700,7 @@ class FunctionInliner : public IRMutator { // In the producer's scope - we need to bind any calls to rand(). bool in_producer_ = false; std::unordered_map> random_bindings_; + std::unordered_set outputs_; }; bool LoopNest::computeInline(Stmt* s) { @@ -707,11 +712,6 @@ bool LoopNest::computeInline(Stmt* s) { } bool LoopNest::computeInline(const Buf* b) { - if (output_bufs_.count(b)) { - // Cannot inline producers of output Tensors - return false; - } - // Find producers. Store* relevant_store{nullptr}; auto stores = NodeFinder::find(root_stmt_); @@ -731,7 +731,7 @@ bool LoopNest::computeInline(const Buf* b) { } TORCH_INTERNAL_ASSERT(relevant_store); - FunctionInliner inliner(relevant_store); + FunctionInliner inliner(relevant_store, output_bufs_); root_stmt_ = root_stmt_->accept_mutator(&inliner); // No longer computing this intermediate tensor, so don't alloc it. @@ -745,6 +745,7 @@ void LoopNest::inlineIntermediateBufs() { // erased from the set 'intermediate_bufs_' in that function. std::unordered_set bufs_to_inline( intermediate_bufs_.begin(), intermediate_bufs_.end()); + bufs_to_inline.insert(output_bufs_.begin(), output_bufs_.end()); for (auto b : bufs_to_inline) { computeInline(b); } From d1fb4b4ffc0ea234daae41a4bbe8ca9fbaf97716 Mon Sep 17 00:00:00 2001 From: Sam Estep Date: Tue, 8 Dec 2020 13:23:51 -0800 Subject: [PATCH 046/250] Put Flake8 requirements into their own file (#49032) Summary: This PR moves the list of Flake8 requirements/versions out of `.github/workflows/lint.yml` and into its own file `requirements-flake8.txt`. After (if) this PR is merged, I'll modify the Flake8 installation instructions on [the "Lint as you type" wiki page](https://github.com/pytorch/pytorch/wiki/Lint-as-you-type) (and its internal counterpart) to just say to install from that new file, rather than linking to the GitHub Actions YAML file and/or giving a command with a set of packages to install that keeps becoming out-of-date. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/49032 Test Plan: Either look at CI, or run locally using [act](https://github.com/nektos/act): ```sh act -P ubuntu-latest=nektos/act-environments-ubuntu:18.04 -j flake8-py3 ``` Reviewed By: janeyx99 Differential Revision: D25404037 Pulled By: samestep fbshipit-source-id: ba4d1e17172a7808435df06cba8298b2b91bb27c --- .github/workflows/lint.yml | 2 +- CONTRIBUTING.md | 2 +- requirements-flake8.txt | 8 ++++++++ 3 files changed, 10 insertions(+), 2 deletions(-) create mode 100644 requirements-flake8.txt diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 4a0fb9cbf819..b04e4c82f57e 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -75,7 +75,7 @@ jobs: - name: Run flake8 run: | set -eux - pip install flake8==3.8.2 flake8-bugbear==20.1.4 flake8-comprehensions==3.3.0 flake8-executable==2.0.4 flake8-pyi==20.5.0 mccabe pycodestyle==2.6.0 pyflakes==2.2.0 + pip install -r requirements-flake8.txt flake8 --version flake8 | tee ${GITHUB_WORKSPACE}/flake8-output.txt - name: Add annotations diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 6593e35e4cf9..f2981c0dbb37 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -891,7 +891,7 @@ which is in PyTorch's `requirements.txt`. ## Pre-commit tidy/linting hook We use clang-tidy and flake8 (installed with flake8-bugbear, -flake8-comprehensions, flake8-mypy, and flake8-pyi) to perform additional +flake8-comprehensions, flake8-pyi, and others) to perform additional formatting and semantic checking of code. We provide a pre-commit git hook for performing these checks, before a commit is created: diff --git a/requirements-flake8.txt b/requirements-flake8.txt new file mode 100644 index 000000000000..1e2ba252556f --- /dev/null +++ b/requirements-flake8.txt @@ -0,0 +1,8 @@ +flake8==3.8.2 +flake8-bugbear==20.1.4 +flake8-comprehensions==3.3.0 +flake8-executable==2.0.4 +flake8-pyi==20.5.0 +mccabe +pycodestyle==2.6.0 +pyflakes==2.2.0 From e8ec84864fc1ca75fae226f5d3fa816557de7ffb Mon Sep 17 00:00:00 2001 From: Hao Lu Date: Tue, 8 Dec 2020 13:47:01 -0800 Subject: [PATCH 047/250] [StaticRuntime] Add aten::narrow (#48991) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48991 Add native impl of `aten::narrow` to skip dispatcher, because `aten::narrow` calls `aten::slice` in its implementation, here we reduce the dispatcher overhead by two-fold by calling the native impl of `aten::slice`. 
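For illustration only (not part of this diff), the relationship the native path relies on can be checked from Python; the tensor, dimension, and bounds below are made up:

```python
import torch

# narrow(dim, start, length) should match slicing the same dimension
# over [start, start + length) with step 1 -- the identity used here to
# forward narrow into the native slice implementation.
x = torch.arange(24).reshape(2, 3, 4)
dim, start, length = 1, 1, 2

narrowed = x.narrow(dim, start, length)
sliced = x[:, start:start + length, :]   # slice along dim=1, step 1

assert torch.equal(narrowed, sliced)
```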
Reviewed By: bwasti Differential Revision: D25387119 fbshipit-source-id: c020da2556a35bc57a8a2e21fa45dd491ea516a0 --- torch/csrc/jit/runtime/static/ops.cpp | 34 +++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp index 7ddc2f5106bc..a87eaca745d8 100644 --- a/torch/csrc/jit/runtime/static/ops.cpp +++ b/torch/csrc/jit/runtime/static/ops.cpp @@ -31,13 +31,12 @@ bool canRunNatively(Node* n) { // In alphabetical order const static std::unordered_set native_nodes{ "aten::flatten", + "aten::narrow", "aten::permute", "aten::reshape", "aten::slice", "aten::transpose", "aten::to", - "aten::reshape", - "aten::slice", "prim::ListConstruct", "prim::ListUnpack", "prim::TupleConstruct"}; @@ -379,6 +378,37 @@ getNativeOperation(Node* n) { p_node->Output(0, reg) = at::native::slice(in0_t, in1_i, in2_i, in3_i, in4_i); }; + } else if (n->kind() == c10::Symbol::fromQualString("aten::narrow")) { + return [](const ProcessedNode* p_node, std::vector& reg) { + auto self = p_node->Input(0, reg).toTensor(); // self + auto dim = p_node->Input(1, reg).toInt(); // dim + int64_t start = 0; + if (p_node->Input(2, reg).isScalar()) { + start = p_node->Input(2, reg).toInt(); + } else { + auto t = p_node->Input(2, reg).toTensor(); + start = t.item(); + } + auto length = p_node->Input(3, reg).toInt(); // length + TORCH_CHECK( + self.dim() > 0, "narrow() cannot be applied to a 0-dim tensor."); + auto cur_size = self.size(dim); + if (start != cur_size && start < 0) { // start being the end is valid, but + // not a valid dim specification. + start = at::maybe_wrap_dim(start, cur_size); + } + TORCH_CHECK( + length >= 0 && start <= cur_size - length, + "start (", + start, + ") + length (", + length, + ") exceeds dimension size (", + cur_size, + ")."); + p_node->Output(0, reg) = + at::native::slice(self, dim, start, start + length, 1); + }; } else if (n->kind() == c10::Symbol::fromQualString("aten::to")) { return [](const ProcessedNode* p_node, std::vector& reg) { DCHECK(p_node->input_regs().size() == 5); From 59605811488eb07b3b8bf70a5f0b4b56b34b4a61 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Tue, 8 Dec 2020 14:23:54 -0800 Subject: [PATCH 048/250] CUDA BFloat16 batchnorm (non-cuDNN) (#44994) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/44994 Reviewed By: ailzhang Differential Revision: D25377525 Pulled By: ngimel fbshipit-source-id: 42d583bbc364532264a4d3ebaa6b4ae02a0413de --- aten/src/ATen/native/Normalization.cpp | 1 + aten/src/ATen/native/cuda/Normalization.cu | 236 ++++++++++----------- test/test_nn.py | 1 - 3 files changed, 111 insertions(+), 127 deletions(-) diff --git a/aten/src/ATen/native/Normalization.cpp b/aten/src/ATen/native/Normalization.cpp index 2ddcf5bd5c16..1ac4250a9d54 100644 --- a/aten/src/ATen/native/Normalization.cpp +++ b/aten/src/ATen/native/Normalization.cpp @@ -415,6 +415,7 @@ std::tuple _batch_norm_impl_index( bool use_cudnn = false; use_cudnn = (input.is_cuda() + && input.scalar_type() != at::kBFloat16 && weight.scalar_type() != at::kBFloat16 && (input.scalar_type() != at::kHalf || weight.scalar_type() == at::kFloat) && weight.defined() && bias.defined() diff --git a/aten/src/ATen/native/cuda/Normalization.cu b/aten/src/ATen/native/cuda/Normalization.cu index 4830ca149cff..186099dfde50 100644 --- a/aten/src/ATen/native/cuda/Normalization.cu +++ b/aten/src/ATen/native/cuda/Normalization.cu @@ -5,26 +5,24 @@ namespace at { namespace native { std::tuple 
batch_norm_cuda_out(Tensor& output, Tensor& save_mean, Tensor& save_invstd, const Tensor& self, const Tensor& weight, const Tensor& bias, const Tensor& running_mean, const Tensor& running_var, bool train, double momentum, double epsilon) { AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "batch_norm_cuda", [&] { - AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "batch_norm_cuda", [&] { - auto mean_st = running_mean.dtype(); - auto var_st = running_var.dtype(); - TORCH_CHECK(mean_st == var_st, "running_mean and running_var need to have the same data types"); - bool is_half_float = std::is_same::value && mean_st == at::kFloat; - bool is_bfloat16_float = std::is_same::value && mean_st == at::kFloat; - if (cuda::detail::canUse32BitIndexMath(self)) { - if (is_half_float || is_bfloat16_float) { - batch_norm_cuda_template(output, save_mean, save_invstd, self, weight, bias, running_mean, running_var, train, momentum, epsilon); - } else { - batch_norm_cuda_template(output, save_mean, save_invstd, self, weight, bias, running_mean, running_var, train, momentum, epsilon); - } + auto mean_st = running_mean.dtype(); + auto var_st = running_var.dtype(); + TORCH_CHECK(mean_st == var_st, "running_mean and running_var need to have the same data types"); + bool is_half_float = std::is_same::value && mean_st == at::kFloat; + bool is_bfloat16_float = std::is_same::value && mean_st == at::kFloat; + if (cuda::detail::canUse32BitIndexMath(self)) { + if (is_half_float || is_bfloat16_float) { + batch_norm_cuda_template(output, save_mean, save_invstd, self, weight, bias, running_mean, running_var, train, momentum, epsilon); } else { - if (is_half_float || is_bfloat16_float) { - batch_norm_cuda_template(output, save_mean, save_invstd, self, weight, bias, running_mean, running_var, train, momentum, epsilon); - } else { - batch_norm_cuda_template(output, save_mean, save_invstd, self, weight, bias, running_mean, running_var, train, momentum, epsilon); - } + batch_norm_cuda_template(output, save_mean, save_invstd, self, weight, bias, running_mean, running_var, train, momentum, epsilon); } - }); + } else { + if (is_half_float || is_bfloat16_float) { + batch_norm_cuda_template(output, save_mean, save_invstd, self, weight, bias, running_mean, running_var, train, momentum, epsilon); + } else { + batch_norm_cuda_template(output, save_mean, save_invstd, self, weight, bias, running_mean, running_var, train, momentum, epsilon); + } + } }); return std::tuple(output, save_mean, save_invstd); } @@ -54,38 +52,34 @@ std::tuple batch_norm_cuda(const Tensor& self, const Ten std::tuple batch_norm_backward_cuda(const Tensor& grad_out, const Tensor& self, const Tensor& weight, const Tensor& running_mean, const Tensor& running_var, const Tensor& save_mean, const Tensor& save_invstd, bool train, double epsilon, std::array grad_input_mask) { return AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "batch_norm_backward_cuda", [&] { - AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "batch_norm_backward_cuda", [&] { - auto mean_st = running_mean.dtype(); - auto var_st = running_var.dtype(); - TORCH_CHECK(mean_st == var_st, "running_mean and running_var need to have the same data types"); - bool is_half_float = std::is_same::value && mean_st == at::kFloat; - bool is_bfloat16_float = std::is_same::value && mean_st == at::kFloat; - if (cuda::detail::canUse32BitIndexMath(self)) { - if (is_half_float || is_bfloat16_float) { - return 
batch_norm_backward_cuda_template(grad_out, self, weight, running_mean, running_var, save_mean, save_invstd, train, epsilon, grad_input_mask); - } else { - return batch_norm_backward_cuda_template(grad_out, self, weight, running_mean, running_var, save_mean, save_invstd, train, epsilon, grad_input_mask); - } + auto mean_st = running_mean.dtype(); + auto var_st = running_var.dtype(); + TORCH_CHECK(mean_st == var_st, "running_mean and running_var need to have the same data types"); + bool is_half_float = std::is_same::value && mean_st == at::kFloat; + bool is_bfloat16_float = std::is_same::value && mean_st == at::kFloat; + if (cuda::detail::canUse32BitIndexMath(self)) { + if (is_half_float || is_bfloat16_float) { + return batch_norm_backward_cuda_template(grad_out, self, weight, running_mean, running_var, save_mean, save_invstd, train, epsilon, grad_input_mask); + } else { + return batch_norm_backward_cuda_template(grad_out, self, weight, running_mean, running_var, save_mean, save_invstd, train, epsilon, grad_input_mask); + } + } else { + if (is_half_float || is_bfloat16_float) { + return batch_norm_backward_cuda_template(grad_out, self, weight, running_mean, running_var, save_mean, save_invstd, train, epsilon, grad_input_mask); } else { - if (is_half_float || is_bfloat16_float) { - return batch_norm_backward_cuda_template(grad_out, self, weight, running_mean, running_var, save_mean, save_invstd, train, epsilon, grad_input_mask); - } else { - return batch_norm_backward_cuda_template(grad_out, self, weight, running_mean, running_var, save_mean, save_invstd, train, epsilon, grad_input_mask); - } + return batch_norm_backward_cuda_template(grad_out, self, weight, running_mean, running_var, save_mean, save_invstd, train, epsilon, grad_input_mask); } - }); + } }); } std::tuple batch_norm_stats_cuda(const Tensor& self, double epsilon) { return AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "batch_norm_stats_cuda", [&] { - AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "batch_norm_stats_cuda", [&] { - if (cuda::detail::canUse32BitIndexMath(self)) { - return batch_norm_stats_cuda_template(self, epsilon); - } else { - return batch_norm_stats_cuda_template(self, epsilon); - } - }); + if (cuda::detail::canUse32BitIndexMath(self)) { + return batch_norm_stats_cuda_template(self, epsilon); + } else { + return batch_norm_stats_cuda_template(self, epsilon); + } }); } @@ -99,26 +93,24 @@ Tensor batch_norm_elemt_cuda(const Tensor& self, const Tensor& weight, const Ten Tensor& batch_norm_elemt_cuda_out(Tensor& output, const Tensor& self, const Tensor& weight, const Tensor& bias, const Tensor& mean, const Tensor& invstd, double epsilon) { AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "batch_norm_elemt", [&] { - AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "batch_norm_elemt", [&] { - auto mean_st = mean.dtype(); - auto invstd_st = invstd.dtype(); - TORCH_CHECK(mean_st == invstd_st, "mean and invstd need to have the same data types"); - bool is_half_float = std::is_same::value && mean_st == at::kFloat; - bool is_bfloat16_float = std::is_same::value && mean_st == at::kFloat; - if (cuda::detail::canUse32BitIndexMath(self)) { - if (is_half_float || is_bfloat16_float) { - batch_norm_elemt_cuda_template(output, self, weight, bias, mean, invstd, epsilon); - } else { - batch_norm_elemt_cuda_template(output, self, weight, bias, mean, invstd, epsilon); - } + auto mean_st = mean.dtype(); + auto invstd_st = invstd.dtype(); + 
TORCH_CHECK(mean_st == invstd_st, "mean and invstd need to have the same data types"); + bool is_half_float = std::is_same::value && mean_st == at::kFloat; + bool is_bfloat16_float = std::is_same::value && mean_st == at::kFloat; + if (cuda::detail::canUse32BitIndexMath(self)) { + if (is_half_float || is_bfloat16_float) { + batch_norm_elemt_cuda_template(output, self, weight, bias, mean, invstd, epsilon); + } else { + batch_norm_elemt_cuda_template(output, self, weight, bias, mean, invstd, epsilon); + } + } else { + if (is_half_float || is_bfloat16_float) { + batch_norm_elemt_cuda_template(output, self, weight, bias, mean, invstd, epsilon); } else { - if (is_half_float || is_bfloat16_float) { - batch_norm_elemt_cuda_template(output, self, weight, bias, mean, invstd, epsilon); - } else { - batch_norm_elemt_cuda_template(output, self, weight, bias, mean, invstd, epsilon); - } + batch_norm_elemt_cuda_template(output, self, weight, bias, mean, invstd, epsilon); } - }); + } }); return output; } @@ -137,95 +129,87 @@ std::tuple batch_norm_gather_stats_with_counts_cuda(const Tensor const Tensor& running_var, double momentum, double epsilon, const Tensor& counts) { return AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, running_mean.scalar_type(), "batch_norm_update_stats_cuda", [&] { - AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "batch_norm_update_stats_cuda", [&] { - using accscalar_t = at::acc_type; - if (cuda::detail::canUse32BitIndexMath(self)) { - return batch_norm_gather_stats_cuda_template(mean, invstd, running_mean, running_var, momentum, epsilon, counts); - } else { - return batch_norm_gather_stats_cuda_template(mean, invstd, running_mean, running_var, momentum, epsilon, counts); - } - }); + using accscalar_t = at::acc_type; + if (cuda::detail::canUse32BitIndexMath(self)) { + return batch_norm_gather_stats_cuda_template(mean, invstd, running_mean, running_var, momentum, epsilon, counts); + } else { + return batch_norm_gather_stats_cuda_template(mean, invstd, running_mean, running_var, momentum, epsilon, counts); + } }); } std::tuple batch_norm_backward_reduce_cuda(const Tensor& self, const Tensor& input, const Tensor& mean, const Tensor& invstd, const Tensor& weight, bool input_g, bool weight_g, bool bias_g) { return AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "batch_norm_backward_reduce", [&] { - AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "batch_norm_backward_reduce", [&] { - auto mean_st = mean.dtype(); - auto invstd_st = invstd.dtype(); - TORCH_CHECK(mean_st == invstd_st, "mean and invstd need to have the same data types"); - bool is_half_float = std::is_same::value && mean_st == at::kFloat; - bool is_bfloat16_float = std::is_same::value && mean_st == at::kFloat; - if (cuda::detail::canUse32BitIndexMath(self)) { - if (is_half_float || is_bfloat16_float) { - return batch_norm_backward_reduce_cuda_template(self, input, mean, invstd, weight, input_g, weight_g, bias_g); - } else { - return batch_norm_backward_reduce_cuda_template(self, input, mean, invstd, weight, input_g, weight_g, bias_g); - } + auto mean_st = mean.dtype(); + auto invstd_st = invstd.dtype(); + TORCH_CHECK(mean_st == invstd_st, "mean and invstd need to have the same data types"); + bool is_half_float = std::is_same::value && mean_st == at::kFloat; + bool is_bfloat16_float = std::is_same::value && mean_st == at::kFloat; + if (cuda::detail::canUse32BitIndexMath(self)) { + if (is_half_float || is_bfloat16_float) { + return 
batch_norm_backward_reduce_cuda_template(self, input, mean, invstd, weight, input_g, weight_g, bias_g); + } else { + return batch_norm_backward_reduce_cuda_template(self, input, mean, invstd, weight, input_g, weight_g, bias_g); + } + } else { + if (is_half_float || is_bfloat16_float) { + return batch_norm_backward_reduce_cuda_template(self, input, mean, invstd, weight, input_g, weight_g, bias_g); } else { - if (is_half_float || is_bfloat16_float) { - return batch_norm_backward_reduce_cuda_template(self, input, mean, invstd, weight, input_g, weight_g, bias_g); - } else { - return batch_norm_backward_reduce_cuda_template(self, input, mean, invstd, weight, input_g, weight_g, bias_g); - } + return batch_norm_backward_reduce_cuda_template(self, input, mean, invstd, weight, input_g, weight_g, bias_g); } - }); + } }); } Tensor batch_norm_backward_elemt_cuda(const Tensor& self, const Tensor& input, const Tensor& mean, const Tensor& invstd, const Tensor& weight, const Tensor& mean_dy, const Tensor& mean_dy_xmu) { return AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "batch_norm_backward_elemt", [&] { - AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "batch_norm_backward_elemt", [&] { - auto mean_st = mean.dtype(); - auto invstd_st = invstd.dtype(); - TORCH_CHECK(mean_st == invstd_st, "mean and invstd need to have the same data types"); - bool is_half_float = std::is_same::value && mean_st == at::kFloat; - bool is_bfloat16_float = std::is_same::value && mean_st == at::kFloat; - if (cuda::detail::canUse32BitIndexMath(self)) { - if (is_half_float || is_bfloat16_float) { - return batch_norm_backward_elemt_cuda_template(self, input, mean, invstd, weight, mean_dy, mean_dy_xmu); - } else { - return batch_norm_backward_elemt_cuda_template(self, input, mean, invstd, weight, mean_dy, mean_dy_xmu); - } + auto mean_st = mean.dtype(); + auto invstd_st = invstd.dtype(); + TORCH_CHECK(mean_st == invstd_st, "mean and invstd need to have the same data types"); + bool is_half_float = std::is_same::value && mean_st == at::kFloat; + bool is_bfloat16_float = std::is_same::value && mean_st == at::kFloat; + if (cuda::detail::canUse32BitIndexMath(self)) { + if (is_half_float || is_bfloat16_float) { + return batch_norm_backward_elemt_cuda_template(self, input, mean, invstd, weight, mean_dy, mean_dy_xmu); } else { - if (is_half_float || is_bfloat16_float) { - return batch_norm_backward_elemt_cuda_template(self, input, mean, invstd, weight, mean_dy, mean_dy_xmu); - } else { - return batch_norm_backward_elemt_cuda_template(self, input, mean, invstd, weight, mean_dy, mean_dy_xmu); - } + return batch_norm_backward_elemt_cuda_template(self, input, mean, invstd, weight, mean_dy, mean_dy_xmu); } - }); + } else { + if (is_half_float || is_bfloat16_float) { + return batch_norm_backward_elemt_cuda_template(self, input, mean, invstd, weight, mean_dy, mean_dy_xmu); + } else { + return batch_norm_backward_elemt_cuda_template(self, input, mean, invstd, weight, mean_dy, mean_dy_xmu); + } + } }); } std::tuple batch_norm_update_stats_cuda( const Tensor& self, const Tensor& running_mean, const Tensor& running_var, double momentum) { return AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "batch_norm_backward", [&] { - AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "batch_norm_backward", [&] { - auto mean_st = running_mean.dtype(); - auto var_st = running_var.dtype(); - TORCH_CHECK(mean_st == var_st, "running_mean and running_var need to have the same data 
types"); - // Some workloads depend on passing in half input and float stats, which is - // usually handled by cuDNN. However, the JIT sometimes replaces cuDNN calls with this - // one so it needs to support the same case, or people start to complain. - bool is_half_float = std::is_same::value && mean_st == at::kFloat; - bool is_bfloat16_float = std::is_same::value && mean_st == at::kFloat; - if (cuda::detail::canUse32BitIndexMath(self)) { - if (is_half_float || is_bfloat16_float) { - return batch_norm_update_stats_cuda_template(self, running_mean, running_var, momentum); - } else { - return batch_norm_update_stats_cuda_template(self, running_mean, running_var, momentum); - } + auto mean_st = running_mean.dtype(); + auto var_st = running_var.dtype(); + TORCH_CHECK(mean_st == var_st, "running_mean and running_var need to have the same data types"); + // Some workloads depend on passing in half input and float stats, which is + // usually handled by cuDNN. However, the JIT sometimes replaces cuDNN calls with this + // one so it needs to support the same case, or people start to complain. + bool is_half_float = std::is_same::value && mean_st == at::kFloat; + bool is_bfloat16_float = std::is_same::value && mean_st == at::kFloat; + if (cuda::detail::canUse32BitIndexMath(self)) { + if (is_half_float || is_bfloat16_float) { + return batch_norm_update_stats_cuda_template(self, running_mean, running_var, momentum); + } else { + return batch_norm_update_stats_cuda_template(self, running_mean, running_var, momentum); + } + } else { + if (is_half_float || is_bfloat16_float) { + return batch_norm_update_stats_cuda_template(self, running_mean, running_var, momentum); } else { - if (is_half_float || is_bfloat16_float) { - return batch_norm_update_stats_cuda_template(self, running_mean, running_var, momentum); - } else { - return batch_norm_update_stats_cuda_template(self, running_mean, running_var, momentum); - } + return batch_norm_update_stats_cuda_template(self, running_mean, running_var, momentum); } - }); + } }); } diff --git a/test/test_nn.py b/test/test_nn.py index a966d6a1f68f..67412d54eed9 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -12340,7 +12340,6 @@ def test_batchnorm_eval(self, device): self._test_batchnorm_eval(device) @onlyCUDA - @skipCUDAIfNotRocm def test_batchnorm_eval_bfloat16(self, device): self._test_batchnorm_eval(device, torch.bfloat16) From 6000481473d3d04395723967e719c3add58029d7 Mon Sep 17 00:00:00 2001 From: Wang Xu Date: Tue, 8 Dec 2020 14:44:06 -0800 Subject: [PATCH 049/250] add a unit test for large node error (#48938) Summary: add a unit test to test the situation where a node is too large to fit into any device Pull Request resolved: https://github.com/pytorch/pytorch/pull/48938 Reviewed By: zhangguanheng66 Differential Revision: D25402967 Pulled By: scottxu0730 fbshipit-source-id: a2e2a3dc70d139fa678865ef03e67fa57eff4a1d --- test/test_fx_experimental.py | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/test/test_fx_experimental.py b/test/test_fx_experimental.py index 57201ded332e..0ecb44be6204 100644 --- a/test/test_fx_experimental.py +++ b/test/test_fx_experimental.py @@ -161,6 +161,37 @@ def forward(self, a, b): catch_runtime_error = True assert catch_runtime_error + def test_large_node_error(self): + class TestModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.linear = torch.nn.Linear(4, 4) + + def forward(self, a): + linear = self.linear(a) + add = linear + a + return add + + m = 
TestModule() + traced = symbolic_trace(m) + a = torch.rand(4) + graph_manipulation.get_size_of_all_nodes(traced, [a]) + partitioner = Partitioner() + devices = [ + Device("dev_0", 40, 0), + Device("dev_1", 40, 0), + Device("dev_2", 40, 0), + Device("dev_3", 40, 0), + Device("dev_4", 40, 0) + ] + partitioner_config = PartitionerConfig(devices, PartitionMode.size_based) + catch_runtime_error = False + try: + ret = partitioner.partition_graph(traced, m, partitioner_config) + except RuntimeError: + catch_runtime_error = True + assert catch_runtime_error + def test_partition_node_manipulation(self): class TestModule(torch.nn.Module): def forward(self, a, b): @@ -187,7 +218,6 @@ def forward(self, a, b): partition.remove_node(selected_node) assert(partition.used_mem_bytes == 80) - def test_size_based_partition(self): class TestModule(torch.nn.Module): def __init__(self): From 02b63858f21fdc34daf495b92da4af4ca0066496 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Tue, 8 Dec 2020 14:46:41 -0800 Subject: [PATCH 050/250] [CUDAExtension] support all visible cards when building a cudaextension (#48891) Summary: Currently CUDAExtension assumes that all cards are of the same type on the same machine and builds the extension with compute capability of the 0th card. This breaks later at runtime if the machine has cards of different types. Specifically resulting in: ``` RuntimeError: CUDA error: no kernel image is available for execution on the device ``` when the cards of the types that weren't compiled for are used. (and the error is far from telling what the problem is to the uninitiated) My current setup is: ``` $ CUDA_VISIBLE_DEVICES=0 python -c "import torch; print(torch.cuda.get_device_capability())" (8, 6) $ CUDA_VISIBLE_DEVICES=1 python -c "import torch; print(torch.cuda.get_device_capability())" (6, 1) ``` but the extension was getting built with `-gencode=arch=compute_80,code=sm_80`. This PR: * [x] introduces a loop over all visible at build time devices to ensure the extension will run on all of them (it sorts the new list generated by the loop, so that the output is easier to debug should a card with lower capacity come last) * [x] adds `+PTX` to the last entry of ccs derived from local cards (`if not _arch_list:`) to support other archs * [x] adds a digest of my conversation with ptrblck on slack in the form of docs which hopefully can help others know which archs to support, how to override defaults, when and how to add PTX, etc. Please kindly review that my prose is clear and easy to understand. ptrblck Pull Request resolved: https://github.com/pytorch/pytorch/pull/48891 Reviewed By: ngimel Differential Revision: D25358285 Pulled By: ezyang fbshipit-source-id: 8160f3adebffbc8e592ddfcc3adf153a9dc91557 --- torch/utils/cpp_extension.py | 57 +++++++++++++++++++++++++++++------- 1 file changed, 47 insertions(+), 10 deletions(-) diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py index b84ebe95d525..7837d8cbb570 100644 --- a/torch/utils/cpp_extension.py +++ b/torch/utils/cpp_extension.py @@ -828,6 +828,35 @@ def CUDAExtension(name, sources, *args, **kwargs): cmdclass={ 'build_ext': BuildExtension }) + + Compute capabilities: + + By default the extension will be compiled to run on all archs of the cards visible during the + building process of the extension, plus PTX. If down the road a new card is installed the + extension may need to be recompiled. 
If a visible card has a compute capability (CC) that's + newer than the newest version for which your nvcc can build fully-compiled binaries, Pytorch + will make nvcc fall back to building kernels with the newest version of PTX your nvcc does + support (see below for details on PTX). + + You can override the default behavior using `TORCH_CUDA_ARCH_LIST` to explicitly specify which + CCs you want the extension to support: + + TORCH_CUDA_ARCH_LIST="6.1 8.6" python build_my_extension.py + TORCH_CUDA_ARCH_LIST="5.2 6.0 6.1 7.0 7.5 8.0 8.6+PTX" python build_my_extension.py + + The +PTX option causes extension kernel binaries to include PTX instructions for the specified + CC. PTX is an intermediate representation that allows kernels to runtime-compile for any CC >= + the specified CC (for example, 8.6+PTX generates PTX that can runtime-compile for any GPU with + CC >= 8.6). This improves your binary's forward compatibility. However, relying on older PTX to + provide forward compat by runtime-compiling for newer CCs can modestly reduce performance on + those newer CCs. If you know exact CC(s) of the GPUs you want to target, you're always better + off specifying them individually. For example, if you want your extension to run on 8.0 and 8.6, + "8.0+PTX" would work functionally because it includes PTX that can runtime-compile for 8.6, but + "8.0 8.6" would be better. + + Note that while it's possible to include all supported archs, the more archs get included the + slower the building process will be, as it will build a separate kernel image for each arch. + ''' library_dirs = kwargs.get('library_dirs', []) library_dirs += library_paths(cuda=True) @@ -1496,16 +1525,24 @@ def _get_cuda_arch_flags(cflags: Optional[List[str]] = None) -> List[str]: # If not given, determine what's best for the GPU / CUDA version that can be found if not _arch_list: - capability = torch.cuda.get_device_capability() - supported_sm = [int(arch.split('_')[1]) - for arch in torch.cuda.get_arch_list() if 'sm_' in arch] - max_supported_sm = max((sm // 10, sm % 10) for sm in supported_sm) - # Capability of the device may be higher than what's supported by the user's - # NVCC, causing compilation error. User's NVCC is expected to match the one - # used to build pytorch, so we use the maximum supported capability of pytorch - # to clamp the capability. - capability = min(max_supported_sm, capability) - arch_list = [f'{capability[0]}.{capability[1]}'] + arch_list = [] + # the assumption is that the extension should run on any of the currently visible cards, + # which could be of different types - therefore all archs for visible cards should be included + for i in range(torch.cuda.device_count()): + capability = torch.cuda.get_device_capability(i) + supported_sm = [int(arch.split('_')[1]) + for arch in torch.cuda.get_arch_list() if 'sm_' in arch] + max_supported_sm = max((sm // 10, sm % 10) for sm in supported_sm) + # Capability of the device may be higher than what's supported by the user's + # NVCC, causing compilation error. User's NVCC is expected to match the one + # used to build pytorch, so we use the maximum supported capability of pytorch + # to clamp the capability. 
+ capability = min(max_supported_sm, capability) + arch = f'{capability[0]}.{capability[1]}' + if arch not in arch_list: + arch_list.append(arch) + arch_list = sorted(arch_list) + arch_list[-1] += '+PTX' else: # Deal with lists that are ' ' separated (only deal with ';' after) _arch_list = _arch_list.replace(' ', ';') From e538bd669509e822db10309b3f3c4e9f8f41c860 Mon Sep 17 00:00:00 2001 From: peterjc123 Date: Tue, 8 Dec 2020 14:47:24 -0800 Subject: [PATCH 051/250] [collect_env] Add candidate paths for nvidia-smi on Windows (#49021) Summary: Recently, Nvidia tries to put nvidia-smi under SystemRoot. Pull Request resolved: https://github.com/pytorch/pytorch/pull/49021 Reviewed By: zhangguanheng66 Differential Revision: D25399831 Pulled By: ezyang fbshipit-source-id: b1ea12452012e0a3fb4703996b6104e7115a8a7f --- torch/utils/collect_env.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/torch/utils/collect_env.py b/torch/utils/collect_env.py index cde58f85cada..3fac12f60774 100644 --- a/torch/utils/collect_env.py +++ b/torch/utils/collect_env.py @@ -70,7 +70,7 @@ def run_and_parse_first_match(run_lambda, command, regex): def get_conda_packages(run_lambda): if get_platform() == 'win32': - system_root = os.environ.get('SystemRoot', 'C:\\Windows') + system_root = os.environ.get('SYSTEMROOT', 'C:\\Windows') findstr_cmd = os.path.join(system_root, 'System32', 'findstr') grep_cmd = r'{} /R "torch numpy cudatoolkit soumith mkl magma"'.format(findstr_cmd) else: @@ -125,7 +125,7 @@ def get_running_cuda_version(run_lambda): def get_cudnn_version(run_lambda): """This will return a list of libcudnn.so; it's hard to tell which one is being used""" if get_platform() == 'win32': - system_root = os.environ.get('SystemRoot', 'C:\\Windows') + system_root = os.environ.get('SYSTEMROOT', 'C:\\Windows') cuda_path = os.environ.get('CUDA_PATH', "%CUDA_PATH%") where_cmd = os.path.join(system_root, 'System32', 'where') cudnn_cmd = '{} /R "{}\\bin" cudnn*.dll'.format(where_cmd, cuda_path) @@ -163,7 +163,15 @@ def get_nvidia_smi(): # Note: nvidia-smi is currently available only on Windows and Linux smi = 'nvidia-smi' if get_platform() == 'win32': - smi = '"C:\\Program Files\\NVIDIA Corporation\\NVSMI\\%s"' % smi + system_root = os.environ.get('SYSTEMROOT', 'C:\\Windows') + program_files_root = os.environ.get('PROGRAMFILES', 'C:\\Program Files') + legacy_path = os.path.join(program_files_root, 'NVIDIA Corporation', 'NVSMI', smi) + new_path = os.path.join(system_root, 'System32', smi) + smis = [new_path, legacy_path] + for candidate_smi in smis: + if os.path.exists(candidate_smi): + smi = f'"{candidate_smi}"' + break return smi @@ -185,7 +193,7 @@ def get_mac_version(run_lambda): def get_windows_version(run_lambda): - system_root = os.environ.get('SystemRoot', 'C:\\Windows') + system_root = os.environ.get('SYSTEMROOT', 'C:\\Windows') wmic_cmd = os.path.join(system_root, 'System32', 'Wbem', 'wmic') findstr_cmd = os.path.join(system_root, 'System32', 'findstr') return run_and_read_all(run_lambda, '{} os get Caption | {} /v Caption'.format(wmic_cmd, findstr_cmd)) @@ -236,7 +244,7 @@ def get_pip_packages(run_lambda): # People generally have `pip` as `pip` or `pip3` def run_with_pip(pip): if get_platform() == 'win32': - system_root = os.environ.get('SystemRoot', 'C:\\Windows') + system_root = os.environ.get('SYSTEMROOT', 'C:\\Windows') findstr_cmd = os.path.join(system_root, 'System32', 'findstr') grep_cmd = r'{} /R "numpy torch"'.format(findstr_cmd) else: From 
17e71509a6a6646c96d77c8cf81c8cf20ec53a17 Mon Sep 17 00:00:00 2001 From: Vasiliy Kuznetsov Date: Tue, 8 Dec 2020 15:33:13 -0800 Subject: [PATCH 052/250] fx quant: quick cleanup for model_device (#48906) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48906 As titled, removing some code which is no longer needed after refactors. Test Plan: CI Imported from OSS Reviewed By: jerryzh168 Differential Revision: D25363079 fbshipit-source-id: 9e4bcf63f4f1c2a2d3fb734688ba593d72495349 --- torch/quantization/fx/quantize.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/torch/quantization/fx/quantize.py b/torch/quantization/fx/quantize.py index fe7dc53a8019..990954842776 100644 --- a/torch/quantization/fx/quantize.py +++ b/torch/quantization/fx/quantize.py @@ -246,7 +246,7 @@ def get_qconfig(modules, qconfig_dict, module_name, global_qconfig): def insert_observer( node: Node, observer: torch.quantization.ObserverBase, - model_device: Any, model: torch.nn.Module, + model: torch.nn.Module, activation_post_process_map: Dict[str, torch.quantization.ObserverBase], env: Dict[Any, Any], observed_graph: Graph, load_arg: Callable, observed_node_names_set: Set[str]): @@ -257,6 +257,7 @@ def insert_observer( observer: observer/fake_quantize module instance """ # respect device affinity when adding observers + model_device = assert_and_get_unique_device(model) if model_device: observer.to(model_device) # add observer module as attribute @@ -313,7 +314,6 @@ def insert_observer_for_output_of_the_node( modules: Dict[str, torch.nn.Module], model: torch.nn.Module, pattern: Any, - model_device: Any, activation_post_process_map: Dict[str, torch.quantization.ObserverBase], env: Dict[Any, Any], observed_graph: Graph, @@ -338,7 +338,7 @@ def insert_observer_for_output_of_the_node( "activation_post_process constructor not provided " + \ "for pattern:" + str(pattern) insert_observer( - node, activation_post_process_ctr(), model_device, + node, activation_post_process_ctr(), model, activation_post_process_map, env, observed_graph, load_arg, observed_node_names_set) elif (isinstance(quantize_handler, @@ -386,13 +386,13 @@ def input_is_observed(arg): # observer for outputs new_observer = qconfig.activation() insert_observer( - node, new_observer, model_device, model, + node, new_observer, model, activation_post_process_map, env, observed_graph, load_arg, observed_node_names_set) def insert_observer_for_input_arg_of_observed_node( node: Node, observed_node_names_set: Set[str], quants: Dict[str, Any], - model_device: Any, model: torch.nn.Module, + model: torch.nn.Module, activation_post_process_map: Dict[str, torch.quantization.ObserverBase], env: Dict[str, str], observed_graph: Graph, load_arg: Callable): @@ -401,7 +401,7 @@ def insert_observer_for_input_arg_of_observed_node( if activation_post_process_ctr is not None: insert_observer( node, activation_post_process_ctr(), - model_device, model, activation_post_process_map, + model, activation_post_process_map, env, observed_graph, load_arg, observed_node_names_set) # A dictionary for querying the weight index for a given op @@ -565,7 +565,6 @@ def load_arg(a): get_new_observer_name = get_new_attr_name_with_prefix( 'activation_post_process_') - model_device = assert_and_get_unique_device(model) result_node : Optional[Node] = None for node in model.graph.nodes: @@ -591,14 +590,14 @@ def load_arg(a): node) insert_observer_for_output_of_the_node( node, obj, qconfig, self.modules, model, pattern, - model_device, 
self.activation_post_process_map, env, + self.activation_post_process_map, env, observed_graph, load_arg, observed_node_names_set, matched_nodes) else: env[node.name] = observed_graph.node_copy(node, load_arg) insert_observer_for_input_arg_of_observed_node( node, observed_node_names_set, quants, - model_device, model, self.activation_post_process_map, env, + model, self.activation_post_process_map, env, observed_graph, load_arg) From 2668ea8087d1c51e64229b369c63aff55135e67a Mon Sep 17 00:00:00 2001 From: Vasiliy Kuznetsov Date: Tue, 8 Dec 2020 15:33:13 -0800 Subject: [PATCH 053/250] fx quant: move qconfig utils to utils file (#48907) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48907 Improving readability Test Plan: CI Imported from OSS Reviewed By: jerryzh168 Differential Revision: D25363078 fbshipit-source-id: 6b0161db14ccf8c3b47edf4fc760ca9a399254b2 --- torch/quantization/fx/qconfig_utils.py | 89 +++++++++++++++++++++++++ torch/quantization/fx/quantize.py | 91 +------------------------- 2 files changed, 91 insertions(+), 89 deletions(-) create mode 100644 torch/quantization/fx/qconfig_utils.py diff --git a/torch/quantization/fx/qconfig_utils.py b/torch/quantization/fx/qconfig_utils.py new file mode 100644 index 000000000000..6326a2e0da59 --- /dev/null +++ b/torch/quantization/fx/qconfig_utils.py @@ -0,0 +1,89 @@ +from .utils import _parent_name +from collections import OrderedDict +import re + +def get_flattened_qconfig_dict(qconfig_dict): + """ flatten the global, object_type and module_name qconfig + to the same qconfig_dict so that it can be used by + propagate_qconfig_ function. + "module_name_regex" is ignored for now since it's not supported + in propagate_qconfig_, but it can be fixed later. + + For example: + Input: { + "": qconfig, + "object_type": [ + (torch.add, qconfig) + ], + "module_name": [ + ("conv", qconfig) + ] + } + + Output: { + "": qconfig, + torch.add: qconfig, + "conv": qconfig + } + """ + flattened = dict() + if '' in qconfig_dict: + flattened[''] = qconfig_dict[''] + + def flatten_key(key): + if key in qconfig_dict: + for obj, qconfig in qconfig_dict[key]: + flattened[obj] = qconfig + + flatten_key('object_type') + flatten_key('module_name') + return flattened + +def convert_dict_to_ordered_dict(qconfig_dict): + """ Convert dict in qconfig_dict to ordered dict + """ + # convert a qconfig list for a type to OrderedDict + def _convert_to_ordered_dict(key, qconfig_dict): + qconfig_dict[key] = OrderedDict(qconfig_dict.get(key, [])) + + _convert_to_ordered_dict('object_type', qconfig_dict) + _convert_to_ordered_dict('module_name_regex', qconfig_dict) + _convert_to_ordered_dict('module_name', qconfig_dict) + +def get_module_type_qconfig(qconfig_dict, module_type, fallback_qconfig): + return qconfig_dict['object_type'].get( + module_type, fallback_qconfig) + +def get_function_qconfig(qconfig_dict, function, fallback_qconfig): + return qconfig_dict['object_type'].get(function, fallback_qconfig) + +def get_module_name_regex_qconfig(qconfig_dict, module_name, fallback_qconfig): + for regex_pattern, qconfig in \ + qconfig_dict['module_name_regex'].items(): + if re.match(regex_pattern, module_name): + # first match wins + return qconfig + return fallback_qconfig + +def get_module_name_qconfig(qconfig_dict, module_name, fallback_qconfig): + if module_name == '': + # module name qconfig not found + return fallback_qconfig + if module_name in qconfig_dict['module_name']: + return qconfig_dict['module_name'][module_name] + else: + parent, _ = 
_parent_name(module_name) + return get_module_name_qconfig(qconfig_dict, parent, fallback_qconfig) + +# get qconfig for module_name, +# fallback to module_name_regex_qconfig, module_type_qconfig, +# global_qconfig if necessary +def get_qconfig(modules, qconfig_dict, module_name, global_qconfig): + assert modules is not None + module_type_qconfig = get_module_type_qconfig( + qconfig_dict, type(modules[module_name]), global_qconfig) + module_name_regex_qconfig = get_module_name_regex_qconfig( + qconfig_dict, module_name, module_type_qconfig) + module_name_qconfig = get_module_name_qconfig( + qconfig_dict, module_name, module_name_regex_qconfig) + return module_name_qconfig diff --git a/torch/quantization/fx/quantize.py b/torch/quantization/fx/quantize.py index 990954842776..73604bc3591e 100644 --- a/torch/quantization/fx/quantize.py +++ b/torch/quantization/fx/quantize.py @@ -53,9 +53,9 @@ get_custom_module_class_keys, ) -from collections import OrderedDict +from .qconfig_utils import * + import warnings -import re from typing import Optional, Dict, Any, List, Union, Tuple, Set, Callable @@ -157,93 +157,6 @@ def is_observed_standalone_module_node( return node.op == 'call_module' and \ is_observed_standalone_module(modules[node.target]) # type: ignore - -def get_flattened_qconfig_dict(qconfig_dict): - """ flatten the global, object_type and module_name qconfig - to the same qconfig_dict so that it can be used by - propagate_qconfig_ function. - "module_name_regex" is ignored for now since it's not supported - in propagate_qconfig_, but it can be fixed later. - - For example: - Input: { - "": qconfig, - "object_type": [ - (torch.add, qconfig) - ], - "module_name": [ - ("conv", qconfig) - ] - } - - Output: { - "": qconfig, - torch.add: qconfig, - "conv": qconfig - } - """ - flattened = dict() - if '' in qconfig_dict: - flattened[''] = qconfig_dict[''] - - def flatten_key(key): - if key in qconfig_dict: - for obj, qconfig in qconfig_dict[key]: - flattened[obj] = qconfig - - flatten_key('object_type') - flatten_key('module_name') - return flattened - -def convert_dict_to_ordered_dict(qconfig_dict): - """ Convert dict in qconfig_dict to ordered dict - """ - # convert a qconfig list for a type to OrderedDict - def _convert_to_ordered_dict(key, qconfig_dict): - qconfig_dict[key] = OrderedDict(qconfig_dict.get(key, [])) - - _convert_to_ordered_dict('object_type', qconfig_dict) - _convert_to_ordered_dict('module_name_regex', qconfig_dict) - _convert_to_ordered_dict('module_name', qconfig_dict) - -def get_module_type_qconfig(qconfig_dict, module_type, fallback_qconfig): - return qconfig_dict['object_type'].get( - module_type, fallback_qconfig) - -def get_function_qconfig(qconfig_dict, function, fallback_qconfig): - return qconfig_dict['object_type'].get(function, fallback_qconfig) - -def get_module_name_regex_qconfig(qconfig_dict, module_name, fallback_qconfig): - for regex_pattern, qconfig in \ - qconfig_dict['module_name_regex'].items(): - if re.match(regex_pattern, module_name): - # first match wins - return qconfig - return fallback_qconfig - -def get_module_name_qconfig(qconfig_dict, module_name, fallback_qconfig): - if module_name == '': - # module name qconfig not found - return fallback_qconfig - if module_name in qconfig_dict['module_name']: - return qconfig_dict['module_name'][module_name] - else: - parent, _ = _parent_name(module_name) - return get_module_name_qconfig(qconfig_dict, parent, fallback_qconfig) - -# get qconfig for module_name, -# fallback to module_name_regex_qconfig, 
module_type_qconfig, -# global_qconfig if necessary -def get_qconfig(modules, qconfig_dict, module_name, global_qconfig): - assert modules is not None - module_type_qconfig = get_module_type_qconfig( - qconfig_dict, type(modules[module_name]), global_qconfig) - module_name_regex_qconfig = get_module_name_regex_qconfig( - qconfig_dict, module_name, module_type_qconfig) - module_name_qconfig = get_module_name_qconfig( - qconfig_dict, module_name, module_name_regex_qconfig) - return module_name_qconfig - def insert_observer( node: Node, observer: torch.quantization.ObserverBase, model: torch.nn.Module, From d033e185ed212a67a471881c1bc9308b769969f8 Mon Sep 17 00:00:00 2001 From: Vasiliy Kuznetsov Date: Tue, 8 Dec 2020 15:33:13 -0800 Subject: [PATCH 054/250] fx quant: move more functions to utils (#48908) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48908 No logic change, improving readability Test Plan: CI Imported from OSS Reviewed By: jerryzh168 Differential Revision: D25363080 fbshipit-source-id: 1d73a875bd7abf671b544ebc835432fea5306dc3 --- torch/quantization/fx/quantize.py | 98 +++---------------------------- torch/quantization/fx/utils.py | 91 ++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+), 89 deletions(-) diff --git a/torch/quantization/fx/quantize.py b/torch/quantization/fx/quantize.py index 73604bc3591e..7da165b52309 100644 --- a/torch/quantization/fx/quantize.py +++ b/torch/quantization/fx/quantize.py @@ -51,6 +51,10 @@ _parent_name, quantize_node, get_custom_module_class_keys, + get_new_attr_name_with_prefix, + collect_producer_nodes, + graph_module_from_producer_nodes, + assert_and_get_unique_device, ) from .qconfig_utils import * @@ -70,93 +74,6 @@ # Helper Functions # ------------------------ -# Returns a function that can get a new attribute name for module with given -# prefix, for example, -# >> get_new_observer_name = get_new_attr_name_with_prefix('_observer') -# >> new_name = get_new_observer_name(module) -# new_name will be an unused attribute name on module, e.g. `_observer_1` -def get_new_attr_name_with_prefix(prefix: str) -> Callable: - def get_new_attr_name(module: torch.nn.Module): - def get_attr_name(i: int): - return prefix + str(i) - i = 0 - attr_name = get_attr_name(i) - while hasattr(module, attr_name): - i += 1 - attr_name = get_attr_name(i) - return attr_name - return get_new_attr_name - -def collect_producer_nodes(node: Node) -> Optional[List[Node]]: - r''' Starting from a target node, trace back until we hit inpu or - getattr node. This is used to extract the chain of operators - starting from getattr to the target node, for example - def forward(self, x): - observed = self.observer(self.weight) - return F.linear(x, observed) - collect_producer_nodes(observed) will either return a list of nodes that - produces the observed node or None if we can't extract a self contained - graph without free variables(inputs of the forward function). 
- ''' - nodes = [node] - frontier = [node] - while frontier: - node = frontier.pop() - all_args = list(node.args) + list(node.kwargs.values()) - for arg in all_args: - if not isinstance(arg, Node): - continue - if arg.op == 'placeholder': - # hit input, can't fold in this case - return None - nodes.append(arg) - if not (arg.op == 'call_function' and arg.target == getattr): - frontier.append(arg) - return nodes - -def graph_module_from_producer_nodes( - root: GraphModule, producer_nodes: List[Node]) -> GraphModule: - r''' Construct a graph module from extracted producer nodes - from `collect_producer_nodes` function - Args: - root: the root module for the original graph - producer_nodes: a list of nodes we use to construct the graph - Return: - A graph module constructed from the producer nodes - ''' - assert len(producer_nodes) > 0, 'list of producer nodes can not be empty' - # since we traced back from node to getattrr - producer_nodes.reverse() - graph = Graph() - env: Dict[Any, Any] = {} - - def load_arg(a): - return map_arg(a, lambda node: env[node]) - for producer_node in producer_nodes: - env[producer_node] = graph.node_copy(producer_node, load_arg) - graph.output(load_arg(producer_nodes[-1])) - graph_module = GraphModule(root, graph) - return graph_module - -def assert_and_get_unique_device(module: torch.nn.Module) -> Any: - """ - Returns the unique device for a module, or None if no device is found. - Throws an error if multiple devices are detected. - """ - devices = {p.device for p in module.parameters()} | \ - {p.device for p in module.buffers()} - assert len(devices) <= 1, ( - "prepare only works with cpu or single-device CUDA modules, " - "but got devices {}".format(devices) - ) - device = next(iter(devices)) if len(devices) > 0 else None - return device - -def is_observed_standalone_module_node( - node: Node, modules: Dict[str, torch.nn.Module]) -> bool: - return node.op == 'call_module' and \ - is_observed_standalone_module(modules[node.target]) # type: ignore - def insert_observer( node: Node, observer: torch.quantization.ObserverBase, model: torch.nn.Module, @@ -764,8 +681,11 @@ def insert_quantize_node(node): quantized = False else: assert obj is not None - is_standalone_module_node = is_observed_standalone_module_node( - node, self.modules) + is_standalone_module_node = ( + node.op == 'call_module' and + is_observed_standalone_module( + self.modules[node.target]) # type: ignore + ) result = obj.convert( self, node, load_arg, debug=debug, convert_custom_config_dict=convert_custom_config_dict) diff --git a/torch/quantization/fx/utils.py b/torch/quantization/fx/utils.py index a07cbc6ef8e4..c1f849803342 100644 --- a/torch/quantization/fx/utils.py +++ b/torch/quantization/fx/utils.py @@ -2,6 +2,15 @@ import torch from ..utils import is_per_tensor, is_per_channel +from torch.fx import GraphModule, map_arg + +from torch.fx.graph import ( + Graph, + Node, +) + +from typing import Callable, Optional, List, Dict, Any + # turn foo.bar -> ['foo', 'bar'] def _parent_name(target): r = target.rsplit('.', 1) @@ -169,3 +178,85 @@ def get_linear_prepack_op_for_dtype(dtype): return torch.ops.quantized.linear_prepack else: raise Exception("can't get linear prepack op for dtype:", dtype) + +# Returns a function that can get a new attribute name for module with given +# prefix, for example, +# >> get_new_observer_name = get_new_attr_name_with_prefix('_observer') +# >> new_name = get_new_observer_name(module) +# new_name will be an unused attribute name on module, e.g. 
`_observer_1` +def get_new_attr_name_with_prefix(prefix: str) -> Callable: + def get_new_attr_name(module: torch.nn.Module): + def get_attr_name(i: int): + return prefix + str(i) + i = 0 + attr_name = get_attr_name(i) + while hasattr(module, attr_name): + i += 1 + attr_name = get_attr_name(i) + return attr_name + return get_new_attr_name + +def collect_producer_nodes(node: Node) -> Optional[List[Node]]: + r''' Starting from a target node, trace back until we hit inpu or + getattr node. This is used to extract the chain of operators + starting from getattr to the target node, for example + def forward(self, x): + observed = self.observer(self.weight) + return F.linear(x, observed) + collect_producer_nodes(observed) will either return a list of nodes that + produces the observed node or None if we can't extract a self contained + graph without free variables(inputs of the forward function). + ''' + nodes = [node] + frontier = [node] + while frontier: + node = frontier.pop() + all_args = list(node.args) + list(node.kwargs.values()) + for arg in all_args: + if not isinstance(arg, Node): + continue + if arg.op == 'placeholder': + # hit input, can't fold in this case + return None + nodes.append(arg) + if not (arg.op == 'call_function' and arg.target == getattr): + frontier.append(arg) + return nodes + +def graph_module_from_producer_nodes( + root: GraphModule, producer_nodes: List[Node]) -> GraphModule: + r''' Construct a graph module from extracted producer nodes + from `collect_producer_nodes` function + Args: + root: the root module for the original graph + producer_nodes: a list of nodes we use to construct the graph + Return: + A graph module constructed from the producer nodes + ''' + assert len(producer_nodes) > 0, 'list of producer nodes can not be empty' + # since we traced back from node to getattrr + producer_nodes.reverse() + graph = Graph() + env: Dict[Any, Any] = {} + + def load_arg(a): + return map_arg(a, lambda node: env[node]) + for producer_node in producer_nodes: + env[producer_node] = graph.node_copy(producer_node, load_arg) + graph.output(load_arg(producer_nodes[-1])) + graph_module = GraphModule(root, graph) + return graph_module + +def assert_and_get_unique_device(module: torch.nn.Module) -> Any: + """ + Returns the unique device for a module, or None if no device is found. + Throws an error if multiple devices are detected. + """ + devices = {p.device for p in module.parameters()} | \ + {p.device for p in module.buffers()} + assert len(devices) <= 1, ( + "prepare only works with cpu or single-device CUDA modules, " + "but got devices {}".format(devices) + ) + device = next(iter(devices)) if len(devices) > 0 else None + return device From 3f9ff48ebb069e3fbab8af99088c584475ef6aa3 Mon Sep 17 00:00:00 2001 From: Meghan Lele Date: Tue, 8 Dec 2020 15:34:19 -0800 Subject: [PATCH 055/250] [JIT] Allow del statements with multiple targets (#48876) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48876 **Summary** This commit adds support for `del` statements with multiple targets. Targets are deleted left-to-right just like Python. **Test Plan** This commit updates the `TestBuiltins.test_del_multiple_operands` unit test to actually test that multiple deletion works instead of asserting that an error is thrown. **Fixes** This commit fixes #48635. 
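**Example (illustrative)**

A minimal sketch, mirroring the updated unit test, of a function that now scripts successfully (hypothetical snippet, not part of this change):

```python
import torch
from typing import List

@torch.jit.script
def del_multiple(x: List[int]) -> List[int]:
    # Multiple targets are deleted left-to-right, matching Python semantics.
    a, b, c = x[0], x[1], x[2]
    del a, b, c
    return x

print(del_multiple([1, 2, 3]))  # [1, 2, 3]
```

Each target may independently be a variable or a list/dict subscript.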
Test Plan: Imported from OSS Reviewed By: ZolotukhinM Differential Revision: D25386285 Pulled By: SplitInfinity fbshipit-source-id: c0fbd8206cf98b2bd1b695d0b778589d58965a74 --- test/jit/test_builtins.py | 37 ++++++----- torch/csrc/jit/frontend/ir_emitter.cpp | 68 +++++++++++---------- torch/csrc/jit/frontend/parser.cpp | 7 ++- torch/csrc/jit/frontend/tree_views.h | 8 +-- torch/csrc/jit/python/python_tree_views.cpp | 7 ++- torch/jit/frontend.py | 9 +-- 6 files changed, 73 insertions(+), 63 deletions(-) diff --git a/test/jit/test_builtins.py b/test/jit/test_builtins.py index dafc95013b96..04991f72c352 100644 --- a/test/jit/test_builtins.py +++ b/test/jit/test_builtins.py @@ -109,22 +109,31 @@ def fn(x): return a def test_del_multiple_operands(self): + def fn(x): + # type: (List[int]) -> List[int] + a, b, c = x[0], x[1], x[2] + del a, b, c + return x - with self.assertRaisesRegex(torch.jit.frontend.NotSupportedError, - "with more than one operand"): - @torch.jit.script - def del_list_multiple_operands(x): - # type: (List[int]) -> List[int] - del x[0], x[1] - return x + self.checkScript(fn, ([1, 2, 3],)) - with self.assertRaisesRegex(torch.jit.frontend.NotSupportedError, - "with more than one operand"): - @torch.jit.script - def del_dict_multiple_operands(x): - # type: (Dict[str, int]) -> Dict[str, int] - del x['hi'], x['there'] - return x + def del_list_multiple_operands(x): + # type: (List[int]) -> List[int] + del x[0], x[1] + return x + + py_out = del_list_multiple_operands([0, 1, 2]) + jit_out = torch.jit.script(del_list_multiple_operands)([0, 1, 2]) + self.assertEquals(py_out, jit_out) + + def del_dict_multiple_operands(x): + # type: (Dict[str, int]) -> Dict[str, int] + del x['hi'], x['there'] + return x + + py_out = del_dict_multiple_operands({"hi": 5, "there": 6}) + jit_out = torch.jit.script(del_dict_multiple_operands)({"hi": 5, "there": 6}) + self.assertEquals(py_out, jit_out) class TestTensorBuiltins(JitTestCase): diff --git a/torch/csrc/jit/frontend/ir_emitter.cpp b/torch/csrc/jit/frontend/ir_emitter.cpp index 43cc24e8e29e..a21041343eee 100644 --- a/torch/csrc/jit/frontend/ir_emitter.cpp +++ b/torch/csrc/jit/frontend/ir_emitter.cpp @@ -945,43 +945,45 @@ struct to_ir { } void emitDelete(const Delete& stmt) { - if (stmt.expr().kind() == TK_SUBSCRIPT) { - Subscript subscript(stmt.expr()); - const List& subscript_exprs = subscript.subscript_exprs(); - if (subscript_exprs[0].kind() == TK_SLICE_EXPR) { - throw ErrorReport(stmt.range()) - << "del statements only support deletion at a single index, " - "slicing is not supported" - " (see https://github.com/pytorch/pytorch/issues/31430)"; - } - const SugaredValuePtr sv = emitSugaredExpr(subscript.value(), 1); - const SourceRange& val_range = subscript.value().range(); - Value* idx = emitExpr(subscript_exprs[0]); - Value* val = sv->asValue(val_range, method); - - // If val is a class instance, this is a method call to a type-specific - // implementation of del defined in a __delitem__ method. 
- if (auto cls = val->type()->cast()) { - if (!cls->findMethod("__delitem__")) { - throw ErrorReport(stmt.range()) - << "Class does not define __delitem__"; + for (const auto& target : stmt.targets()) { + if (target.kind() == TK_SUBSCRIPT) { + Subscript subscript(target); + const List& subscript_exprs = subscript.subscript_exprs(); + if (subscript_exprs[0].kind() == TK_SLICE_EXPR) { + throw ErrorReport(target.range()) + << "del statements only support deletion at a single index, " + "slicing is not supported" + " (see https://github.com/pytorch/pytorch/issues/31430)"; } + const SugaredValuePtr sv = emitSugaredExpr(subscript.value(), 1); + const SourceRange& val_range = subscript.value().range(); + Value* idx = emitExpr(subscript_exprs[0]); + Value* val = sv->asValue(val_range, method); + + // If val is a class instance, this is a method call to a type-specific + // implementation of del defined in a __delitem__ method. + if (auto cls = val->type()->cast()) { + if (!cls->findMethod("__delitem__")) { + throw ErrorReport(target.range()) + << "Class does not define __delitem__"; + } - // Use MethodValue to call the method to handle recursion. - MethodValue(val, "__delitem__") - .call(stmt.range(), method, {idx}, {}, 0); + // Use MethodValue to call the method to handle recursion. + MethodValue(val, "__delitem__") + .call(stmt.range(), method, {idx}, {}, 0); + } else { + auto node = graph->create(aten::Delete, {val, idx}, 0) + ->setSourceRange(target.range()); + graph->insertNode(node); + } + } else if (target.kind() == TK_VAR) { + Var var(target); + environment_stack->removeVar(var.name(), /*check_if_removed=*/true); } else { - auto node = graph->create(aten::Delete, {val, idx}, 0) - ->setSourceRange(stmt.range()); - graph->insertNode(node); + throw ErrorReport(target.range()) + << "del statements are only supported for deleting" + " list and dict items and variables"; } - } else if (stmt.expr().kind() == TK_VAR) { - Var var(stmt.expr()); - environment_stack->removeVar(var.name(), /*check_if_removed=*/true); - } else { - throw ErrorReport(stmt.range()) - << "del statements are only supported for deleting" - " list and dict items and variables"; } } diff --git a/torch/csrc/jit/frontend/parser.cpp b/torch/csrc/jit/frontend/parser.cpp index 1f5e43fff149..c079e99893a7 100644 --- a/torch/csrc/jit/frontend/parser.cpp +++ b/torch/csrc/jit/frontend/parser.cpp @@ -558,10 +558,11 @@ struct ParserImpl { return parseFunction(/*is_method=*/in_class); } case TK_DELETE: { - L.expect(TK_DELETE); - auto expr = parseExp(); + auto range = L.next().range; + auto targets = + parseList(TK_NOTHING, ',', TK_NOTHING, &ParserImpl::parseExp); L.expect(TK_NEWLINE); - return Delete::create(expr); + return Delete::create(range, targets); } case TK_WITH: { return parseWith(); diff --git a/torch/csrc/jit/frontend/tree_views.h b/torch/csrc/jit/frontend/tree_views.h index e33d93f37566..389ed6d003db 100644 --- a/torch/csrc/jit/frontend/tree_views.h +++ b/torch/csrc/jit/frontend/tree_views.h @@ -1120,11 +1120,11 @@ struct Delete : public Stmt { explicit Delete(const TreeRef& tree) : Stmt(tree) { tree_->match(TK_DELETE); } - Expr expr() const { - return Expr(subtree(0)); + List targets() const { + return subtree(0); } - static Delete create(const Expr& value) { - return Delete(Compound::create(TK_DELETE, value.range(), {value})); + static Delete create(const SourceRange& range, const List& targets) { + return Delete(Compound::create(TK_DELETE, range, {targets})); } }; diff --git a/torch/csrc/jit/python/python_tree_views.cpp 
b/torch/csrc/jit/python/python_tree_views.cpp index 1355352c8278..1e622bda379a 100644 --- a/torch/csrc/jit/python/python_tree_views.cpp +++ b/torch/csrc/jit/python/python_tree_views.cpp @@ -200,9 +200,10 @@ void initTreeViewBindings(PyObject* module) { r, wrap_list(r, std::move(params)), wrap_maybe(r, return_type)); })); - py::class_(m, "Delete").def(py::init([](const Expr& expr) { - return Delete::create(expr); - })); + py::class_(m, "Delete") + .def(py::init([](const SourceRange& range, std::vector targets) { + return Delete::create(range, wrap_list(range, std::move(targets))); + })); py::class_(m, "WithItem") .def(py::init([](const SourceRange& range, const Expr& target, Var* var) { diff --git a/torch/jit/frontend.py b/torch/jit/frontend.py index 78c226ab1739..57a66dde3a4c 100644 --- a/torch/jit/frontend.py +++ b/torch/jit/frontend.py @@ -417,12 +417,9 @@ def build_AnnAssign(ctx, stmt): @staticmethod def build_Delete(ctx, stmt): - if len(stmt.targets) > 1: - source_range = ctx.make_range(stmt.lineno, stmt.col_offset, - stmt.col_offset + len("del")) - raise NotSupportedError( - source_range, 'del with more than one operand is not supported') - return Delete(build_expr(ctx, stmt.targets[0])) + r = ctx.make_range(stmt.lineno, stmt.col_offset, stmt.col_offset + len("del")) + + return Delete(r, [build_expr(ctx, target) for target in stmt.targets]) @staticmethod def build_Return(ctx, stmt): From 107c31f2f5dacc1cafc45c6aac113b5dcb38f698 Mon Sep 17 00:00:00 2001 From: Shiyan Deng Date: Tue, 8 Dec 2020 18:04:07 -0800 Subject: [PATCH 056/250] Add a pass to fetch attributes of nn.Module to fx.node (#47935) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/47935 Fetch the parameters that are needed for lowering from nn.Module to fx.node for leaf_modules. Test Plan: A test `test_fetch` is added to test_fx_experimental.py. 
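For reference, a rough usage sketch of the new pass (hypothetical toy module; the imports and the `attrs_for_lowering` field are the ones added in this diff):

```python
import torch
from torch.fx.symbolic_trace import symbolic_trace
from torch.fx.experimental.param_fetch import lift_lowering_attrs_to_nodes

class M(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = torch.nn.Conv2d(3, 3, 2)

    def forward(self, x):
        return self.conv(x)

gm = symbolic_trace(M())
lift_lowering_attrs_to_nodes(gm)
for node in gm.graph.nodes:
    if node.op == "call_module":
        # class name plus weight, bias, kernel_size, stride, padding, ...
        print(node.attrs_for_lowering.keys())
```

Module types missing from `module_fetch_book` raise a RuntimeError asking for the book to be extended.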
Reviewed By: jfix71 Differential Revision: D24957142 fbshipit-source-id: a349bb718bbcb7f543a49f235e071a079da638b7 --- test/test_fx_experimental.py | 38 ++++++++++++- torch/fx/experimental/graph_manipulation.py | 24 ++++----- torch/fx/experimental/param_fetch.py | 60 +++++++++++++++++++++ 3 files changed, 108 insertions(+), 14 deletions(-) create mode 100644 torch/fx/experimental/param_fetch.py diff --git a/test/test_fx_experimental.py b/test/test_fx_experimental.py index 0ecb44be6204..6e9c877b8de6 100644 --- a/test/test_fx_experimental.py +++ b/test/test_fx_experimental.py @@ -1,13 +1,14 @@ import torch import unittest import sys -from typing import Dict +from typing import Callable, Dict, Union, List from torch.fx.symbolic_trace import symbolic_trace from torch.fx.graph_module import GraphModule from torch.fx.node import Node from torch.fx.experimental import graph_manipulation from torch.fx.experimental.accelerator_partitioner import Partitioner from torch.fx.experimental.rewriter import RewritingTracer +from torch.fx.experimental.param_fetch import lift_lowering_attrs_to_nodes from torch.testing._internal.common_utils import run_tests from torch.testing._internal.jit_utils import JitTestCase from torch.fx.experimental.subgraph_creation_example import split_module @@ -20,7 +21,6 @@ PartitionMode ) from torch.fx.experimental.fuser import fuse -from typing import Union, Callable try: from torchvision.models import resnet18 @@ -809,6 +809,40 @@ def forward(self, x): t = torch.randn(2, 2) self.assertEqual(module.Foo()(t), mod(t)) + def test_fetch(self): + attrs_for_lowering: Dict[str, List[str]] = { + "torch.nn.modules.conv.Conv2d": [ + "weight", "bias", "kernel_size", "stride", "padding", "dilation", "groups", "padding_mode" + ], + "torch.nn.modules.batchnorm.BatchNorm2d": [ + "weight", "bias", "running_mean", "running_var", "eps" + ], + } + + class TestModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = torch.nn.Conv2d(3, 3, 2) + self.bn = torch.nn.BatchNorm2d(3) + + def forward(self, a): + a = self.conv(a) + a += a + return self.bn(a) + + mod = TestModule() + traced = symbolic_trace(mod) + lift_lowering_attrs_to_nodes(traced) + + for node in traced.graph.nodes: + if node.op == "call_module": + assert hasattr(node, "attrs_for_lowering") + para_list = attrs_for_lowering[node.attrs_for_lowering["name"]] + + # node.attrs_for_lowering has an addition field of class name + assert len(para_list) + 1 == len(node.attrs_for_lowering) + for p_name in para_list: + assert p_name in node.attrs_for_lowering if __name__ == "__main__": diff --git a/torch/fx/experimental/graph_manipulation.py b/torch/fx/experimental/graph_manipulation.py index 2eea162faedb..4e6c23cbad9f 100644 --- a/torch/fx/experimental/graph_manipulation.py +++ b/torch/fx/experimental/graph_manipulation.py @@ -3,6 +3,7 @@ import torch from torch.fx.experimental.shape_prop import ShapeProp +from torch.fx.experimental.param_fetch import lift_lowering_attrs_to_nodes from torch.fx.graph import Graph, get_qualified_name from torch.fx.graph_module import GraphModule from torch.fx.node import Node, Target, map_arg @@ -122,19 +123,17 @@ def serialize_weight(tensor: torch.Tensor) -> Dict: def serialize_leaf_module( - mod: torch.nn.Module, weights_metadata: Dict, weights: Dict, name_prefix: str + node: Node, weights_metadata: Dict, weights: Dict, name_prefix: str ) -> Dict: parameters: Dict[str, Any] = {} - parameters["name"] = type(mod).__name__ - for name, buffer in mod.named_buffers(): - 
weights_metadata[f"{name_prefix}.{name}"] = serialize_weight(buffer) - weights[f"{name_prefix}.{name}"] = buffer - for name, parameter in mod.named_parameters(): - weights_metadata[f"{name_prefix}.{name}"] = serialize_weight(parameter) - weights[f"{name_prefix}.{name}"] = parameter - if isinstance(mod.__constants__, List): - for constant in mod.__constants__: - parameters[constant] = str(getattr(mod, constant)) + + for p_name, p_value in node.attrs_for_lowering.items(): # type: ignore + if isinstance(p_value, torch.Tensor): + weights_metadata[f"{name_prefix}.{p_name}"] = serialize_weight(p_value) + weights[f"{name_prefix}.{p_name}"] = p_value + else: + parameters[p_name] = str(p_value) + return parameters @@ -187,6 +186,7 @@ def serialize_module(fx_module: GraphModule, weights: Dict, name_prefix="") -> D weight = serialize_weight(p) serialized_dict["weights"][prefix + name] = weight weights[prefix + name] = p + lift_lowering_attrs_to_nodes(fx_module) for node in fx_module.graph.nodes: node_rep: Dict[str, Any] = {} # Get shape/type info, currently not needed for call_module. @@ -217,7 +217,7 @@ def serialize_module(fx_module: GraphModule, weights: Dict, name_prefix="") -> D serialized_dict["modules"][node.target] = serialized_module else: node_rep["parameters"] = serialize_leaf_module( - submodules[node.target], + node, serialized_dict["weights"], weights, prefix + node.target, diff --git a/torch/fx/experimental/param_fetch.py b/torch/fx/experimental/param_fetch.py new file mode 100644 index 000000000000..6bce29b97e78 --- /dev/null +++ b/torch/fx/experimental/param_fetch.py @@ -0,0 +1,60 @@ +from torch.fx.graph_module import GraphModule +from typing import Any, Callable, Dict, List, Tuple, Type +import torch +import torch.nn as nn + + +# Matching method matches the attribute name of current version to the attribute name of `target_version` +def default_matching(name: str, target_version: int) -> str: + """Default matching method + """ + return name + +# This dict maps the nn.Module class name to the attribute name list that we want to fetch for lowering. +# The first integer in the tuple is the version number of the nn.Module class when we create the parameter list. +# If there's a version mismatch then it means the parameter names in the book might be mismatched with nn.Module. +module_fetch_book: Dict[Type, Tuple[int, List[str], Callable[[str, int], str]]] = { + torch.nn.modules.linear.Linear: (1, ["weight", "bias"], default_matching), + torch.nn.modules.conv.Conv2d: ( + 1, ["weight", "bias", "kernel_size", "stride", "padding", "dilation", "groups", "padding_mode"], default_matching + ), + torch.nn.modules.batchnorm.BatchNorm2d: (2, ["weight", "bias", "running_mean", "running_var", "eps"], default_matching), + torch.nn.modules.pooling.AdaptiveAvgPool2d: (1, [], default_matching), + torch.nn.modules.pooling.MaxPool2d: ( + 1, ["kernel_size", "stride", "padding", "dilation", "return_indices", "ceil_mode"], default_matching + ), + torch.nn.modules.activation.ReLU: (1, ["inplace"], default_matching), +} + +def extract_attrs_for_lowering(mod: nn.Module) -> Dict[str, Any]: + """If `mod` is in `module_fetch_book`, fetch the mod's attributes that in the `module_fetch_book` + after checking module's version is compatible with the `module_fetch_book`. 
+ """ + attrs_for_lowering: Dict[str, Any] = {} + attrs_for_lowering["name"] = torch.typename(mod) + + if type(mod) in module_fetch_book: + version, param_to_fetch, matching_method = module_fetch_book[type(mod)] + if version < mod._version: + raise RuntimeError(f"Fetcher version {version} try to fetch {torch.typename(mod)} version {mod._version}, " + "please upgrade the module_fetch_book, open an issue and @842974287 " + "or report a bug to AIACC team directly.") + for attr in param_to_fetch: + attrs_for_lowering[attr] = getattr(mod, matching_method(attr, mod._version)) + else: + raise RuntimeError(f"{torch.typename(mod)} is not in the module_fetch_book yet, " + "please add it to the module_fetch_book, open an issue and @842974287 " + "or report a bug to AIACC team directly.") + return attrs_for_lowering + +def lift_lowering_attrs_to_nodes(fx_module: GraphModule) -> None: + """Recursively traverse all `fx_module` nodes and fetch the module's attributes if the node is a leaf module. + """ + submodules = dict(fx_module.named_modules()) + + for node in fx_module.graph.nodes: + if node.op == "call_module": + if isinstance(submodules[node.target], GraphModule): + lift_lowering_attrs_to_nodes(submodules[node.target]) + else: + node.attrs_for_lowering = extract_attrs_for_lowering(submodules[node.target]) From 993ce4b2069702c8374d0344a76e7a326804ea2a Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Tue, 8 Dec 2020 18:51:58 -0800 Subject: [PATCH 057/250] [quant][graphmode][fx] Add MatchAllNode in pattern matching (#48979) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48979 Test Plan: Imported from OSS Reviewed By: vkuzo Differential Revision: D25385459 fbshipit-source-id: 43adffc9e2242d099cecd38d1902f9900158f51e --- test/quantization/test_quantize_fx.py | 30 ++++++++++++++++++++++++++ torch/quantization/fx/pattern_utils.py | 9 ++++++++ 2 files changed, 39 insertions(+) diff --git a/test/quantization/test_quantize_fx.py b/test/quantization/test_quantize_fx.py index 7e4048b98cbf..11ceb920ea8c 100644 --- a/test/quantization/test_quantize_fx.py +++ b/test/quantization/test_quantize_fx.py @@ -14,6 +14,11 @@ prepare_qat_fx, ) +from torch.quantization.fx.pattern_utils import ( + is_match, + MatchAllNode, +) + from torch.quantization import ( QuantType, QuantStub, @@ -186,6 +191,31 @@ def forward(self, x): @skipIfNoFBGEMM class TestQuantizeFx(QuantizationTestCase): + def test_pattern_match(self): + """ test MatchAllNode with + conv - bn - add - relu pattern + """ + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = nn.Conv2d(1, 1, 1) + self.bn = nn.BatchNorm2d(1) + self.relu = nn.ReLU() + + def forward(self, x, y): + x = self.conv(x) + x = self.bn(x) + x = x + y + x = self.relu(x) + return x + + pattern = (nn.ReLU, (operator.add, (nn.BatchNorm2d, nn.Conv2d), MatchAllNode)) + m = torch.fx.symbolic_trace(M()) + modules = dict(m.named_modules()) + for n in m.graph.nodes: + if n.op == 'call_module' and type(modules[n.target]) == nn.ReLU: + self.assertTrue(is_match(modules, n, pattern)) + def _get_conv_linear_test_cases(self): ''' Returns a list of test cases, with format: is_dynamic, ModuleClass, module_constructor_inputs, diff --git a/torch/quantization/fx/pattern_utils.py b/torch/quantization/fx/pattern_utils.py index 146dad1eab2e..fe13d0a3fed7 100644 --- a/torch/quantization/fx/pattern_utils.py +++ b/torch/quantization/fx/pattern_utils.py @@ -56,6 +56,12 @@ def insert(fn): def input_output_observed(qh): return type(qh) not in 
DEFAULT_NOT_OBSERVED_QUANTIZE_HANDLER + +class MatchAllNode: + """ A node pattern that matches all nodes + """ + pass + # Example use of register pattern function: # @register_fusion_pattern(torch.nn.ReLU, (torch.nn.BatchNorm2d, torch.nn.Conv2d))) # class ConvBNReLUFusion(): @@ -79,6 +85,9 @@ def is_match(modules, node, pattern, max_uses=sys.maxsize): self_match = pattern arg_matches = [] + if isinstance(self_match, type) and issubclass(self_match, MatchAllNode): + return True + if len(node.users) > max_uses: return False From 4434c07a2c0ba4debc6330063546f600aee8deb3 Mon Sep 17 00:00:00 2001 From: Supriya Rao Date: Tue, 8 Dec 2020 19:20:01 -0800 Subject: [PATCH 058/250] [quant][fix] Support quantization of ops where input is quantizable (#49027) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49027 For cat followed by linear since the output of cat is not quanitzed, we didnt quantize the linear This checks the uses of the cat op to insert observers Test Plan: python test/test_quantization.py TestQuantizeJitOps.test_cat_linear Imported from OSS Reviewed By: jerryzh168 Differential Revision: D25403412 fbshipit-source-id: 5875db259bf75f08ce672ce341a67005ed2f8a04 --- test/quantization/test_quantize_jit.py | 23 +++++++++++++++++++ .../passes/quantization/insert_observers.cpp | 15 ++++++++++-- 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/test/quantization/test_quantize_jit.py b/test/quantization/test_quantize_jit.py index f67a585d99b6..d8022befeed0 100644 --- a/test/quantization/test_quantize_jit.py +++ b/test/quantization/test_quantize_jit.py @@ -81,6 +81,7 @@ class TestQuantizeJitPasses(QuantizationTestCase): """ Test graph mode quantization passes used by quantize_jit """ + def test_foldbn_trivial(self): bn_module = {2 : torch.nn.BatchNorm2d, 3 : torch.nn.BatchNorm3d} conv_module = {2 : torch.nn.Conv2d, 3 : torch.nn.Conv3d} @@ -2708,6 +2709,28 @@ def test_conv_with_benchmark_flag(self): FileCheck().check("quantized::conv2d") \ .run(converted_model.graph) + @skipIfNoFBGEMM + def test_cat_linear(self): + class LinearModel(torch.nn.Module): + def __init__(self): + super(LinearModel, self).__init__() + self.weight = torch.randn(5, 5) + + def forward(self, x, y): + a = torch.cat([x, y]) + b = F.linear(a, self.weight) + c = F.linear(b, self.weight) + return b, c + + model = LinearModel().eval() + qconfig = {'' : default_qconfig} + float_model = torch.jit.script(model) + prepared_model = prepare_jit(float_model, qconfig) + prepared_model(torch.rand(5, 5), torch.rand(5, 5)) + converted_model = convert_jit(prepared_model) + FileCheck().check("quantized::linear") \ + .check("quantized::linear") \ + .run(converted_model.graph) class TestQuantizeDynamicJitPasses(QuantizationTestCase): def test_prepare_dynamic(self): diff --git a/torch/csrc/jit/passes/quantization/insert_observers.cpp b/torch/csrc/jit/passes/quantization/insert_observers.cpp index 1b93d28e2e1a..bacd8cf29bd2 100644 --- a/torch/csrc/jit/passes/quantization/insert_observers.cpp +++ b/torch/csrc/jit/passes/quantization/insert_observers.cpp @@ -394,7 +394,17 @@ class InsertObserversHelper { // are observed bool shouldObserve( Node* n, - const std::unordered_set& block_observed_values) { + const std::unordered_set& block_observed_values, + QuantType quant_type) { + // Check whether node output uses can be quantized, eg cat followed by + // linear op + for (Value* v : n->outputs()) { + for (const auto& use : v->uses()) { + if (useQuantizable(use, quant_type)) { + return true; + } + } + } if 
(isPropagateQuantSingleInputOp(n)) { return isObserved(n->input(0), block_observed_values); } else if (isPropagateQuantBinaryOp(n)) { @@ -1528,7 +1538,8 @@ InsertObserversHelper::insertObserversFor( // If the node is one of the propagate quant node, e.g. // aten::cat, we should observe its output only // if the input of the node is observed - if (observer_opt && shouldObserve(n, block_observed_values)) { + if (observer_opt && + shouldObserve(n, block_observed_values, quant_type_)) { recordObserved( v, *observer_opt, values_to_observe, block_observed_values); } From 5450614cf6d9b588d9ab59e9ce39c520f5415677 Mon Sep 17 00:00:00 2001 From: peterjc123 Date: Tue, 8 Dec 2020 19:36:47 -0800 Subject: [PATCH 059/250] Correctly apply WIN32_LEAN_AND_MEAN to the whole repo (#49025) Summary: Fixes https://github.com/pytorch/pytorch/issues/48895 Pull Request resolved: https://github.com/pytorch/pytorch/pull/49025 Reviewed By: zhangguanheng66 Differential Revision: D25399912 Pulled By: ezyang fbshipit-source-id: 9b7225b0e43511e0b8981c39035d814a4406c523 --- cmake/Dependencies.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 2f9ff160763b..6c3bed552533 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -46,6 +46,8 @@ endif() # 3. If MSVC_Z7_OVERRIDE is ON, then /Zi and /ZI will be replaced with /Z7 # for Debug and RelWithDebInfo builds if(MSVC) + # skip unwanted includes from windows.h + add_definitions(-DWIN32_LEAN_AND_MEAN) foreach(flag_var CMAKE_C_FLAGS CMAKE_C_FLAGS_RELEASE CMAKE_C_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_MINSIZEREL) @@ -1513,8 +1515,6 @@ if(NOT INTERN_BUILD_MOBILE) if(MSVC) # we want to respect the standard, and we are bored of those **** . 
add_definitions(-D_CRT_SECURE_NO_DEPRECATE=1) - # skip unwanted includes from windows.h - add_definitions(-DWIN32_LEAN_AND_MEAN) list(APPEND CUDA_NVCC_FLAGS "-Xcompiler=/wd4819,/wd4503,/wd4190,/wd4244,/wd4251,/wd4275,/wd4522") endif() From 34cc77a8116718ad22936e571d18b54b793bf7c0 Mon Sep 17 00:00:00 2001 From: Guilherme Leobas Date: Tue, 8 Dec 2020 19:40:01 -0800 Subject: [PATCH 060/250] Torch onnx (#48980) Summary: Fixes https://github.com/pytorch/pytorch/issues/45215 This is a follow up PR of https://github.com/pytorch/pytorch/issues/45258 and https://github.com/pytorch/pytorch/issues/48782 Pull Request resolved: https://github.com/pytorch/pytorch/pull/48980 Reviewed By: zhangguanheng66 Differential Revision: D25399823 Pulled By: ezyang fbshipit-source-id: 798055f4abbbffecdfab0325884193c81addecec --- mypy.ini | 24 ----------- torch/_C/__init__.pyi.in | 72 ++++++++++++++++++++++++++++++++- torch/_C/_onnx.pyi | 1 + torch/onnx/symbolic_helper.py | 25 +++++++----- torch/onnx/symbolic_opset8.py | 2 +- torch/onnx/symbolic_opset9.py | 9 +++-- torch/onnx/symbolic_registry.py | 5 ++- torch/onnx/utils.py | 24 ++++++----- 8 files changed, 110 insertions(+), 52 deletions(-) diff --git a/mypy.ini b/mypy.ini index f4b37f15a820..0b9f5497162c 100644 --- a/mypy.ini +++ b/mypy.ini @@ -143,30 +143,6 @@ ignore_errors = True [mypy-torch.nn.intrinsic.qat.modules.conv_fused] ignore_errors = True -[mypy-torch.onnx.operators] -ignore_errors = True - -[mypy-torch.onnx.symbolic_opset8] -ignore_errors = True - -[mypy-torch.onnx.symbolic_opset9] -ignore_errors = True - -[mypy-torch.onnx.symbolic_opset11] -ignore_errors = True - -[mypy-torch.onnx.symbolic_caffe2] -ignore_errors = True - -[mypy-torch.onnx.symbolic_helper] -ignore_errors = True - -[mypy-torch.onnx.symbolic_registry] -ignore_errors = True - -[mypy-torch.onnx.utils] -ignore_errors = True - [mypy-torch.multiprocessing.pool] ignore_errors = True diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in index cbb5b2452e21..a7f1f1b91c93 100644 --- a/torch/_C/__init__.pyi.in +++ b/torch/_C/__init__.pyi.in @@ -165,7 +165,10 @@ def wait(fut: Future) -> Any: ... def _collect_all(futures: List[Future]) -> Future: ... def unify_type_list(types: List[JitType]) -> JitType: ... -def _freeze_module(module: ScriptModule, preserved_attrs: List[str], freeze_interfaces: _bool = True) -> ScriptModule: ... +def _freeze_module(module: ScriptModule, + preserved_attrs: List[str] = [], + freeze_interfaces: _bool = True, + preserveParameters: _bool = True) -> ScriptModule: ... def _is_tracing() -> _bool: ... def _jit_init() -> _bool: ... def _jit_flatten(arg: Any) -> Tuple[List[Tensor], IODescriptor]: ... @@ -217,6 +220,8 @@ def _jit_get_trigger_value(trigger_name: str) -> _int: ... # Defined in torch/csrc/jit/python/script_init.cpp ResolutionCallback = Callable[[str], Callable[..., Any]] +# Defined in torch/csrc/jit/python/script_init.cpp +# and torch/csrc/jit/python/init.cpp def _create_function_from_graph(qualname: str, graph: Graph) -> Graph: ... def _debug_set_autodiff_subgraph_inlining(disabled: _bool) -> None: ... def _ivalue_tags_match(lhs: ScriptModule, rhs: ScriptModule) -> _bool: ... @@ -246,6 +251,55 @@ def _resolve_type_from_object(obj: Any, range: SourceRange, rcb: ResolutionCallb def _create_module_with_type(ty: JitType) -> ScriptModule: ... def _run_emit_module_hook(m: ScriptModule): ... def _replace_overloaded_method_decl(overload_decl: Decl, implementation_def: Def, new_name: str) -> Def: ... + +def _jit_pass_lower_all_tuples(graph: Graph) -> None: ... 
+def _jit_pass_onnx_set_dynamic_input_shape(graph: Graph, dynamic_axes: Dict[str, Dict[_int, str]], input_names: List[str]) -> None: ... +def _jit_pass_onnx_graph_shape_type_inference(graph: Graph, opset_version: _int) -> None: ... +def _jit_pass_onnx_assign_output_shape(graph: Graph, tensors: List[Tensor], onnx_shape_inference: _bool = False) -> None: ... +def _jit_pass_fixup_onnx_loop_node_inputs(n: Node) -> None: ... +def _jit_pass_onnx_remove_inplace_ops_for_onnx(graph: Graph) -> None: ... +def _jit_pass_remove_inplace_ops(graph: Graph) -> None: ... +def _jit_pass_canonicalize_graph_fuser_ops(graph: Graph) -> None: ... +def _jit_pass_peephole(graph: Graph, addmm_fusion_enabled: _bool) -> None: ... +def _jit_pass_fuse_addmm(graph: Graph) -> None: ... +def _jit_pass_onnx_preprocess(graph: Graph) -> None: ... +def _jit_pass_onnx_prepare_inplace_ops_for_onnx(graph: Graph) -> None: ... +def _jit_pass_prepare_division_for_onnx(graph: Graph) -> None: ... +def _jit_pass_onnx_remove_print(graph: Graph) -> None: ... +def _jit_pass_onnx_preprocess_caffe2(graph: Graph) -> None: ... +def _jit_pass_onnx_unpack_quantized_weights( + graph: Graph, + paramsDict: Dict[str, IValue] +) -> Dict[str, IValue]: ... +def _jit_pass_onnx_quantization_insert_permutes( + graph: Graph, + paramsDict: Dict[str, IValue] +) -> Dict[str, IValue]: ... +def _jit_pass_custom_pattern_based_rewrite_graph(pattern: str, fused_node_name: str, graph: Graph) -> None: ... +def _jit_onnx_list_model_parameters(module: ScriptModule) -> Tuple[ScriptModule, List[IValue]]: ... +def _jit_pass_erase_number_types(graph: Graph) -> None: ... +def _jit_pass_onnx(graph: Graph, _jit_pass_onnx: _onnx.OperatorExportTypes) -> Graph: ... +def _jit_pass_onnx_scalar_type_analysis(graph: Graph) -> None: ... +def _jit_pass_onnx_peephole(graph: Graph, opset_version: _int, fixed_batch_size: _bool) -> None: ... +def _jit_pass_dce_allow_deleting_nodes_with_side_effects(graph: Graph) -> None: ... +def _jit_pass_onnx_function_substitution(graph: Graph) -> None: ... +def _jit_pass_lower_graph(graph: Graph, m: Module) -> Tuple[Graph, List[IValue]]: ... +def _jit_pass_inline_fork_wait(graph: Graph) -> None: ... +def _jit_pass_onnx_eval_peephole(graph: Graph, paramsDict: Dict[str, IValue]) -> Dict[str, IValue]: ... +def _jit_pass_onnx_constant_fold(graph: Graph, paramsDict: Dict[str, IValue], opset_version: _int) -> Dict[str, IValue]: ... +def _jit_pass_onnx_eliminate_unused_items(graph: Graph, paramsDict: Dict[str, IValue]) -> Dict[str, IValue]: ... +def _jit_pass_onnx_cast_all_constant_to_floating(graph: Graph) -> None: ... +def _jit_pass_filter_non_tensor_arguments(params: Dict[str, IValue]) -> Dict[str, Tensor]: ... +def _jit_decay_packed_param_input_types(graph: Graph) -> None: ... +def _jit_pass_onnx_node_shape_type_inference(n: Node, opset_version: _int) -> None: ... +def _jit_pass_onnx_block( + old_block: Block, + new_block: Block, + operator_export_type: _onnx.OperatorExportTypes, + env: Dict[Value, Value] +) -> None: ... +def _jit_pass_fixup_onnx_controlflow_node(n: Node, opset_version: _int) -> Node: ... + def _jit_script_interface_compile(name: str, class_def: ClassDef, rcb: ResolutionCallback, is_module: _bool): ... def _jit_script_compile_overload( qualname: str, @@ -281,8 +335,18 @@ def import_ir_module_from_buffer( extra_files: Dict[str, Any] ) -> ScriptModule: ... +def _assign_output_shapes(graph: Graph, inputs: List[Tensor]) -> Graph: ... +def _check_onnx_proto(proto: str) -> None: ... 
+def _propagate_and_assign_input_shapes( + graph: Graph, + inputs: Tuple[Tensor, ...], + with_grad: _bool, + propagate: _bool +) -> Graph: ... + # Defined in torch/torch/csrc/jit/ir/ir.h class Graph: + def eraseInput(self, i: _int) -> None: ... ... # Defined in torch/csrc/jit/ir/ir.h @@ -366,8 +430,8 @@ class ScriptFunction: def qualified_name(self) -> str: ... class ScriptMethod: + graph: Graph ... - class ModuleDict: def __init__(self, mod: ScriptModule) -> None: ... def items(self) -> List[Tuple[str, Any]]: ... @@ -378,6 +442,10 @@ class ParameterDict: class BufferDict: def __init__(self, mod: ScriptModule) -> None: ... +# Defined in torch/csrc/jit/api/module.h +class Module: + ... + # Defined in torch/csrc/Module.cpp def _initExtension(shm_manager_path: str) -> None: ... # THPModule_initExtension def _autograd_init() -> _bool: ... # THPAutograd_initExtension diff --git a/torch/_C/_onnx.pyi b/torch/_C/_onnx.pyi index 51f16566ce6c..7ab3cd9c567d 100644 --- a/torch/_C/_onnx.pyi +++ b/torch/_C/_onnx.pyi @@ -29,6 +29,7 @@ class OperatorExportTypes(Enum): ONNX_ATEN = ... ONNX_ATEN_FALLBACK = ... RAW = ... + ONNX_FALLTHROUGH = ... class TrainingMode(Enum): EVAL = ... diff --git a/torch/onnx/symbolic_helper.py b/torch/onnx/symbolic_helper.py index 5e9430f995f8..10250baf131a 100644 --- a/torch/onnx/symbolic_helper.py +++ b/torch/onnx/symbolic_helper.py @@ -2,6 +2,7 @@ import torch import warnings from sys import maxsize as maxsize +from typing import Set import torch.onnx # This import monkey-patches graph manipulation methods on Graph, used for the @@ -125,7 +126,7 @@ def decorator(fn): def wrapper(g, *args, **kwargs): # some args may be optional, so the length may be smaller assert len(arg_descriptors) >= len(args) - args = [_parse_arg(arg, arg_desc) for arg, arg_desc in zip(args, arg_descriptors)] + args = [_parse_arg(arg, arg_desc) for arg, arg_desc in zip(args, arg_descriptors)] # type: ignore # only support _outputs in kwargs assert len(kwargs) <= 1 if len(kwargs) == 1: @@ -232,18 +233,18 @@ def _select_helper(g, self, dim, index, apply_reshape=True): def _slice_helper(g, input, axes, starts, ends, steps=None, dynamic_slice=False): if _export_onnx_opset_version <= 9: - from torch.onnx.symbolic_opset9 import _slice - return _slice(g, input, axes, starts, ends) + from torch.onnx.symbolic_opset9 import _slice as _slice9 + return _slice9(g, input, axes, starts, ends) else: - from torch.onnx.symbolic_opset10 import _slice - return _slice(g, input, axes, starts, ends, steps, dynamic_slice) + from torch.onnx.symbolic_opset10 import _slice as _slice10 + return _slice10(g, input, axes, starts, ends, steps, dynamic_slice) def _hardtanh_helper(g, input, min_val, max_val): if _export_onnx_opset_version <= 10: from torch.onnx.symbolic_opset9 import hardtanh return hardtanh(g, input, min_val, max_val) else: - from torch.onnx.symbolic_opset11 import hardtanh + from torch.onnx.symbolic_opset11 import hardtanh # type: ignore[no-redef] return hardtanh(g, input, min_val, max_val) def _is_fp(value): @@ -380,7 +381,7 @@ def _interpolate_get_scales_and_mode(g, input, size, scale_factor, mode , align_ size = g.op("Concat", *size, axis_i=0) scale_factor = _interpolate_size_to_scales(g, input, size, dim) else: - return _unimplemented("Both size and scales are None in __interpolate") + return _unimplemented("interpolate", "Both size and scales are None in __interpolate") return scale_factor, mode @@ -388,7 +389,7 @@ def _unbind_helper(g, self, dim, _outputs): if _export_onnx_opset_version <= 9: from 
torch.onnx.symbolic_opset9 import unbind else: - from torch.onnx.symbolic_opset11 import unbind + from torch.onnx.symbolic_opset11 import unbind # type: ignore[no-redef] return unbind(g, self, dim, _outputs) @@ -396,7 +397,8 @@ def _scatter_helper(g, self, dim, index, src): if _export_onnx_opset_version <= 10: from torch.onnx.symbolic_opset9 import scatter else: - from torch.onnx.symbolic_opset11 import scatter + # for mypy, scatter was imported two lines above + from torch.onnx.symbolic_opset11 import scatter # type: ignore return scatter(g, self, dim, index, src) @@ -444,7 +446,8 @@ def _index_fill_reshape_helper(g, self, dim, index): if _export_onnx_opset_version <= 10: from torch.onnx.symbolic_opset9 import scatter else: - from torch.onnx.symbolic_opset11 import scatter + # for mypy, scatter was imported two lines above + from torch.onnx.symbolic_opset11 import scatter # type: ignore if self.type().dim() is None: return _unimplemented("index_fill", "input rank not accesible") @@ -632,4 +635,4 @@ def _cast_func_template(to_i, g, input, non_blocking): # Global set to store the list of quantized operators in the network. # This is currently only used in the conversion of quantized ops from PT -> C2 via ONNX. -_quantized_ops = set() +_quantized_ops: Set[int] = set() diff --git a/torch/onnx/symbolic_opset8.py b/torch/onnx/symbolic_opset8.py index c0c1d48ebec0..e4023dab2320 100644 --- a/torch/onnx/symbolic_opset8.py +++ b/torch/onnx/symbolic_opset8.py @@ -4,7 +4,7 @@ import torch.onnx.symbolic_opset9 as sym_opset9 from torch.onnx.symbolic_helper import parse_args, _unimplemented, _block_list_in_opset, _try_get_scalar_type -from torch.onnx.symbolic_opset9 import _cast_Float +from torch.onnx.symbolic_opset9 import _cast_Float # type: ignore import warnings diff --git a/torch/onnx/symbolic_opset9.py b/torch/onnx/symbolic_opset9.py index e395ce5c703f..8630f48a62ad 100644 --- a/torch/onnx/symbolic_opset9.py +++ b/torch/onnx/symbolic_opset9.py @@ -13,6 +13,8 @@ import torch.onnx.symbolic_helper as sym_help from torch.onnx.symbolic_helper import parse_args, _parse_arg, _unimplemented +from typing import Optional + import numpy import math import warnings @@ -311,7 +313,7 @@ def _maybe_cast_reduce_op_input(g, self): if dtype is not None: # pytorch reduce-ops cast all other integral types to int64 if not sym_help._is_fp(self) and not (dtype == 'Long'): - self = _cast_Long(g, self, False) + self = _cast_Long(g, self, False) # type: ignore return self @@ -2092,7 +2094,7 @@ def _pack_padded_sequence(g, input, lengths, batch_first): # It's really only necessary because those operators expand to something that # only works with int32 types in Caffe2... 
if lengths.type().scalarType() != 'Int': - lengths = _cast_Int(g, lengths, False) + lengths = _cast_Int(g, lengths, False) # type: ignore return g.op("prim::PackPadded", input, lengths, outputs=2) @@ -2436,7 +2438,7 @@ def _get_arange_dtype(dtype): def masked_fill(g, self, mask, value): - mask = _cast_Bool(g, mask, False) + mask = _cast_Bool(g, mask, False) # type: ignore value = sym_help._maybe_get_scalar(value) return g.op('Where', mask, sym_help._if_scalar_type_as(g, value, self), self) @@ -2734,6 +2736,7 @@ def as_strided(g, self, sizes, strides, offset=None): sizes = sym_help._maybe_get_const(sizes, 'is') rank = len(strides) self_1d = g.op("Reshape", self, g.op("Constant", value_t=torch.tensor([-1], dtype=torch.int64))) + ind: Optional[torch.Tensor] if not sym_help._is_value(sizes): ind = torch.tensor([0], dtype=torch.long) for i, (size, stride) in enumerate(zip(sizes, strides)): diff --git a/torch/onnx/symbolic_registry.py b/torch/onnx/symbolic_registry.py index 48114d6c472b..c059e8f2eb31 100644 --- a/torch/onnx/symbolic_registry.py +++ b/torch/onnx/symbolic_registry.py @@ -1,6 +1,7 @@ import warnings import importlib from inspect import getmembers, isfunction +from typing import Dict, Tuple, Any, Union # The symbolic registry "_registry" is a dictionary that maps operators # (for a specific domain and opset version) to their symbolic functions. @@ -8,9 +9,9 @@ # The keys are tuples (domain, version), (where domain is a string, and version is an int), # and the operator's name (string). # The map's entries are as follows : _registry[(domain, version)][op_name] = op_symbolic -_registry = {} +_registry: Dict[Tuple[str, int], Dict] = {} -_symbolic_versions = {} +_symbolic_versions: Dict[Union[int, str], Any] = {} from torch.onnx.symbolic_helper import _onnx_stable_opsets for opset_version in _onnx_stable_opsets: module = importlib.import_module('torch.onnx.symbolic_opset{}'.format(opset_version)) diff --git a/torch/onnx/utils.py b/torch/onnx/utils.py index 5c41306b9ee2..3fe19a56c124 100644 --- a/torch/onnx/utils.py +++ b/torch/onnx/utils.py @@ -18,6 +18,7 @@ from torch.jit import _unique_state_dict from torch.onnx import ONNX_ARCHIVE_MODEL_PROTO_NAME, ExportTypes, OperatorExportTypes, TrainingMode from torch._C import ListType, OptionalType, _propagate_and_assign_input_shapes, _check_onnx_proto +from typing import Union, Tuple, List # the flag to tell the user whether it's in the middle of ONNX export or not @@ -76,7 +77,7 @@ def export(model, args, f, export_params=True, verbose=False, training=None, if aten or export_raw_ir: assert operator_export_type is None assert aten ^ export_raw_ir - operator_export_type = OperatorExportTypes.ATEN if aten else OperatorExportTypes.RAW + operator_export_type = OperatorExportTypes.ONNX_ATEN if aten else OperatorExportTypes.RAW elif operator_export_type is None: if torch.onnx.PYTORCH_ONNX_CAFFE2_BUNDLE: operator_export_type = OperatorExportTypes.ONNX_ATEN_FALLBACK @@ -351,6 +352,7 @@ def _trace_and_get_graph_from_model(model, args): def _create_jit_graph(model, args, _retain_param_name, use_new_jit_passes): torch_out = None + params: Union[List, Tuple] if isinstance(model, torch.jit.ScriptModule): try: graph = model.forward.graph @@ -442,7 +444,7 @@ def _model_to_graph(model, args, verbose=False, param_names = input_and_param_names[len(input_and_param_names) - len(params):] params_dict = dict(zip(param_names, params)) - if training is None or training == TrainingMode.EVAL or (training == TrainingMode.PRESERVE and not is_originally_training): + if 
training is None or training == TrainingMode.EVAL: params_dict = torch._C._jit_pass_onnx_eval_peephole(graph, params_dict) if do_constant_folding and _export_onnx_opset_version in torch.onnx.constant_folding_opset_versions: @@ -476,7 +478,7 @@ def export_to_pretty_string(model, args, f, export_params=True, verbose=False, t if aten or export_raw_ir: assert operator_export_type is None assert aten ^ export_raw_ir - operator_export_type = OperatorExportTypes.ATEN if aten else OperatorExportTypes.RAW + operator_export_type = OperatorExportTypes.ONNX_ATEN if aten else OperatorExportTypes.RAW elif operator_export_type is None: operator_export_type = OperatorExportTypes.ONNX return _export_to_pretty_string(model, args, f, export_params, verbose, training, @@ -1051,6 +1053,10 @@ def _graph_constant(g, value, dims, type, *args, **kwargs): dims = [1] isscalar = True type = type.lower() + tensor: Union[torch.CharTensor, torch.ShortTensor, + torch.IntTensor, torch.LongTensor, + torch.HalfTensor, torch.FloatTensor, + torch.DoubleTensor] if type == "char": tensor = torch.CharTensor(*dims) elif type == "short": @@ -1068,7 +1074,7 @@ def _graph_constant(g, value, dims, type, *args, **kwargs): else: raise ValueError("Unknown type, type should be one of the following strings: " "char, short, int, long, half, float, double") - tensor.fill_(value) + tensor.fill_(value) # type: ignore if isscalar: return g.op("Constant", *args, value_z=tensor, **kwargs) return g.op("Constant", *args, value_t=tensor, **kwargs) @@ -1141,8 +1147,8 @@ def _validate_dynamic_axes(dynamic_axes, model, input_names, output_names): dynamic_axes[key] = value_dict -torch._C.Graph.op = _graph_op -torch._C.Graph.at = _graph_at -torch._C.Block.op = _block_op -torch._C.Graph.constant = _graph_constant -torch._C.Node.__getitem__ = _node_getitem +torch._C.Graph.op = _graph_op # type: ignore +torch._C.Graph.at = _graph_at # type: ignore +torch._C.Block.op = _block_op # type: ignore +torch._C.Graph.constant = _graph_constant # type: ignore +torch._C.Node.__getitem__ = _node_getitem # type: ignore From 7c0a3e3a06c1addd26fbd7270a0ce8cdd5e66e6a Mon Sep 17 00:00:00 2001 From: Guilherme Leobas Date: Tue, 8 Dec 2020 20:05:02 -0800 Subject: [PATCH 061/250] Annotate torch._tensor_str (#48584) Summary: This is a follow up PR of https://github.com/pytorch/pytorch/issues/48463 > Rather than requiring that users write import numbers and then use numbers.Float etc., this PEP proposes a straightforward shortcut that is almost as effective: when an argument is annotated as having type float, an argument of type int is acceptable; similar, for an argument annotated as having type complex, arguments of type float or int are acceptable. 
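Concretely (a hypothetical illustration of the quoted rule, not code from this PR):

```python
def set_threshold(threshold: float) -> None:
    print(threshold)

set_threshold(1000)    # accepted by mypy: int where float is annotated
set_threshold(1000.0)  # also accepted
```

This is why `threshold: float = 1000` in `__PrinterOptions` type-checks even though the default is an int literal.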
Pull Request resolved: https://github.com/pytorch/pytorch/pull/48584 Reviewed By: zhangguanheng66 Differential Revision: D25411080 Pulled By: malfet fbshipit-source-id: e00dc1e9e6e46a8cfae77da4f2cf159c0c2b9bcc --- torch/_tensor_str.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torch/_tensor_str.py b/torch/_tensor_str.py index 08c6cbc56ac6..5945713934ba 100644 --- a/torch/_tensor_str.py +++ b/torch/_tensor_str.py @@ -1,12 +1,12 @@ import math import torch from torch._six import inf -from typing import Union, Optional +from typing import Optional class __PrinterOptions(object): precision: int = 4 - threshold: Union[str, float] = 1000 + threshold: float = 1000 edgeitems: int = 3 linewidth: int = 80 sci_mode: Optional[bool] = None From 59a3e76641d89dcbf3abea001423468cb2f58745 Mon Sep 17 00:00:00 2001 From: Hao Lu Date: Tue, 8 Dec 2020 20:12:41 -0800 Subject: [PATCH 062/250] [pt][quant] Remove contiguous calls in qembeddingbag (#48993) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48993 I don't see any reasons that we need to call contiguous on the embedding tables. They should not exist in the first place. The indices and lengths/offsets are actually generated in the model, but they're most likely generated by SigridTransform -> ClipRanges -> GatherRanges -> SigridHash (sometimes) and none of these ops produce non-contiguous tensors. It should be fine to enforce tensor.is_contiguous(). Reviewed By: radkris-git Differential Revision: D25266756 fbshipit-source-id: f15ecb67281c9ef0c7ac6637f439e538e77e30a2 --- .../ATen/native/quantized/cpu/fbgemm_utils.h | 16 ++++++---- .../native/quantized/cpu/qembeddingbag.cpp | 32 ++++++++++++------- 2 files changed, 30 insertions(+), 18 deletions(-) diff --git a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.h b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.h index a2349790d117..b4cff64b309d 100644 --- a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.h +++ b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.h @@ -314,12 +314,16 @@ struct CAFFE2_API PackedEmbeddingBagWeight : public EmbeddingPackedParamsBase { int64_t bit_rate, c10::QScheme q_scheme, int64_t version) - : packed_w(std::move(packed_w)), - w_scale(std::move(w_scale)), - w_zp(std::move(w_zp)), - bit_rate_(bit_rate), - q_scheme(q_scheme), - version_(version) {} + : packed_w(std::move(packed_w)), + w_scale(std::move(w_scale)), + w_zp(std::move(w_zp)), + bit_rate_(bit_rate), + q_scheme(q_scheme), + version_(version) { + if (!packed_w.is_contiguous()) { + packed_w = packed_w.contiguous(); + } + } at::Tensor packed_w; std::vector w_scale; diff --git a/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp b/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp index 1c52242641e7..28f4c6a6eceb 100644 --- a/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp +++ b/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp @@ -369,12 +369,16 @@ at::Tensor embedding_bag_byte_helper( "Expect 32 or 64 bit offsets, but found ", offsets.scalar_type(), " instead."); + TORCH_CHECK( + weight.is_contiguous() && indices.is_contiguous() && + offsets.is_contiguous(), + "Expect weight, indices, and offsets to be contiguous."); // Using helper function to support different type combination without the // need to cast, which can be additional performance overhead if (indices.scalar_type() == at::kInt && offsets.scalar_type() == at::kInt) { return embedding_bag_byte_impl( - weight.contiguous(), + weight, indices, offsets, pruned_weights, @@ -385,7 +389,7 @@ at::Tensor 
embedding_bag_byte_helper( } else if ( indices.scalar_type() == at::kInt && offsets.scalar_type() == at::kLong) { return embedding_bag_byte_impl( - weight.contiguous(), + weight, indices, offsets, pruned_weights, @@ -396,7 +400,7 @@ at::Tensor embedding_bag_byte_helper( } else if ( indices.scalar_type() == at::kLong && offsets.scalar_type() == at::kInt) { return embedding_bag_byte_impl( - weight.contiguous(), + weight, indices, offsets, pruned_weights, @@ -408,7 +412,7 @@ at::Tensor embedding_bag_byte_helper( // default case given the TORCH_CHECK above return embedding_bag_byte_impl( - weight.contiguous(), + weight, indices, offsets, pruned_weights, @@ -458,12 +462,16 @@ at::Tensor embedding_bag_4bit_helper( "Expect 32 or 64 bit offsets, but found ", offsets.scalar_type(), " instead."); + TORCH_CHECK( + weight.is_contiguous() && indices.is_contiguous() && + offsets.is_contiguous(), + "Expect weight, indices, and offsets to be contiguous."); // Using helper function to support different type combination without the // need to cast, which can be additional performance overhead if (indices.scalar_type() == at::kInt && offsets.scalar_type() == at::kInt) { return embedding_bag_4bit_impl( - weight.contiguous(), + weight, indices, offsets, pruned_weights, @@ -473,7 +481,7 @@ at::Tensor embedding_bag_4bit_helper( } else if ( indices.scalar_type() == at::kInt && offsets.scalar_type() == at::kLong) { return embedding_bag_4bit_impl( - weight.contiguous(), + weight, indices, offsets, pruned_weights, @@ -483,7 +491,7 @@ at::Tensor embedding_bag_4bit_helper( } else if ( indices.scalar_type() == at::kLong && offsets.scalar_type() == at::kInt) { return embedding_bag_4bit_impl( - weight.contiguous(), + weight, indices, offsets, pruned_weights, @@ -492,7 +500,7 @@ at::Tensor embedding_bag_4bit_helper( include_last_offset); } return embedding_bag_4bit_impl( - weight.contiguous(), + weight, indices, offsets, pruned_weights, @@ -511,7 +519,7 @@ at::Tensor PackedEmbeddingBagWeight::embeddingbag_byte( bool include_last_offset, bool is_embedding_op) { return embedding_bag_byte_helper( - packed_w.contiguous(), + packed_w, indices, offsets_in, pruned_weights, @@ -538,7 +546,7 @@ at::Tensor PackedEmbeddingBagWeight::embeddingbag_4bit( } return embedding_bag_4bit_helper( - packed_w.contiguous(), + packed_w, indices, offsets_in, pruned_weights, @@ -564,7 +572,7 @@ Tensor embedding_bag_byte_rowwise_offsets( const c10::optional& compressed_indices_mapping, bool include_last_offset) { return embedding_bag_byte_helper( - weight.contiguous(), + weight, indices, offsets_in, pruned_weights, @@ -594,7 +602,7 @@ Tensor embedding_bag_4bit_rowwise_offsets( } return embedding_bag_4bit_helper( - weight.contiguous(), + weight, indices, offsets_in, pruned_weights, From 2d9585a6a10ea5717c6523086259391684c5aaf2 Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Tue, 8 Dec 2020 22:25:03 -0800 Subject: [PATCH 063/250] [quant][graphmode][fx] Add test for ResnetBase (#48939) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48939 Add numerical test for fx graph mode for resnet base, comparing with eager mode Test Plan: Imported from OSS Reviewed By: supriyar Differential Revision: D25375342 fbshipit-source-id: 08f49b88daede47d44ee2ea96a02999fea246cb2 --- test/quantization/test_quantize.py | 5 +- test/quantization/test_quantize_fx.py | 59 +++++++++++++++++++ .../testing/_internal/common_quantization.py | 11 +++- 3 files changed, 70 insertions(+), 5 deletions(-) diff --git a/test/quantization/test_quantize.py 
b/test/quantization/test_quantize.py index 745437a86ca3..9e6379a29cec 100644 --- a/test/quantization/test_quantize.py +++ b/test/quantization/test_quantize.py @@ -352,10 +352,9 @@ def test_resnet_base(self): with override_quantized_engine(qengine): qconfig = torch.quantization.get_default_qconfig(qengine) model = ResNetBase().float().eval() + model.fuse_model() model = QuantWrapper(model) model.qconfig = qconfig - fuse_list = ['module.conv1', 'module.bn1', 'module.relu1'] - fuse_modules(model, fuse_list, inplace=True) model = prepare(model) self.checkObservers(model) test_only_eval_fn(model, self.img_data_2d) @@ -365,6 +364,8 @@ def checkQuantized(model): self.assertEqual(type(model.module.conv1), nn.intrinsic.quantized.ConvReLU2d) self.assertEqual(type(model.module.myop), nn.quantized.QFunctional) self.assertEqual(type(model.module.avgpool), nn.AdaptiveAvgPool2d) + self.assertEqual(type(model.module.fc), nnq.Linear) + test_only_eval_fn(model, self.img_data_2d) self.checkNoQconfig(model) diff --git a/test/quantization/test_quantize_fx.py b/test/quantization/test_quantize_fx.py index 11ceb920ea8c..f5f243a1e649 100644 --- a/test/quantization/test_quantize_fx.py +++ b/test/quantization/test_quantize_fx.py @@ -23,6 +23,7 @@ QuantType, QuantStub, DeQuantStub, + QuantWrapper, quant_type_to_str, default_qconfig, default_dynamic_qconfig, @@ -49,7 +50,13 @@ skip_if_no_torchvision, train_one_epoch, run_ddp, + test_only_eval_fn, + test_only_train_fn, +) + +from torch.testing._internal.common_quantization import ( LinearModelWithSubmodule, + ResNetBase, ) from torch.testing._internal.common_quantized import ( @@ -2219,6 +2226,58 @@ def _test_model_impl( ' should match. Mode: ' + mode + ' diff:' + str(diff_from_eager[mode][name])) + def _test_building_block(self, quant_type, BB): + eager = BB().float() + graph = copy.deepcopy(eager) + + if quant_type == QuantType.STATIC: + qconfig = default_qconfig + eager_prepare = prepare + graph_prepare = prepare_fx + eager.eval() + graph.eval() + calibrate_or_train = test_only_eval_fn + data = self.img_data_2d + else: + assert quant_type == QuantType.QAT + qconfig = default_qat_qconfig + eager_prepare = prepare_qat + graph_prepare = prepare_qat_fx + eager.train() + graph.train() + calibrate_or_train = test_only_train_fn + data = self.img_data_2d_train + + if hasattr(eager, "fuse_model"): + eager.fuse_model() + eager = QuantWrapper(eager) + eager.qconfig = qconfig + eager = eager_prepare(eager) + + qconfig_dict = {"": qconfig} + graph = graph_prepare(graph, qconfig_dict) + + eager_out = eager(data[0][0]) + graph_out = graph(data[0][0]) + self.assertEqual(eager_out, graph_out) + + calibrate_or_train(eager, data) + calibrate_or_train(graph, data) + + eager = convert(eager) + graph = convert_fx(graph) + + eager_out = eager(data[0][0]) + graph_out = graph(data[0][0]) + self.assertEqual(eager_out, graph_out) + + @override_qengines + def test_resnet_base(self): + models = [ResNetBase] + options = itertools.product(self.static_quant_types, models) + for quant_type, M in options: + self._test_building_block(quant_type, M) + @skip_if_no_torchvision @skipIfNoFBGEMM @unittest.skip("skip for now since tbb failed") diff --git a/torch/testing/_internal/common_quantization.py b/torch/testing/_internal/common_quantization.py index 2e3cc16b4540..2ff28c8d30ad 100644 --- a/torch/testing/_internal/common_quantization.py +++ b/torch/testing/_internal/common_quantization.py @@ -742,8 +742,8 @@ def checkEmbeddingSerialization(self, qemb, num_embeddings, embedding_dim, indic 
self.assertTrue(expected_name in str(q_embeddingbag)) -# Below are a series of neural net models to use in testing quantization -# Single layer models +# Below are a series of toy models to use in testing quantization + class SingleLayerLinearModel(torch.nn.Module): def __init__(self): super().__init__() @@ -1350,7 +1350,7 @@ def __init__(self): self.downsample = torch.nn.Identity() self.myop = nn.quantized.FloatFunctional() self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) - + self.fc = torch.nn.Linear(inplanes, 1) def forward(self, x): out = self.conv1(x) @@ -1360,8 +1360,13 @@ def forward(self, x): out = self.myop.add(out, identity) out = self.relu2(out) out = self.avgpool(out) + out = torch.flatten(out, 1) + out = self.fc(out) return out + def fuse_model(self): + torch.quantization.fuse_modules(self, [['conv1', 'bn1', 'relu1']], inplace=True) + class ModelMultipleOps(torch.nn.Module): def __init__(self): super().__init__() From 1c31f76297fc4526257a303b12b34a1d3a0625e5 Mon Sep 17 00:00:00 2001 From: Teng Gao Date: Wed, 9 Dec 2020 00:12:34 -0800 Subject: [PATCH 064/250] Add high level profiling trace for dataloading and optimizer (#47655) Summary: Fixes https://github.com/pytorch/pytorch/issues/47441 To give user more information about python level functions in profiler traces, we propose to instrument on the following functions: ``` _BaseDataLoaderIter.__next__ Optimizer.step Optimizer.zero_grad ``` Because the record_function already uses if (!active) to check whether the profiler is enabled, so we don't explicitly call torch.autograd._profiler_enabled() before each instrument. Acknowledgement: nbcsm, guotuofeng, gunandrose4u , guyang3532 , mszhanyi Pull Request resolved: https://github.com/pytorch/pytorch/pull/47655 Reviewed By: smessmer Differential Revision: D24960386 Pulled By: ilia-cher fbshipit-source-id: 2eb655789e2e2f506e1b8f95ad3d470c83281102 --- test/test_profiler.py | 96 ++++++++++++++++++++++++++++++++++ torch/autograd/__init__.py | 2 + torch/optim/optimizer.py | 45 ++++++++++++---- torch/utils/data/dataloader.py | 34 ++++++------ 4 files changed, 151 insertions(+), 26 deletions(-) diff --git a/test/test_profiler.py b/test/test_profiler.py index 797ad0995913..2cd6beaaaf53 100644 --- a/test/test_profiler.py +++ b/test/test_profiler.py @@ -4,6 +4,8 @@ import torch import torch.nn as nn +import torch.optim +import torch.utils.data from torch.testing._internal.common_utils import ( TestCase, run_tests, TEST_WITH_ASAN, IS_WINDOWS) from torch.autograd.profiler import profile @@ -14,6 +16,7 @@ HAS_PSUTIL = True except ImportError: HAS_PSUTIL = False +import pickle @unittest.skipIf(not HAS_PSUTIL, "Requires psutil to run") @@ -129,5 +132,98 @@ def test_kineto(self): self.assertTrue(found_memcpy) # p.export_chrome_trace("/tmp/test_trace.json") + def test_high_level_trace(self): + """Checks that python side high level events are recorded. 
+ """ + class RepeatedDataset(torch.utils.data.Dataset): + def __init__(self, N, D_in, D_out): + self.N = N + self.x = torch.randn(N, D_in) + self.y = torch.randn(N, D_out) + + def __len__(self): + return self.N + + def __getitem__(self, idx): + return self.x, self.y + + class TwoLayerNet(torch.nn.Module): + def __init__(self, D_in, H, D_out): + super(TwoLayerNet, self).__init__() + self.linear1 = torch.nn.Linear(D_in, H) + self.linear2 = torch.nn.Linear(H, D_out) + + def forward(self, x): + h_relu = self.linear1(x).clamp(min=0) + y_pred = self.linear2(h_relu) + return y_pred + + class CustomSGD(torch.optim.SGD): + def __init__(self, *args, **kwargs): + super(CustomSGD, self).__init__(*args, **kwargs) + + def train(): + for _, data in enumerate(dataloader): + x, y = data[0], data[1] + y_pred = model(x) + loss = criterion(y_pred, y) + optimizer.zero_grad() + loss.backward() + optimizer.step() + + N, D_in, H, D_out = 8, 10, 5, 2 + model = TwoLayerNet(D_in, H, D_out) + criterion = torch.nn.MSELoss(reduction='sum') + optimizer = torch.optim.SGD(model.parameters(), lr=1e-4) + ds = RepeatedDataset(N, D_in, D_out) + dataloader = torch.utils.data.DataLoader(ds, batch_size=1) + + try: + train() + except Exception: + self.assertTrue(False, "Expected no exception without profiling.") + + # Create multiple instances, expect each func is hooked only one time. + # Nested wrappers(repeated patching) will make following test fail. + optimizer_duplicate = torch.optim.SGD(model.parameters(), lr=1e-4) + dataloader_duplicate = torch.utils.data.DataLoader(ds, batch_size=1) + + def judge(expected_event_count, prof): + actual_event_count = {} + for e in prof.function_events: + if "#" in e.name: + key = e.name + if key in expected_event_count.keys(): + actual_event_count[key] = actual_event_count.setdefault(key, 0) + 1 + for key, count in expected_event_count.items(): + self.assertTrue((key in actual_event_count.keys()) and (count == actual_event_count[key])) + + with profile() as prof: + train() + expected_event_count = { + # "+1" because the final iteration will enter __next__ but skip the loop body. + "enumerate(DataLoader)#_SingleProcessDataLoaderIter.__next__": (N + 1), + "Optimizer.step#SGD.step": N, + "Optimizer.zero_grad#SGD.zero_grad": N + } + judge(expected_event_count, prof) + + # Test on pickle/unpickle. Expect to work in multi-processing. + optimizer = pickle.loads(pickle.dumps(optimizer)) + with profile() as prof: + train() + judge(expected_event_count, prof) + + # Test on customized optimizer. + optimizer = CustomSGD(model.parameters(), lr=1e-4) + with profile() as prof: + train() + expected_event_count = { + "enumerate(DataLoader)#_SingleProcessDataLoaderIter.__next__": (N + 1), + "Optimizer.step#CustomSGD.step": N, + "Optimizer.zero_grad#CustomSGD.zero_grad": N + } + judge(expected_event_count, prof) + if __name__ == '__main__': run_tests() diff --git a/torch/autograd/__init__.py b/torch/autograd/__init__.py index 71537c562013..380b24edfaab 100644 --- a/torch/autograd/__init__.py +++ b/torch/autograd/__init__.py @@ -257,3 +257,5 @@ def variable(*args, **kwargs): if kineto_available(): from torch._C._autograd import (ProfilerResult, KinetoEvent, _prepare_profiler, _enable_profiler, _disable_profiler) + +from . 
import profiler diff --git a/torch/optim/optimizer.py b/torch/optim/optimizer.py index 7d413b959415..0a302008cd22 100644 --- a/torch/optim/optimizer.py +++ b/torch/optim/optimizer.py @@ -5,6 +5,7 @@ from copy import deepcopy from itertools import chain import warnings +import functools class _RequiredParameter(object): @@ -34,6 +35,8 @@ def __init__(self, params, defaults): torch._C._log_api_usage_once("python.optimizer") self.defaults = defaults + self._hook_for_profile() + if isinstance(params, torch.Tensor): raise TypeError("params argument given to the optimizer should be " "an iterable of Tensors or dicts, but got " + @@ -60,6 +63,7 @@ def __getstate__(self): def __setstate__(self, state): self.__dict__.update(state) + self._hook_for_profile() # To support multiprocessing pickle/unpickle. def __repr__(self): format_string = self.__class__.__name__ + ' (' @@ -72,6 +76,24 @@ def __repr__(self): format_string += ')' return format_string + def _hook_for_profile(self): + self._zero_grad_profile_name = "Optimizer.zero_grad#{}.zero_grad".format(self.__class__.__name__) + + def profile_hook_step(func): + + @functools.wraps(func) + def wrapper(*args, **kwargs): + obj, *_ = args + profile_name = "Optimizer.step#{}.step".format(obj.__class__.__name__) + with torch.autograd.profiler.record_function(profile_name): + return func(*args, **kwargs) + return wrapper + + hooked = getattr(self.__class__.step, "hooked", None) + if not hooked: + self.__class__.step = profile_hook_step(self.__class__.step) + self.__class__.step.hooked = True + def state_dict(self): r"""Returns the state of the optimizer as a :class:`dict`. @@ -179,17 +201,20 @@ def zero_grad(self, set_to_none: bool = False): (in one case it does the step with a gradient of 0 and in the other it skips the step altogether). """ - for group in self.param_groups: - for p in group['params']: - if p.grad is not None: - if set_to_none: - p.grad = None - else: - if p.grad.grad_fn is not None: - p.grad.detach_() + if not hasattr(self, "_zero_grad_profile_name"): + self._hook_for_profile() + with torch.autograd.profiler.record_function(self._zero_grad_profile_name): + for group in self.param_groups: + for p in group['params']: + if p.grad is not None: + if set_to_none: + p.grad = None else: - p.grad.requires_grad_(False) - p.grad.zero_() + if p.grad.grad_fn is not None: + p.grad.detach_() + else: + p.grad.requires_grad_(False) + p.grad.zero_() def step(self, closure): r"""Performs a single optimization step (parameter update). 
diff --git a/torch/utils/data/dataloader.py b/torch/utils/data/dataloader.py index 1eb60c81f7d0..a46d01797f16 100644 --- a/torch/utils/data/dataloader.py +++ b/torch/utils/data/dataloader.py @@ -498,6 +498,7 @@ def __init__(self, loader: DataLoader) -> None: self._base_seed = torch.empty((), dtype=torch.int64).random_(generator=loader.generator).item() self._persistent_workers = loader.persistent_workers self._num_yielded = 0 + self._profile_name = "enumerate(DataLoader)#{}.__next__".format(self.__class__.__name__) def __iter__(self) -> '_BaseDataLoaderIter': return self @@ -514,22 +515,23 @@ def _next_data(self): raise NotImplementedError def __next__(self) -> Any: - if self._sampler_iter is None: - self._reset() - data = self._next_data() - self._num_yielded += 1 - if self._dataset_kind == _DatasetKind.Iterable and \ - self._IterableDataset_len_called is not None and \ - self._num_yielded > self._IterableDataset_len_called: - warn_msg = ("Length of IterableDataset {} was reported to be {} (when accessing len(dataloader)), but {} " - "samples have been fetched. ").format(self._dataset, self._IterableDataset_len_called, - self._num_yielded) - if self._num_workers > 0: - warn_msg += ("For multiprocessing data-loading, this could be caused by not properly configuring the " - "IterableDataset replica at each worker. Please see " - "https://pytorch.org/docs/stable/data.html#torch.utils.data.IterableDataset for examples.") - warnings.warn(warn_msg) - return data + with torch.autograd.profiler.record_function(self._profile_name): + if self._sampler_iter is None: + self._reset() + data = self._next_data() + self._num_yielded += 1 + if self._dataset_kind == _DatasetKind.Iterable and \ + self._IterableDataset_len_called is not None and \ + self._num_yielded > self._IterableDataset_len_called: + warn_msg = ("Length of IterableDataset {} was reported to be {} (when accessing len(dataloader)), but {} " + "samples have been fetched. ").format(self._dataset, self._IterableDataset_len_called, + self._num_yielded) + if self._num_workers > 0: + warn_msg += ("For multiprocessing data-loading, this could be caused by not properly configuring the " + "IterableDataset replica at each worker. Please see " + "https://pytorch.org/docs/stable/data.html#torch.utils.data.IterableDataset for examples.") + warnings.warn(warn_msg) + return data next = __next__ # Python 2 compatibility From e8b00023b2ac46e25f2e00592d5f8c38ff53278a Mon Sep 17 00:00:00 2001 From: Jeff Daily Date: Wed, 9 Dec 2020 00:38:47 -0800 Subject: [PATCH 065/250] [ROCm] restore autograd tests (#48431) Summary: Fixes https://github.com/pytorch/pytorch/issues/30845. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/48431 Reviewed By: zhangguanheng66 Differential Revision: D25393323 Pulled By: mruberry fbshipit-source-id: 339644abf4ad52be306007f4040c692a45998052 --- test/test_autograd.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/test/test_autograd.py b/test/test_autograd.py index 7c2082b1ed1d..796860cf639f 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -29,7 +29,7 @@ record_function, emit_nvtx) import torch.autograd.functional as autogradF from torch.utils.checkpoint import checkpoint -from torch.testing._internal.common_utils import (TEST_WITH_ROCM, TestCase, run_tests, skipIfNoLapack, +from torch.testing._internal.common_utils import (TestCase, run_tests, skipIfNoLapack, suppress_warnings, slowTest, load_tests, random_symmetric_matrix, IS_WINDOWS, IS_MACOS, CudaMemoryLeakCheck) @@ -6181,10 +6181,6 @@ def test_min_max_median_backprops_to_all_values(self, device): self.assertEqual(x.grad.sum(), 1.) self.assertEqual((x.grad == 1 / 3).sum(), 3) - # skip this test if running on rocm, because in cdist - # we use __shfl_down_sync on CUDA for fast reduction - # and it gives incorrect results on rocm platform - @skipCUDAIfRocm def test_cdist(self, device): def _test_cdist_for_size(sizex, sizey=None): if sizey is None: @@ -6268,8 +6264,6 @@ def test_parameter_resize(self, device): m = torch.cat((asd, asd)) m.sum().backward() - # NOTE: flaky on ROCm CI - @skipCUDAIfRocm def test_sparse_ctor_getter_backward(self, device): # See NOTE [ Sparse: autograd and API ] on the expected behavior of this test def _test(size, sparse_dim, nnz, device): @@ -6590,7 +6584,6 @@ def test_ctc_loss_cudnn(self, device): grad_cudnn, = torch.autograd.grad(loss_cudnn, log_probs, grad_out) self.assertEqual(grad_cudnn, grad_native, atol=1e-4, rtol=0) - @skipCUDAIfRocm def test_leaky_relu_inplace_with_neg_slope(self, device): a = torch.tensor([-1., 1.], device=device, requires_grad=True) b = torch.nn.functional.leaky_relu_(a.clone(), -2) @@ -6602,7 +6595,6 @@ def test_leaky_relu_inplace_with_neg_slope(self, device): with self.assertRaisesRegex(RuntimeError, "call out-of-place version"): b.backward(torch.ones(2, device=device)) - @skipCUDAIfRocm def test_leaky_relu_inplace_with_zero_slope(self, device): a = torch.tensor([-2., 0., 2.], device=device, requires_grad=True) b = torch.nn.functional.leaky_relu_(a.clone(), 0.0) @@ -7325,9 +7317,7 @@ def backward(ctx, *grad): instantiate_device_type_tests( TestAutogradDeviceType, globals(), - # Exclude ROCM for now, there are a lot of failures. 
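
For orientation, the call-site pattern this change introduces (used in Dispatcher.h, the autograd engine, the TorchScript interpreter, and the benchmark in this diff) is roughly the following. This is a condensed illustrative sketch assembled from the hunks below, not code taken verbatim from the patch; the function name `runWithObservers` and the op name "my_op" are placeholders:

```cpp
#include <ATen/record_function.h>

// Sketch of the pre-sampled RecordFunction pattern added by this patch.
// shouldRunRecordFunction() does a cheap kLowProb coin flip up front; when it
// returns true with pre_sampled == true, RecordFunction rescales each
// callback's sampling probability to compensate. Registering a non-sampled
// (or high-frequency) callback disables pre-sampling via
// bumpRecordAllFunctions()/releaseRecordAllFunctions().
void runWithObservers() {
  bool pre_sampled = false;
  if (at::shouldRunRecordFunction(&pre_sampled)) {
    at::RecordFunction guard(at::RecordScope::USER_SCOPE, pre_sampled);
    if (guard.isActive()) {
      guard.before("my_op", /*sequence_nr=*/-1);
    }
    // ... do the actual work while the guard is alive ...
  }
  // otherwise: fast path, no observer callbacks run for this call
}
```

(Note: this change is reverted later in this series by the "Revert D25111515" patch.)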
See - # https://github.com/pytorch/pytorch/issues/30845 - except_for='cuda' if TEST_WITH_ROCM else None + except_for=None ) if __name__ == '__main__': From a849f3822232770bc46433623271b52b8d397166 Mon Sep 17 00:00:00 2001 From: X Wang <24860335+xwang233@users.noreply.github.com> Date: Wed, 9 Dec 2020 00:42:12 -0800 Subject: [PATCH 066/250] skip cuda test_cholesky_solve_batched_many_batches due to illegal memory access (#48999) Summary: See https://github.com/pytorch/pytorch/issues/48996 Pull Request resolved: https://github.com/pytorch/pytorch/pull/48999 Reviewed By: zhangguanheng66 Differential Revision: D25390070 Pulled By: mruberry fbshipit-source-id: cf59130f6189ab8c2dade6a6a4de2f69753a5e36 --- test/test_linalg.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/test_linalg.py b/test/test_linalg.py index b6ff817a59fa..062ee33b3ec4 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -1895,6 +1895,7 @@ def test_cholesky_solve_batched_non_contiguous(self, device, dtype): self.assertEqual(x, x_exp) @slowTest + @skipCUDAIf(True, "See https://github.com/pytorch/pytorch/issues/48996") @skipCUDAIfNoMagma @skipCPUIfNoLapack @dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128) From a20d4511e40264a900c62ff9631fc063c40ef41e Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 9 Dec 2020 00:46:41 -0800 Subject: [PATCH 067/250] [PyTorch] TensorImpl::is_non_overlapping_and_dense_ should default to true (#48625) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48625 The default TensorImpl is contiguous. Therefore, it is non-overlapping and dense per refresh_contiguous(). ghstack-source-id: 118035410 Test Plan: CI Reviewed By: ezyang Differential Revision: D25232196 fbshipit-source-id: 1968d9ed444f2ad5414a78d0b11e5d3030e3109d --- aten/src/ATen/SparseTensorImpl.cpp | 2 ++ c10/core/TensorImpl.h | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/aten/src/ATen/SparseTensorImpl.cpp b/aten/src/ATen/SparseTensorImpl.cpp index 45492d7b212e..8d7d4b2ce0f8 100644 --- a/aten/src/ATen/SparseTensorImpl.cpp +++ b/aten/src/ATen/SparseTensorImpl.cpp @@ -46,6 +46,8 @@ SparseTensorImpl::SparseTensorImpl(at::DispatchKeySet key_set, const caffe2::Typ AT_ASSERT(values_.sizes() == IntArrayRef({0})); AT_ASSERT(values_.device() == indices_.device()); AT_ASSERT(values_.device() == device()); + + is_non_overlapping_and_dense_ = false; } IntArrayRef SparseTensorImpl::strides() const { diff --git a/c10/core/TensorImpl.h b/c10/core/TensorImpl.h index 269976a7e148..5deab2a09832 100644 --- a/c10/core/TensorImpl.h +++ b/c10/core/TensorImpl.h @@ -1706,7 +1706,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { is_channels_last_contiguous_ = false; is_channels_last_3d_ = false; is_channels_last_3d_contiguous_ = false; - is_non_overlapping_and_dense_ = false; + is_non_overlapping_and_dense_ = true; is_wrapped_number_ = false; allow_tensor_metadata_change_ = true; reserved_ = false; From 09b974c2d5e25432d0e4c676ee16cecdea20c7e5 Mon Sep 17 00:00:00 2001 From: Ilia Cherniavskii Date: Wed, 9 Dec 2020 02:27:24 -0800 Subject: [PATCH 068/250] Extra sampling of record function events (#48289) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48289 Adding extra sampling step when dispatching RecordFunction. (Note: this ignores all push blocking failures!) 
Reviewed By: swolchok Differential Revision: D25111515 Pulled By: ilia-cher fbshipit-source-id: 0d572a3636fe649a47ec47901826bbfc08368937 --- aten/src/ATen/ThreadLocalState.cpp | 1 + aten/src/ATen/ThreadLocalState.h | 24 ++++- aten/src/ATen/core/dispatch/Dispatcher.h | 81 +++++++++------- aten/src/ATen/record_function.cpp | 114 ++++++++++++++++++----- aten/src/ATen/record_function.h | 27 +++++- binaries/record_function_benchmark.cc | 101 ++++++++++---------- torch/csrc/autograd/function.h | 39 ++++---- torch/csrc/jit/runtime/interpreter.cpp | 5 +- 8 files changed, 265 insertions(+), 127 deletions(-) diff --git a/aten/src/ATen/ThreadLocalState.cpp b/aten/src/ATen/ThreadLocalState.cpp index 6d74e2f47ce0..3c7b9b6ff5bc 100644 --- a/aten/src/ATen/ThreadLocalState.cpp +++ b/aten/src/ATen/ThreadLocalState.cpp @@ -19,6 +19,7 @@ ThreadLocalState::ThreadLocalState(bool keep_grad_mode) grad_mode_enabled_ = GradMode::is_enabled(); } #endif + bumped_record_all_functions_ = at::checkRecordAllFunctions(); } /* static */ diff --git a/aten/src/ATen/ThreadLocalState.h b/aten/src/ATen/ThreadLocalState.h index f0cb85f0ff84..3c9b55b3d8d6 100644 --- a/aten/src/ATen/ThreadLocalState.h +++ b/aten/src/ATen/ThreadLocalState.h @@ -38,6 +38,9 @@ class TORCH_API ThreadLocalState { bool grad_mode_enabled_; #endif + // Whether pre-sampling RecordFunction optimization was enabled + bool bumped_record_all_functions_ = false; + friend class ThreadLocalStateGuard; }; @@ -45,7 +48,21 @@ class TORCH_API ThreadLocalState { class TORCH_API ThreadLocalStateGuard { public: explicit ThreadLocalStateGuard(const ThreadLocalState& state) - : prev_state_(ThreadLocalState()) { + : prev_state_(ThreadLocalState()), + bumped_record_all_functions_(state.bumped_record_all_functions_) { + // Special handling of RecordFunction pre-sampling optimization: + // pre-samping is enabled (bumped) when there're non-sampled + // (or high-frequency) global or TLS callbacks. + // + // ThreadLocalStateGuard simply resets RecordFunction's TLS and + // hence its thread local callbacks. 
+ // + // Checking if the pre-sampling was enabled and preserving it in the + // async task by calling bumpRecordAllFunctions() and the corresponding + // releaseRecordAllFunctions() + if (bumped_record_all_functions_) { + at::bumpRecordAllFunctions(); + } // set the given state across the thread boundary ThreadLocalState::setThreadLocalState(state); } @@ -53,10 +70,15 @@ class TORCH_API ThreadLocalStateGuard { ~ThreadLocalStateGuard() { // restore previously set variables ThreadLocalState::setThreadLocalState(prev_state_); + if (bumped_record_all_functions_) { + at::releaseRecordAllFunctions(); + } } private: const ThreadLocalState prev_state_; + // Whether pre-sampling RecordFunction optimization was enabled + bool bumped_record_all_functions_ = false; }; template diff --git a/aten/src/ATen/core/dispatch/Dispatcher.h b/aten/src/ATen/core/dispatch/Dispatcher.h index 632739053c42..f83302e2d819 100644 --- a/aten/src/ATen/core/dispatch/Dispatcher.h +++ b/aten/src/ATen/core/dispatch/Dispatcher.h @@ -371,28 +371,39 @@ inline Return Dispatcher::callWithDispatchKey(const TypedOperatorHandleop.lookup(dispatchKey); #ifndef PYTORCH_DISABLE_PER_OP_PROFILING - // Check if we need to run callbacks registered with RecordFunction - // If true and callbacks need inputs, we box the arguments and pass - // them into the callbacks and also into the kernel call - - // Note: for perf reasons we wouldn't want to pass arguments into - // the function call or prematurely box them - at::RecordFunction guard(at::RecordScope::FUNCTION); - if (C10_UNLIKELY(guard.isActive())) { - if (shouldRecord(dispatchKey) && op.operatorIterator_->op.isObserved()) { - int64_t seq_num = -1; - // Setting sequence number in the Autograd case to associate - // the forward range with the coresponding Autograd's node - if (isIncludedInAlias(dispatchKey, DispatchKey::Autograd) && at::GradMode::is_enabled()) { - seq_num = at::sequence_number::peek(); - } - if (guard.needsInputs()) { - torch::jit::Stack stack = impl::boxArgs(args...); - guard.before(op, stack, seq_num); - } else { - guard.before(op, seq_num); + // By default, when there're no high-frequency or non-sampled callbacks, + // RecordFunction is pre-sampled as a perf optimization; + // shouldRunRecordFunction checks whether RecordFunction should be executed, + // and sets pre_sampled boolean argument value to whether pre-sampling was used - + // this boolean is passed into RecordFunction to adjust the sampling rates of + // the callbacks + bool pre_sampled = false; + if (C10_UNLIKELY(at::shouldRunRecordFunction(&pre_sampled))) { + // Check if we need to run callbacks registered with RecordFunction + // If true and callbacks need inputs, we box the arguments and pass + // them into the callbacks and also into the kernel call + + // Note: for perf reasons we wouldn't want to pass arguments into + // the function call or prematurely box them + at::RecordFunction guard(at::RecordScope::FUNCTION, pre_sampled); + if (C10_UNLIKELY(guard.isActive())) { + if (shouldRecord(dispatchKey) && op.operatorIterator_->op.isObserved()) { + int64_t seq_num = -1; + // Setting sequence number in the Autograd case to associate + // the forward range with the coresponding Autograd's node + if (isIncludedInAlias(dispatchKey, DispatchKey::Autograd) && at::GradMode::is_enabled()) { + seq_num = at::sequence_number::peek(); + } + if (guard.needsInputs()) { + torch::jit::Stack stack = impl::boxArgs(args...); + guard.before(op, stack, seq_num); + } else { + guard.before(op, seq_num); + } } } + // keeping the 
guard alive while executing the kernel + return kernel.template call(op, std::forward(args)...); } #endif // PYTORCH_DISABLE_PER_OP_PROFILING return kernel.template call(op, std::forward(args)...); @@ -429,20 +440,26 @@ inline void Dispatcher::callBoxed(const OperatorHandle& op, Stack* stack) const const auto& kernel = entry.lookup(dispatchKey); #ifndef PYTORCH_DISABLE_PER_OP_PROFILING - // using already existing stack to record function execution in observers - at::RecordFunction guard(at::RecordScope::FUNCTION); - if (C10_UNLIKELY(guard.isActive())) { - if (shouldRecord(dispatchKey) && entry.isObserved()) { - int64_t seq_num = -1; - if (isIncludedInAlias(dispatchKey, DispatchKey::Autograd) && at::GradMode::is_enabled()) { - seq_num = at::sequence_number::peek(); - } - if (guard.needsInputs()) { - guard.before(op, *stack, seq_num); - } else { - guard.before(op, seq_num); + bool pre_sampled = false; + if (C10_UNLIKELY(at::shouldRunRecordFunction(&pre_sampled))) { + // using already existing stack to record function execution in observers + at::RecordFunction guard(at::RecordScope::FUNCTION, pre_sampled); + if (C10_UNLIKELY(guard.isActive())) { + if (shouldRecord(dispatchKey) && entry.isObserved()) { + int64_t seq_num = -1; + if (isIncludedInAlias(dispatchKey, DispatchKey::Autograd) && at::GradMode::is_enabled()) { + seq_num = at::sequence_number::peek(); + } + if (guard.needsInputs()) { + guard.before(op, *stack, seq_num); + } else { + guard.before(op, seq_num); + } } } + // keeping the guard alive while executing the kernel + kernel.callBoxed(op, stack); + return; } #endif // PYTORCH_DISABLE_PER_OP_PROFILING kernel.callBoxed(op, stack); diff --git a/aten/src/ATen/record_function.cpp b/aten/src/ATen/record_function.cpp index 102931fd4aa7..32897dd7783f 100644 --- a/aten/src/ATen/record_function.cpp +++ b/aten/src/ATen/record_function.cpp @@ -30,8 +30,6 @@ std::atomic defaultNodeId(-1); std::atomic next_thread_id_ {0}; thread_local uint64_t current_thread_id_ = 0; -thread_local bool tls_record_function_enabled_ = true; - // Low probability constant static const double kLowProb = 0.001; struct CoinflipTLS { @@ -68,6 +66,10 @@ void set_record_function_tls_(const RecordFunctionTLS& tls) { class CallbackManager { public: CallbackHandle addThreadLocalCallback(RecordFunctionCallback cb) { + if (cb.samplingProb() > kLowProb) { + // pre-sampling of RecordFunction with prob. kLowProb cannot be used + at::bumpRecordAllFunctions(); + } // note: monotonically increasing callbacks_unique_id keeps // sorted_tls_callbacks_ sorted auto handle = next_unique_callback_handle(); @@ -76,6 +78,10 @@ class CallbackManager { } CallbackHandle addGlobalCallback(RecordFunctionCallback cb) { + if (cb.samplingProb() > kLowProb) { + // pre-sampling of RecordFunction with prob. kLowProb cannot be used + at::bumpRecordAllFunctions(); + } auto handle = next_unique_callback_handle(); sorted_global_callbacks_.emplace_back(std::move(cb), handle); return handle; @@ -92,6 +98,10 @@ class CallbackManager { return el.second == handle; }); if (it != cbs.end()) { + if (it->first.samplingProb() > kLowProb) { + // try to restore pre-sampling of RecordFunction + at::releaseRecordAllFunctions(); + } // keeps it sorted cbs.erase(it); return true; @@ -127,7 +137,13 @@ class CallbackManager { // callbackShouldRun is even hotter because it's called multiple // times per init(). Profiling shows that the function prologue is // taking up a significant fraction of the time. 
- static bool C10_ALWAYS_INLINE callbackShouldRun(const RecordFunctionCallback& cb, RecordScope scope) { + static bool C10_ALWAYS_INLINE callbackShouldRun( + const RecordFunctionCallback& cb, RecordScope scope, bool pre_sampled) { + TORCH_INTERNAL_ASSERT( + !pre_sampled || (cb.sampling_prob_ <= kLowProb), + "Incorrect usage of a pre-sampled RecordFunction with a high-frequency " + " or non-sampled callback"); + // first check whether this callback is interested in // the given scope type if (!cb.checkScope(scope)) { @@ -138,36 +154,45 @@ class CallbackManager { return cb.should_run_(cb); } - if (cb.sampling_prob_ == 1.0) { - return true; + // otherwise potentially do the sampling + double sampling_prob = cb.sampling_prob_; + if (pre_sampled) { + // adjust the sampling rate to account for kLowProb pre-sampling of + // the RecordFunction + sampling_prob /= kLowProb; } - // model the low probability events as events happening - // with probability kLowProb followed by another sampling with - // probability (sampling_prob__ / kLowProb), then replace the coin - // flip for kLowProb with a thread local number of tries tries_left_ - // sampled from the geometric distribution. - if (cb.sampling_prob_ < kLowProb) { - if (coinflip_tls_.tries_left_ == 0) { - coinflip_tls_.tries_left_ = sample_geometric(); - return (sample_zero_one() < cb.sampling_prob_ / kLowProb); + + if (sampling_prob < 1.0) { + // model the low probability events as events happening + // with probability kLowProb followed by another sampling with + // probability (sampling_prob / kLowProb), then replace the coin + // flip for kLowProb with a thread local number of tries tries_left_ + // sampled from the geometric distribution. + if (sampling_prob < kLowProb) { + if (coinflip_tls_.tries_left_ == 0) { + coinflip_tls_.tries_left_ = sample_geometric(); + return (sample_zero_one() < sampling_prob / kLowProb); + } else { + --coinflip_tls_.tries_left_; + return false; + } } else { - --coinflip_tls_.tries_left_; - return false; + return (sample_zero_one() < sampling_prob); } - } else { - return (sample_zero_one() < cb.sampling_prob_); } + + return true; } // init is called by RecordFunction in constructor to // determine which thread local and global callbacks are going // to be executed and whether any of them need inputs - inline void init(RecordFunction& rec_fn, RecordScope scope) { + inline void init(RecordFunction& rec_fn, RecordScope scope, bool pre_sampled) { bool found_needs_inputs = false; bool found_needs_ids = false; for (const auto& cb: rf_tls_.sorted_tls_callbacks_) { - if (callbackShouldRun(cb.first, scope)) { + if (callbackShouldRun(cb.first, scope, pre_sampled)) { if (cb.first.needsInputs()) { found_needs_inputs = true; } @@ -182,7 +207,7 @@ class CallbackManager { } for (const auto& cb: sorted_global_callbacks_) { - if (callbackShouldRun(cb.first, scope)) { + if (callbackShouldRun(cb.first, scope, pre_sampled)) { if (cb.first.needsInputs()) { found_needs_inputs = true; } @@ -308,7 +333,6 @@ namespace { } } // namespace - RecordFunctionCallbacks _getTLSCallbacks() { return rf_tls_.sorted_tls_callbacks_; } @@ -374,12 +398,12 @@ void enableRecordFunction(bool enable) { rf_tls_.tls_record_function_enabled_ = enable; } -RecordFunction::RecordFunction(RecordScope scope) { +RecordFunction::RecordFunction(RecordScope scope, bool pre_sampled) { auto* rf_tls_ptr = &rf_tls_; if (rf_tls_ptr->tls_record_function_enabled_) { auto& m = manager(); if (!m.sorted_global_callbacks_.empty() || !rf_tls_ptr->sorted_tls_callbacks_.empty()) { - 
m.init(*this, scope); + m.init(*this, scope, pre_sampled); } } } @@ -451,4 +475,46 @@ void RecordFunction::end() { } } +// RecordFunction pre-sampling +namespace { +// Whether to try to create RecordFunction on each call (>0) or +// use pre-sampling (=0) +std::atomic global_record_all_functions_ {0}; +} + +void bumpRecordAllFunctions() { + global_record_all_functions_.fetch_add(1, std::memory_order_relaxed); +} + +void releaseRecordAllFunctions() { + TORCH_CHECK(global_record_all_functions_.fetch_sub(1, std::memory_order_relaxed) >= 0); +} + +bool checkRecordAllFunctions() { + return (global_record_all_functions_.load(std::memory_order_relaxed) > 0); +} + +bool shouldRunRecordFunction(bool* pre_sampled) { + auto* rf_tls_ptr = &rf_tls_; + if (!rf_tls_ptr->tls_record_function_enabled_) { + *pre_sampled = false; + return false; + } + + if (global_record_all_functions_.load(std::memory_order_relaxed) > 0) { + *pre_sampled = false; + return true; + } + + *pre_sampled = true; + auto* coinflip_tls_ptr = &coinflip_tls_; + if (coinflip_tls_ptr->tries_left_ == 0) { + coinflip_tls_ptr->tries_left_ = sample_geometric(); + return true; + } else { + --coinflip_tls_ptr->tries_left_; + return false; + } +} + } // namespace at diff --git a/aten/src/ATen/record_function.h b/aten/src/ATen/record_function.h index 4b07d13aa747..843e3850d498 100644 --- a/aten/src/ATen/record_function.h +++ b/aten/src/ATen/record_function.h @@ -90,8 +90,11 @@ typedef uint64_t RecordFunctionHandle; struct TORCH_API RecordFunction { // Default constructor is used with before function called afterwards: // scope - record scope that this function tracks + // pre_sampled - whether this RecordFunction was already pre-sampled with + // kLowProb probability RecordFunction( - RecordScope scope = RecordScope::FUNCTION); + RecordScope scope = RecordScope::FUNCTION, + bool pre_sampled = false); template void before( @@ -238,6 +241,9 @@ struct TORCH_API RecordFunction { // flag is used to check whether the start callbacks were called bool called_start_callbacks_ = false; + // Whether the RecordFunction is pre-sampled + bool pre_sampled_ = false; + // Used internally to keep track of thread local and global callbacks // that were picked to run; must be sorted; CallbackHandles sorted_active_tls_handles_; @@ -330,7 +336,7 @@ class TORCH_API RecordFunctionCallback { } RecordFunctionCallback& samplingProb(double sampling_prob) { - TORCH_CHECK(sampling_prob >= 0.0 && sampling_prob_ <= 1.0, + TORCH_CHECK(sampling_prob >= 0.0 && sampling_prob <= 1.0, "Invalid sampling probability"); sampling_prob_ = sampling_prob; return *this; @@ -544,10 +550,27 @@ struct TORCH_API RecordFunctionTLS { RecordFunctionCallbacks sorted_tls_callbacks_; bool tls_record_function_enabled_ = true; + + // Stores the number of coin flips before the next successful coin flip + int tries_left_ = 0; }; TORCH_API const RecordFunctionTLS& get_record_function_tls_(); TORCH_API void set_record_function_tls_(const RecordFunctionTLS& tls); +// Checks whether RecordFunction should be called, +// sets boolean pointed by the argument to whether pre-sampling was used +TORCH_API bool shouldRunRecordFunction(bool*); + +// The following functions are used to disable/enable pre-sampling of RecordFunction +// when high-frequency/non-sampled callbacks are added/removed. +// Note: every call to bumpRecordAllFunctions() is supposed to be matched with +// the corresponding releaseRecordAllFunctions() call. 
+// Note: disabling pre-sampling of RecordFunction incurs an extra overhead, since +// RecordFunction will be created for each operator call. +TORCH_API void bumpRecordAllFunctions(); +TORCH_API void releaseRecordAllFunctions(); +TORCH_API bool checkRecordAllFunctions(); + } // namespace at diff --git a/binaries/record_function_benchmark.cc b/binaries/record_function_benchmark.cc index d924003b9270..53a8bd16f43d 100644 --- a/binaries/record_function_benchmark.cc +++ b/binaries/record_function_benchmark.cc @@ -7,61 +7,55 @@ #include #include -C10_DEFINE_int(iter, 100, "Number of iterations"); -C10_DEFINE_int(warmup_iter, 10, "Number of warmup iterations"); +C10_DEFINE_int(iter, 10000, "Number of iterations"); C10_DEFINE_int(sampled_iter, 10e6, "Number of iterations for the sampled observer benchmark"); namespace { -const int kInnerIter = 100; -const int kNumSampledCb = 2; const int kTensorSize = 16; const int kSmallTensorSize = 1; -const float kSampingProb = 0.1; - const float kLowSamplingProb = 0.0001; } -void setupBenchmarkCallbacks() { - at::enableRecordFunction(); - at::clearCallbacks(); - // non-sampled callback - at::addGlobalCallback(at::RecordFunctionCallback( - [&](const at::RecordFunction& fn) {}, +void addTestCallback( + double sampling_prob = 1.0, + std::function fn = + [](const at::RecordFunction&) {}) { + auto cb = at::RecordFunctionCallback( + std::move(fn), [](const at::RecordFunction&) {}) - .needsInputs(true)); - - // sampled - for (auto idx = 0; idx < kNumSampledCb; ++idx) { - at::addGlobalCallback(at::RecordFunctionCallback( - [](const at::RecordFunction& fn) {}, - [](const at::RecordFunction&) {}) - .needsInputs(true) - .samplingProb(kSampingProb) - ); + .needsInputs(false); + if (sampling_prob < 1.0) { + cb.samplingProb(sampling_prob); } + at::addGlobalCallback(cb); } -float runTensorBench(int tensor_size, int outer_iter) { +float runTensorGEMMBench(int tensor_size, int iter) { typedef std::chrono::high_resolution_clock clock; typedef std::chrono::microseconds us; std::chrono::time_point start_time = clock::now(); - for (auto idx = 0; idx < kInnerIter * outer_iter; ++idx) { - torch::mm( - torch::randn({tensor_size, tensor_size}), - torch::randn({tensor_size, tensor_size})); + auto inp = torch::randn({tensor_size, tensor_size}); + for (auto idx = 0; idx < iter; ++idx) { + torch::mm(inp, inp); } auto duration = static_cast( std::chrono::duration_cast(clock::now() - start_time).count()); return duration; } -float runPureRecordFunctionBench(int outer_iter) { +float runPureRecordFunctionBench(int iter) { typedef std::chrono::high_resolution_clock clock; typedef std::chrono::microseconds us; std::chrono::time_point start_time = clock::now(); - for (auto n = 0; n < outer_iter; ++n) { - RECORD_USER_SCOPE("test"); + for (auto idx = 0; idx < iter; ++idx) { + bool pre_sampled = false; + if (at::shouldRunRecordFunction(&pre_sampled)) { + at::RecordFunction guard(at::RecordScope::USER_SCOPE, pre_sampled); + if (C10_UNLIKELY(guard.isActive())) { + guard.before("Test", -1); + } + } } auto duration = static_cast( std::chrono::duration_cast(clock::now() - start_time).count()); @@ -71,18 +65,19 @@ float runPureRecordFunctionBench(int outer_iter) { void runBenchmark() { float duration = 0; for (auto tensor_size : std::set({kSmallTensorSize, kTensorSize})) { - duration = runTensorBench(tensor_size, FLAGS_iter); - std::cout << "Running tensor benchmark, time per iteration (" + duration = runTensorGEMMBench(tensor_size, FLAGS_iter); + std::cout << "Tensor GEMM benchmark (" << tensor_size << 
"x" << tensor_size - << "): " << (duration/FLAGS_iter) + << ", " << FLAGS_iter << "): " << duration << " us." << std::endl; } - duration = runPureRecordFunctionBench(FLAGS_iter * 100); - std::cout << "Running pure RecordFunction benchmark, time per iteration: " - << (duration/FLAGS_iter) - << " us." << std::endl; + duration = runPureRecordFunctionBench(FLAGS_iter); + std::cout << "Pure RecordFunction benchmark (" + << FLAGS_iter << "): " + << duration + << " us." << std::endl; } int main(int argc, char** argv) { @@ -91,32 +86,38 @@ int main(int argc, char** argv) { return -1; } - auto duration = runTensorBench(kSmallTensorSize, FLAGS_warmup_iter); - std::cout << "Warmup time: " << duration << " us." << std::endl; + at::enableRecordFunction(); + at::clearCallbacks(); - setupBenchmarkCallbacks(); - std::cout << "Running with empty observers" << std::endl; + std::cout << "Warm up" << std::endl; runBenchmark(); - at::clearCallbacks(); std::cout << "Running without observers" << std::endl; runBenchmark(); - std::cout << "Running sampled observer benchmark" << std::endl; + addTestCallback(); + std::cout << "Running with empty non-sampled observer" << std::endl; + runBenchmark(); + at::clearCallbacks(); + + addTestCallback(kLowSamplingProb); + std::cout << "Running with empty sampled observer" << std::endl; + runBenchmark(); + at::clearCallbacks(); + + std::cout << "Checking number of sampled observer invocations" << std::endl; int cb_count = 0; - at::addGlobalCallback(at::RecordFunctionCallback( + addTestCallback( + kLowSamplingProb, [&](const at::RecordFunction& fn) { ++cb_count; - }, - [](const at::RecordFunction&) {}) - .needsInputs(true) - .samplingProb(kLowSamplingProb) + } ); - runPureRecordFunctionBench(FLAGS_sampled_iter); + auto duration = runPureRecordFunctionBench(FLAGS_sampled_iter); std::cout << "Pure RecordFunction runtime of " << FLAGS_sampled_iter - << " iterations " << duration + << " iterations: " << duration << " us, number of callback invocations: " << cb_count << ", expected number: ~" << (int)(FLAGS_sampled_iter * kLowSamplingProb) << " invocations" << std::endl; diff --git a/torch/csrc/autograd/function.h b/torch/csrc/autograd/function.h index 09dc048f214b..44171e1a3b1b 100644 --- a/torch/csrc/autograd/function.h +++ b/torch/csrc/autograd/function.h @@ -133,26 +133,33 @@ struct TORCH_API Node : std::enable_shared_from_this { /// Evaluates the function on the given inputs and returns the result of the /// function call. variable_list operator()(variable_list&& inputs) { - // Using RecordFunction to trogger observers in the backward pass - at::RecordFunction guard(at::RecordScope::BACKWARD_FUNCTION); - if (guard.isActive()) { - // Using sequence number and thread id to correlate with - // the forward pass function - guard.setForwardThreadId(thread_id_); - if (guard.needsInputs()) { - guard.before( - name(), - std::vector(inputs.begin(), inputs.end()), - sequence_nr()); - } else { - guard.before(name(), sequence_nr()); - } - } // In the first iteration of named tensors, autograd ignores names and // operates on unnamed tensors. In the long term, autograd should // probably operate with names. 
at::NoNamesGuard no_names_guard; - return apply(std::move(inputs)); + + bool pre_sampled = false; + if (at::shouldRunRecordFunction(&pre_sampled)) { + // Using RecordFunction to trogger observers in the backward pass + at::RecordFunction guard(at::RecordScope::BACKWARD_FUNCTION, pre_sampled); + if (guard.isActive()) { + // Using sequence number and thread id to correlate with + // the forward pass function + guard.setForwardThreadId(thread_id_); + if (guard.needsInputs()) { + guard.before( + name(), + std::vector(inputs.begin(), inputs.end()), + sequence_nr()); + } else { + guard.before(name(), sequence_nr()); + } + } + // keeping stack guard object alive during the call + return apply(std::move(inputs)); + } else { + return apply(std::move(inputs)); + } } // Graph Connectivity API diff --git a/torch/csrc/jit/runtime/interpreter.cpp b/torch/csrc/jit/runtime/interpreter.cpp index ef0f2dae9e0e..d07da80e4cd3 100644 --- a/torch/csrc/jit/runtime/interpreter.cpp +++ b/torch/csrc/jit/runtime/interpreter.cpp @@ -1607,10 +1607,11 @@ struct InterpreterStateImpl : c10::intrusive_ptr_target { } static void checkAndStartRecordFunction(Frame& frame, Stack& stack) { + bool pre_sampled = false; if (!frame.record_function && at::hasCallbacks() && - at::isRecordFunctionEnabled()) { + at::shouldRunRecordFunction(&pre_sampled)) { auto rec_fn = std::make_unique( - at::RecordScope::TORCHSCRIPT_FUNCTION); + at::RecordScope::TORCHSCRIPT_FUNCTION, pre_sampled); if (rec_fn->isActive()) { if (rec_fn->needsInputs()) { rec_fn->before( From 71cfb73755f6e978828683a54468ce48fb44d44f Mon Sep 17 00:00:00 2001 From: Sidney Fletcher Date: Wed, 9 Dec 2020 05:09:42 -0800 Subject: [PATCH 069/250] Add complex support to broadcast_coalesced (#48686) Summary: Fixes https://github.com/pytorch/pytorch/issues/47330 Add support for DataParallel complex tensors by handling them as `torch.view_as_real` for `broadcast_coalesced`, `scatter` and `gather` Pull Request resolved: https://github.com/pytorch/pytorch/pull/48686 Reviewed By: osalpekar Differential Revision: D25261533 Pulled By: sidneyfletcher fbshipit-source-id: 3a25e05deee43e053f40d1068fc5c7867cfa9686 --- test/distributed/test_data_parallel.py | 20 ++++++++++++++++++++ torch/_utils.py | 10 ++++++++++ torch/nn/parallel/comm.py | 8 ++++++-- 3 files changed, 36 insertions(+), 2 deletions(-) diff --git a/test/distributed/test_data_parallel.py b/test/distributed/test_data_parallel.py index 1bfa3922bd94..f3161a1f8cb1 100644 --- a/test/distributed/test_data_parallel.py +++ b/test/distributed/test_data_parallel.py @@ -18,6 +18,7 @@ torch.set_default_dtype(torch.double) +NO_NCCL = not hasattr(torch.distributed, "ProcessGroupNCCL") class TestDataParallel(TestCase): @@ -597,6 +598,25 @@ def test_scatter_cpu(self): def test_scatter_gpu(self): self._test_scatter(torch.randn((4, 4)).cuda()) + @unittest.skipIf(not TEST_MULTIGPU, "At least 2 CUDA GPUS needed") + @unittest.skipIf(NO_NCCL, "NCCL needed") + def test_data_parallel_complex(self): + # We expect complex parameters to be broadcast by view_as_real, e.g. 
move from C to R^2 + class Cplx(torch.nn.Module): + def __init__(self): + super().__init__() + self.cplx = torch.nn.Parameter(torch.zeros(1, 10, dtype=torch.cfloat).cuda()) + + def forward(self, x): + return x + self.cplx + + cplx = torch.nn.DataParallel(Cplx().cuda()) + input = torch.rand(1, 10, dtype=torch.cfloat).cuda() + result = cplx(input) + # 2 is the extra real view dimension here + self.assertEqual(result.size(), torch.Size([1, 10, 2])) + self.assertEqual(result, torch.view_as_real(input)) + def _test_gather(self, output_device): inputs = ( torch.randn(2, 4, device='cuda:0', requires_grad=True), diff --git a/torch/_utils.py b/torch/_utils.py index 6336e2d937d7..fbee17167b56 100644 --- a/torch/_utils.py +++ b/torch/_utils.py @@ -7,6 +7,7 @@ import traceback + def _type(self, dtype=None, non_blocking=False, **kwargs): """Returns the type if `dtype` is not provided, else casts this object to the specified type. @@ -491,3 +492,12 @@ def _get_device_index(device, optional=False, allow_cpu=False) -> int: raise ValueError('Expected a torch.device with a specified index ' 'or an integer, but got:{}'.format(device)) return device_idx + + +def _handle_complex(tensor): + """ + Returns a real view of a tensor if complex dtype else just the tensor + need to check if a UninitializedParameter because otherwise checking is_complex is an error for a LazyModule + """ + return torch.view_as_real(tensor) if not isinstance(tensor, + torch.nn.UninitializedParameter) and tensor.is_complex() else tensor diff --git a/torch/nn/parallel/comm.py b/torch/nn/parallel/comm.py index 331d3885bd30..dacd74a2fba0 100644 --- a/torch/nn/parallel/comm.py +++ b/torch/nn/parallel/comm.py @@ -2,10 +2,9 @@ import torch from torch.cuda import nccl from torch._utils import _take_tensors, _flatten_dense_tensors, \ - _unflatten_dense_tensors, _reorder_tensors_as, _get_device_index + _unflatten_dense_tensors, _reorder_tensors_as, _get_device_index, _handle_complex from typing import List - def broadcast(tensor, devices=None, *, out=None): r"""Broadcasts a tensor to specified GPU devices. @@ -27,6 +26,7 @@ def broadcast(tensor, devices=None, *, out=None): a tuple containing :attr:`out` tensors, each containing a copy of :attr:`tensor`. """ + tensor = _handle_complex(tensor) if not ((devices is None) ^ (out is None)): raise RuntimeError( "Exactly one of 'devices' and 'out' must be specified, but got " @@ -54,6 +54,7 @@ def broadcast_coalesced(tensors, devices, buffer_size=10485760): A tuple containing copies of :attr:`tensor`, placed on :attr:`devices`. """ devices = [_get_device_index(d) for d in devices] + tensors = [_handle_complex(t) for t in tensors] return torch._C._broadcast_coalesced(tensors, devices, buffer_size) @@ -182,6 +183,7 @@ def scatter(tensor, devices=None, chunk_sizes=None, dim=0, streams=None, *, out= a tuple containing :attr:`out` tensors, each containing a chunk of :attr:`tensor`. """ + tensor = _handle_complex(tensor) if out is None: devices = [_get_device_index(d) for d in devices] return tuple(torch._C._scatter(tensor, devices, chunk_sizes, dim, streams)) @@ -196,6 +198,7 @@ def scatter(tensor, devices=None, chunk_sizes=None, dim=0, streams=None, *, out= "but got chunk_sizes={}".format(chunk_sizes)) return tuple(torch._C._scatter_out(tensor, out, dim, streams)) + def gather(tensors, dim=0, destination=None, *, out=None): r"""Gathers tensors from multiple GPU devices. 
@@ -222,6 +225,7 @@ def gather(tensors, dim=0, destination=None, *, out=None): the :attr:`out` tensor, now containing results of concatenating :attr:`tensors` along :attr:`dim`. """ + tensors = [_handle_complex(t) for t in tensors] if out is None: if destination == -1: warnings.warn( From 4b26cafb8fa7eef7cbfdc0327f85f30e0a38e8ec Mon Sep 17 00:00:00 2001 From: Brian Hirsh Date: Wed, 9 Dec 2020 08:09:08 -0800 Subject: [PATCH 070/250] make validate debug-only in Device copy ctr (#47854) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/47854 Test Plan: Imported from OSS Reviewed By: ezyang Differential Revision: D25003113 Pulled By: bdhirsh fbshipit-source-id: e17e6495db65c48c7daf3429acbd86742286a1f3 --- c10/core/Device.h | 8 ++++++-- test/test_torch.py | 4 ---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/c10/core/Device.h b/c10/core/Device.h index 7827119bb0ac..04cd711c37b2 100644 --- a/c10/core/Device.h +++ b/c10/core/Device.h @@ -93,9 +93,13 @@ struct C10_API Device final { DeviceType type_; DeviceIndex index_ = -1; void validate() { - TORCH_CHECK(index_ == -1 || index_ >= 0, + // Removing these checks in release builds noticeably improves + // performance in micro-benchmarks. + // This is safe to do, because backends that use the DeviceIndex + // have a later check when we actually try to switch to that device. + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(index_ == -1 || index_ >= 0, "Device index must be -1 or non-negative, got ", (int)index_); - TORCH_CHECK(!is_cpu() || index_ <= 0, + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!is_cpu() || index_ <= 0, "CPU device index must be -1 or zero, got ", (int)index_); } }; diff --git a/test/test_torch.py b/test/test_torch.py index ad88128617c9..4b4e28583d02 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -341,9 +341,6 @@ def test_device(self): self.assertEqual(90, cuda90.index) self.assertRaises(RuntimeError, lambda: torch.device('cpu:-1')) - self.assertRaises(RuntimeError, lambda: torch.device('cpu:1')) - self.assertRaises(RuntimeError, lambda: torch.device('cpu', -1)) - self.assertRaises(RuntimeError, lambda: torch.device('cpu', 1)) self.assertRaises(RuntimeError, lambda: torch.device('cuda:-1')) self.assertRaises(RuntimeError, lambda: torch.device('cuda:2 ')) self.assertRaises(RuntimeError, lambda: torch.device('cuda: 2')) @@ -356,7 +353,6 @@ def test_device(self): self.assertRaises(RuntimeError, lambda: torch.device('cuda:2 cuda:3')) self.assertRaises(RuntimeError, lambda: torch.device('cuda:2+cuda:3')) self.assertRaises(RuntimeError, lambda: torch.device('cuda:2cuda:3')) - self.assertRaises(RuntimeError, lambda: torch.device('cuda', -1)) self.assertRaises(RuntimeError, lambda: torch.device(-1)) self.assertRaises(RuntimeError, lambda: torch.device('other')) From 73f7178445286b8ef133486e90092c1a04f8e0d8 Mon Sep 17 00:00:00 2001 From: Jeff Daily Date: Wed, 9 Dec 2020 08:11:17 -0800 Subject: [PATCH 071/250] remove redundant sccache wrappers from build.sh scripts (#47944) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/47944 Reviewed By: zhangguanheng66 Differential Revision: D25406873 Pulled By: walterddr fbshipit-source-id: 5441b0a304e0be1213b4e14adf26118b3e7e330b --- .jenkins/caffe2/build.sh | 43 --------------------------------------- .jenkins/pytorch/build.sh | 35 ++----------------------------- 2 files changed, 2 insertions(+), 76 deletions(-) diff --git a/.jenkins/caffe2/build.sh b/.jenkins/caffe2/build.sh index 56ce8d525f89..0a4d1166bd05 100755 --- a/.jenkins/caffe2/build.sh +++ 
b/.jenkins/caffe2/build.sh @@ -18,49 +18,6 @@ build_to_cmake () { SCCACHE="$(which sccache)" -if [ "$(which gcc)" != "/root/sccache/gcc" ]; then - # Setup SCCACHE - ############################################################################### - # Setup sccache if SCCACHE_BUCKET is set - if [ -n "${SCCACHE_BUCKET}" ]; then - mkdir -p ./sccache - - SCCACHE="$(which sccache)" - if [ -z "${SCCACHE}" ]; then - echo "Unable to find sccache..." - exit 1 - fi - - # Setup wrapper scripts - wrapped="cc c++ gcc g++ x86_64-linux-gnu-gcc" - if [[ "${BUILD_ENVIRONMENT}" == *-cuda* ]]; then - wrapped="$wrapped nvcc" - fi - for compiler in $wrapped; do - ( - echo "#!/bin/sh" - - # TODO: if/when sccache gains native support for an - # SCCACHE_DISABLE flag analogous to ccache's CCACHE_DISABLE, - # this can be removed. Alternatively, this can be removed when - # https://github.com/pytorch/pytorch/issues/13362 is fixed. - # - # NOTE: carefully quoted - we want `which compiler` to be - # resolved as we execute the script, but SCCACHE_DISABLE and - # $@ to be evaluated when we execute the script - echo 'test $SCCACHE_DISABLE && exec '"$(which $compiler)"' "$@"' - - echo "exec $SCCACHE $(which $compiler) \"\$@\"" - ) > "./sccache/$compiler" - chmod +x "./sccache/$compiler" - done - - export CACHE_WRAPPER_DIR="$PWD/sccache" - - # CMake must find these wrapper scripts - export PATH="$CACHE_WRAPPER_DIR:$PATH" - fi -fi # Setup ccache if configured to use it (and not sccache) if [ -z "${SCCACHE}" ] && which ccache > /dev/null; then diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh index e14828dc5afd..55b63d2144d0 100755 --- a/.jenkins/pytorch/build.sh +++ b/.jenkins/pytorch/build.sh @@ -1,5 +1,7 @@ #!/bin/bash +set -ex + # Required environment variable: $BUILD_ENVIRONMENT # (This is set by default in the Docker images we build, so you don't # need to set it yourself. @@ -7,13 +9,6 @@ # shellcheck disable=SC2034 COMPACT_JOB_NAME="${BUILD_ENVIRONMENT}" -# Temp: use new sccache -if [[ -n "$IN_CI" && "$BUILD_ENVIRONMENT" == *rocm* ]]; then - # Download customized sccache - sudo curl --retry 3 http://repo.radeon.com/misc/.sccache_amd/sccache -o /opt/cache/bin/sccache - sudo chmod 755 /opt/cache/bin/sccache -fi - source "$(dirname "${BASH_SOURCE[0]}")/common.sh" if [[ "$BUILD_ENVIRONMENT" == *-linux-xenial-py3-clang5-asan* ]]; then @@ -124,32 +119,6 @@ if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then export MAX_JOBS=$(($(nproc) - 1)) fi - # ROCm CI is using Caffe2 docker images, which needs these wrapper - # scripts to correctly use sccache. - if [[ -n "${SCCACHE_BUCKET}" && -z "$IN_CI" ]]; then - mkdir -p ./sccache - - SCCACHE="$(which sccache)" - if [ -z "${SCCACHE}" ]; then - echo "Unable to find sccache..." 
- exit 1 - fi - - # Setup wrapper scripts - for compiler in cc c++ gcc g++ clang clang++; do - ( - echo "#!/bin/sh" - echo "exec $SCCACHE $(which $compiler) \"\$@\"" - ) > "./sccache/$compiler" - chmod +x "./sccache/$compiler" - done - - export CACHE_WRAPPER_DIR="$PWD/sccache" - - # CMake must find these wrapper scripts - export PATH="$CACHE_WRAPPER_DIR:$PATH" - fi - if [[ -n "$IN_CI" ]]; then # Set ROCM_ARCH to gfx900 and gfx906 for CI builds echo "Limiting PYTORCH_ROCM_ARCH to gfx90[06] for CI builds" From 9f7fb546937a8968b360e878879d12816cd88d5a Mon Sep 17 00:00:00 2001 From: Mike Ruberry Date: Wed, 9 Dec 2020 08:35:08 -0800 Subject: [PATCH 072/250] Revert D25111515: Extra sampling of record function events Test Plan: revert-hammer Differential Revision: D25111515 (https://github.com/pytorch/pytorch/commit/09b974c2d5e25432d0e4c676ee16cecdea20c7e5) Original commit changeset: 0d572a3636fe fbshipit-source-id: d558d8052924d937d86db7dd40dc6388e6d28823 --- aten/src/ATen/ThreadLocalState.cpp | 1 - aten/src/ATen/ThreadLocalState.h | 24 +---- aten/src/ATen/core/dispatch/Dispatcher.h | 81 +++++++--------- aten/src/ATen/record_function.cpp | 114 +++++------------------ aten/src/ATen/record_function.h | 27 +----- binaries/record_function_benchmark.cc | 101 ++++++++++---------- torch/csrc/autograd/function.h | 39 ++++---- torch/csrc/jit/runtime/interpreter.cpp | 5 +- 8 files changed, 127 insertions(+), 265 deletions(-) diff --git a/aten/src/ATen/ThreadLocalState.cpp b/aten/src/ATen/ThreadLocalState.cpp index 3c7b9b6ff5bc..6d74e2f47ce0 100644 --- a/aten/src/ATen/ThreadLocalState.cpp +++ b/aten/src/ATen/ThreadLocalState.cpp @@ -19,7 +19,6 @@ ThreadLocalState::ThreadLocalState(bool keep_grad_mode) grad_mode_enabled_ = GradMode::is_enabled(); } #endif - bumped_record_all_functions_ = at::checkRecordAllFunctions(); } /* static */ diff --git a/aten/src/ATen/ThreadLocalState.h b/aten/src/ATen/ThreadLocalState.h index 3c9b55b3d8d6..f0cb85f0ff84 100644 --- a/aten/src/ATen/ThreadLocalState.h +++ b/aten/src/ATen/ThreadLocalState.h @@ -38,9 +38,6 @@ class TORCH_API ThreadLocalState { bool grad_mode_enabled_; #endif - // Whether pre-sampling RecordFunction optimization was enabled - bool bumped_record_all_functions_ = false; - friend class ThreadLocalStateGuard; }; @@ -48,21 +45,7 @@ class TORCH_API ThreadLocalState { class TORCH_API ThreadLocalStateGuard { public: explicit ThreadLocalStateGuard(const ThreadLocalState& state) - : prev_state_(ThreadLocalState()), - bumped_record_all_functions_(state.bumped_record_all_functions_) { - // Special handling of RecordFunction pre-sampling optimization: - // pre-samping is enabled (bumped) when there're non-sampled - // (or high-frequency) global or TLS callbacks. - // - // ThreadLocalStateGuard simply resets RecordFunction's TLS and - // hence its thread local callbacks. 
- // - // Checking if the pre-sampling was enabled and preserving it in the - // async task by calling bumpRecordAllFunctions() and the corresponding - // releaseRecordAllFunctions() - if (bumped_record_all_functions_) { - at::bumpRecordAllFunctions(); - } + : prev_state_(ThreadLocalState()) { // set the given state across the thread boundary ThreadLocalState::setThreadLocalState(state); } @@ -70,15 +53,10 @@ class TORCH_API ThreadLocalStateGuard { ~ThreadLocalStateGuard() { // restore previously set variables ThreadLocalState::setThreadLocalState(prev_state_); - if (bumped_record_all_functions_) { - at::releaseRecordAllFunctions(); - } } private: const ThreadLocalState prev_state_; - // Whether pre-sampling RecordFunction optimization was enabled - bool bumped_record_all_functions_ = false; }; template diff --git a/aten/src/ATen/core/dispatch/Dispatcher.h b/aten/src/ATen/core/dispatch/Dispatcher.h index f83302e2d819..632739053c42 100644 --- a/aten/src/ATen/core/dispatch/Dispatcher.h +++ b/aten/src/ATen/core/dispatch/Dispatcher.h @@ -371,39 +371,28 @@ inline Return Dispatcher::callWithDispatchKey(const TypedOperatorHandleop.lookup(dispatchKey); #ifndef PYTORCH_DISABLE_PER_OP_PROFILING - // By default, when there're no high-frequency or non-sampled callbacks, - // RecordFunction is pre-sampled as a perf optimization; - // shouldRunRecordFunction checks whether RecordFunction should be executed, - // and sets pre_sampled boolean argument value to whether pre-sampling was used - - // this boolean is passed into RecordFunction to adjust the sampling rates of - // the callbacks - bool pre_sampled = false; - if (C10_UNLIKELY(at::shouldRunRecordFunction(&pre_sampled))) { - // Check if we need to run callbacks registered with RecordFunction - // If true and callbacks need inputs, we box the arguments and pass - // them into the callbacks and also into the kernel call - - // Note: for perf reasons we wouldn't want to pass arguments into - // the function call or prematurely box them - at::RecordFunction guard(at::RecordScope::FUNCTION, pre_sampled); - if (C10_UNLIKELY(guard.isActive())) { - if (shouldRecord(dispatchKey) && op.operatorIterator_->op.isObserved()) { - int64_t seq_num = -1; - // Setting sequence number in the Autograd case to associate - // the forward range with the coresponding Autograd's node - if (isIncludedInAlias(dispatchKey, DispatchKey::Autograd) && at::GradMode::is_enabled()) { - seq_num = at::sequence_number::peek(); - } - if (guard.needsInputs()) { - torch::jit::Stack stack = impl::boxArgs(args...); - guard.before(op, stack, seq_num); - } else { - guard.before(op, seq_num); - } + // Check if we need to run callbacks registered with RecordFunction + // If true and callbacks need inputs, we box the arguments and pass + // them into the callbacks and also into the kernel call + + // Note: for perf reasons we wouldn't want to pass arguments into + // the function call or prematurely box them + at::RecordFunction guard(at::RecordScope::FUNCTION); + if (C10_UNLIKELY(guard.isActive())) { + if (shouldRecord(dispatchKey) && op.operatorIterator_->op.isObserved()) { + int64_t seq_num = -1; + // Setting sequence number in the Autograd case to associate + // the forward range with the coresponding Autograd's node + if (isIncludedInAlias(dispatchKey, DispatchKey::Autograd) && at::GradMode::is_enabled()) { + seq_num = at::sequence_number::peek(); + } + if (guard.needsInputs()) { + torch::jit::Stack stack = impl::boxArgs(args...); + guard.before(op, stack, seq_num); + } else { + 
guard.before(op, seq_num); } } - // keeping the guard alive while executing the kernel - return kernel.template call(op, std::forward(args)...); } #endif // PYTORCH_DISABLE_PER_OP_PROFILING return kernel.template call(op, std::forward(args)...); @@ -440,26 +429,20 @@ inline void Dispatcher::callBoxed(const OperatorHandle& op, Stack* stack) const const auto& kernel = entry.lookup(dispatchKey); #ifndef PYTORCH_DISABLE_PER_OP_PROFILING - bool pre_sampled = false; - if (C10_UNLIKELY(at::shouldRunRecordFunction(&pre_sampled))) { - // using already existing stack to record function execution in observers - at::RecordFunction guard(at::RecordScope::FUNCTION, pre_sampled); - if (C10_UNLIKELY(guard.isActive())) { - if (shouldRecord(dispatchKey) && entry.isObserved()) { - int64_t seq_num = -1; - if (isIncludedInAlias(dispatchKey, DispatchKey::Autograd) && at::GradMode::is_enabled()) { - seq_num = at::sequence_number::peek(); - } - if (guard.needsInputs()) { - guard.before(op, *stack, seq_num); - } else { - guard.before(op, seq_num); - } + // using already existing stack to record function execution in observers + at::RecordFunction guard(at::RecordScope::FUNCTION); + if (C10_UNLIKELY(guard.isActive())) { + if (shouldRecord(dispatchKey) && entry.isObserved()) { + int64_t seq_num = -1; + if (isIncludedInAlias(dispatchKey, DispatchKey::Autograd) && at::GradMode::is_enabled()) { + seq_num = at::sequence_number::peek(); + } + if (guard.needsInputs()) { + guard.before(op, *stack, seq_num); + } else { + guard.before(op, seq_num); } } - // keeping the guard alive while executing the kernel - kernel.callBoxed(op, stack); - return; } #endif // PYTORCH_DISABLE_PER_OP_PROFILING kernel.callBoxed(op, stack); diff --git a/aten/src/ATen/record_function.cpp b/aten/src/ATen/record_function.cpp index 32897dd7783f..102931fd4aa7 100644 --- a/aten/src/ATen/record_function.cpp +++ b/aten/src/ATen/record_function.cpp @@ -30,6 +30,8 @@ std::atomic defaultNodeId(-1); std::atomic next_thread_id_ {0}; thread_local uint64_t current_thread_id_ = 0; +thread_local bool tls_record_function_enabled_ = true; + // Low probability constant static const double kLowProb = 0.001; struct CoinflipTLS { @@ -66,10 +68,6 @@ void set_record_function_tls_(const RecordFunctionTLS& tls) { class CallbackManager { public: CallbackHandle addThreadLocalCallback(RecordFunctionCallback cb) { - if (cb.samplingProb() > kLowProb) { - // pre-sampling of RecordFunction with prob. kLowProb cannot be used - at::bumpRecordAllFunctions(); - } // note: monotonically increasing callbacks_unique_id keeps // sorted_tls_callbacks_ sorted auto handle = next_unique_callback_handle(); @@ -78,10 +76,6 @@ class CallbackManager { } CallbackHandle addGlobalCallback(RecordFunctionCallback cb) { - if (cb.samplingProb() > kLowProb) { - // pre-sampling of RecordFunction with prob. kLowProb cannot be used - at::bumpRecordAllFunctions(); - } auto handle = next_unique_callback_handle(); sorted_global_callbacks_.emplace_back(std::move(cb), handle); return handle; @@ -98,10 +92,6 @@ class CallbackManager { return el.second == handle; }); if (it != cbs.end()) { - if (it->first.samplingProb() > kLowProb) { - // try to restore pre-sampling of RecordFunction - at::releaseRecordAllFunctions(); - } // keeps it sorted cbs.erase(it); return true; @@ -137,13 +127,7 @@ class CallbackManager { // callbackShouldRun is even hotter because it's called multiple // times per init(). Profiling shows that the function prologue is // taking up a significant fraction of the time. 
- static bool C10_ALWAYS_INLINE callbackShouldRun( - const RecordFunctionCallback& cb, RecordScope scope, bool pre_sampled) { - TORCH_INTERNAL_ASSERT( - !pre_sampled || (cb.sampling_prob_ <= kLowProb), - "Incorrect usage of a pre-sampled RecordFunction with a high-frequency " - " or non-sampled callback"); - + static bool C10_ALWAYS_INLINE callbackShouldRun(const RecordFunctionCallback& cb, RecordScope scope) { // first check whether this callback is interested in // the given scope type if (!cb.checkScope(scope)) { @@ -154,45 +138,36 @@ class CallbackManager { return cb.should_run_(cb); } - // otherwise potentially do the sampling - double sampling_prob = cb.sampling_prob_; - if (pre_sampled) { - // adjust the sampling rate to account for kLowProb pre-sampling of - // the RecordFunction - sampling_prob /= kLowProb; + if (cb.sampling_prob_ == 1.0) { + return true; } - - if (sampling_prob < 1.0) { - // model the low probability events as events happening - // with probability kLowProb followed by another sampling with - // probability (sampling_prob / kLowProb), then replace the coin - // flip for kLowProb with a thread local number of tries tries_left_ - // sampled from the geometric distribution. - if (sampling_prob < kLowProb) { - if (coinflip_tls_.tries_left_ == 0) { - coinflip_tls_.tries_left_ = sample_geometric(); - return (sample_zero_one() < sampling_prob / kLowProb); - } else { - --coinflip_tls_.tries_left_; - return false; - } + // model the low probability events as events happening + // with probability kLowProb followed by another sampling with + // probability (sampling_prob__ / kLowProb), then replace the coin + // flip for kLowProb with a thread local number of tries tries_left_ + // sampled from the geometric distribution. + if (cb.sampling_prob_ < kLowProb) { + if (coinflip_tls_.tries_left_ == 0) { + coinflip_tls_.tries_left_ = sample_geometric(); + return (sample_zero_one() < cb.sampling_prob_ / kLowProb); } else { - return (sample_zero_one() < sampling_prob); + --coinflip_tls_.tries_left_; + return false; } + } else { + return (sample_zero_one() < cb.sampling_prob_); } - - return true; } // init is called by RecordFunction in constructor to // determine which thread local and global callbacks are going // to be executed and whether any of them need inputs - inline void init(RecordFunction& rec_fn, RecordScope scope, bool pre_sampled) { + inline void init(RecordFunction& rec_fn, RecordScope scope) { bool found_needs_inputs = false; bool found_needs_ids = false; for (const auto& cb: rf_tls_.sorted_tls_callbacks_) { - if (callbackShouldRun(cb.first, scope, pre_sampled)) { + if (callbackShouldRun(cb.first, scope)) { if (cb.first.needsInputs()) { found_needs_inputs = true; } @@ -207,7 +182,7 @@ class CallbackManager { } for (const auto& cb: sorted_global_callbacks_) { - if (callbackShouldRun(cb.first, scope, pre_sampled)) { + if (callbackShouldRun(cb.first, scope)) { if (cb.first.needsInputs()) { found_needs_inputs = true; } @@ -333,6 +308,7 @@ namespace { } } // namespace + RecordFunctionCallbacks _getTLSCallbacks() { return rf_tls_.sorted_tls_callbacks_; } @@ -398,12 +374,12 @@ void enableRecordFunction(bool enable) { rf_tls_.tls_record_function_enabled_ = enable; } -RecordFunction::RecordFunction(RecordScope scope, bool pre_sampled) { +RecordFunction::RecordFunction(RecordScope scope) { auto* rf_tls_ptr = &rf_tls_; if (rf_tls_ptr->tls_record_function_enabled_) { auto& m = manager(); if (!m.sorted_global_callbacks_.empty() || !rf_tls_ptr->sorted_tls_callbacks_.empty()) { - 
m.init(*this, scope, pre_sampled); + m.init(*this, scope); } } } @@ -475,46 +451,4 @@ void RecordFunction::end() { } } -// RecordFunction pre-sampling -namespace { -// Whether to try to create RecordFunction on each call (>0) or -// use pre-sampling (=0) -std::atomic global_record_all_functions_ {0}; -} - -void bumpRecordAllFunctions() { - global_record_all_functions_.fetch_add(1, std::memory_order_relaxed); -} - -void releaseRecordAllFunctions() { - TORCH_CHECK(global_record_all_functions_.fetch_sub(1, std::memory_order_relaxed) >= 0); -} - -bool checkRecordAllFunctions() { - return (global_record_all_functions_.load(std::memory_order_relaxed) > 0); -} - -bool shouldRunRecordFunction(bool* pre_sampled) { - auto* rf_tls_ptr = &rf_tls_; - if (!rf_tls_ptr->tls_record_function_enabled_) { - *pre_sampled = false; - return false; - } - - if (global_record_all_functions_.load(std::memory_order_relaxed) > 0) { - *pre_sampled = false; - return true; - } - - *pre_sampled = true; - auto* coinflip_tls_ptr = &coinflip_tls_; - if (coinflip_tls_ptr->tries_left_ == 0) { - coinflip_tls_ptr->tries_left_ = sample_geometric(); - return true; - } else { - --coinflip_tls_ptr->tries_left_; - return false; - } -} - } // namespace at diff --git a/aten/src/ATen/record_function.h b/aten/src/ATen/record_function.h index 843e3850d498..4b07d13aa747 100644 --- a/aten/src/ATen/record_function.h +++ b/aten/src/ATen/record_function.h @@ -90,11 +90,8 @@ typedef uint64_t RecordFunctionHandle; struct TORCH_API RecordFunction { // Default constructor is used with before function called afterwards: // scope - record scope that this function tracks - // pre_sampled - whether this RecordFunction was already pre-sampled with - // kLowProb probability RecordFunction( - RecordScope scope = RecordScope::FUNCTION, - bool pre_sampled = false); + RecordScope scope = RecordScope::FUNCTION); template void before( @@ -241,9 +238,6 @@ struct TORCH_API RecordFunction { // flag is used to check whether the start callbacks were called bool called_start_callbacks_ = false; - // Whether the RecordFunction is pre-sampled - bool pre_sampled_ = false; - // Used internally to keep track of thread local and global callbacks // that were picked to run; must be sorted; CallbackHandles sorted_active_tls_handles_; @@ -336,7 +330,7 @@ class TORCH_API RecordFunctionCallback { } RecordFunctionCallback& samplingProb(double sampling_prob) { - TORCH_CHECK(sampling_prob >= 0.0 && sampling_prob <= 1.0, + TORCH_CHECK(sampling_prob >= 0.0 && sampling_prob_ <= 1.0, "Invalid sampling probability"); sampling_prob_ = sampling_prob; return *this; @@ -550,27 +544,10 @@ struct TORCH_API RecordFunctionTLS { RecordFunctionCallbacks sorted_tls_callbacks_; bool tls_record_function_enabled_ = true; - - // Stores the number of coin flips before the next successful coin flip - int tries_left_ = 0; }; TORCH_API const RecordFunctionTLS& get_record_function_tls_(); TORCH_API void set_record_function_tls_(const RecordFunctionTLS& tls); -// Checks whether RecordFunction should be called, -// sets boolean pointed by the argument to whether pre-sampling was used -TORCH_API bool shouldRunRecordFunction(bool*); - -// The following functions are used to disable/enable pre-sampling of RecordFunction -// when high-frequency/non-sampled callbacks are added/removed. -// Note: every call to bumpRecordAllFunctions() is supposed to be matched with -// the corresponding releaseRecordAllFunctions() call. 
-// Note: disabling pre-sampling of RecordFunction incurs an extra overhead, since -// RecordFunction will be created for each operator call. -TORCH_API void bumpRecordAllFunctions(); -TORCH_API void releaseRecordAllFunctions(); -TORCH_API bool checkRecordAllFunctions(); - } // namespace at diff --git a/binaries/record_function_benchmark.cc b/binaries/record_function_benchmark.cc index 53a8bd16f43d..d924003b9270 100644 --- a/binaries/record_function_benchmark.cc +++ b/binaries/record_function_benchmark.cc @@ -7,55 +7,61 @@ #include #include -C10_DEFINE_int(iter, 10000, "Number of iterations"); +C10_DEFINE_int(iter, 100, "Number of iterations"); +C10_DEFINE_int(warmup_iter, 10, "Number of warmup iterations"); C10_DEFINE_int(sampled_iter, 10e6, "Number of iterations for the sampled observer benchmark"); namespace { +const int kInnerIter = 100; +const int kNumSampledCb = 2; const int kTensorSize = 16; const int kSmallTensorSize = 1; +const float kSampingProb = 0.1; + const float kLowSamplingProb = 0.0001; } -void addTestCallback( - double sampling_prob = 1.0, - std::function fn = - [](const at::RecordFunction&) {}) { - auto cb = at::RecordFunctionCallback( - std::move(fn), +void setupBenchmarkCallbacks() { + at::enableRecordFunction(); + at::clearCallbacks(); + // non-sampled callback + at::addGlobalCallback(at::RecordFunctionCallback( + [&](const at::RecordFunction& fn) {}, [](const at::RecordFunction&) {}) - .needsInputs(false); - if (sampling_prob < 1.0) { - cb.samplingProb(sampling_prob); + .needsInputs(true)); + + // sampled + for (auto idx = 0; idx < kNumSampledCb; ++idx) { + at::addGlobalCallback(at::RecordFunctionCallback( + [](const at::RecordFunction& fn) {}, + [](const at::RecordFunction&) {}) + .needsInputs(true) + .samplingProb(kSampingProb) + ); } - at::addGlobalCallback(cb); } -float runTensorGEMMBench(int tensor_size, int iter) { +float runTensorBench(int tensor_size, int outer_iter) { typedef std::chrono::high_resolution_clock clock; typedef std::chrono::microseconds us; std::chrono::time_point start_time = clock::now(); - auto inp = torch::randn({tensor_size, tensor_size}); - for (auto idx = 0; idx < iter; ++idx) { - torch::mm(inp, inp); + for (auto idx = 0; idx < kInnerIter * outer_iter; ++idx) { + torch::mm( + torch::randn({tensor_size, tensor_size}), + torch::randn({tensor_size, tensor_size})); } auto duration = static_cast( std::chrono::duration_cast(clock::now() - start_time).count()); return duration; } -float runPureRecordFunctionBench(int iter) { +float runPureRecordFunctionBench(int outer_iter) { typedef std::chrono::high_resolution_clock clock; typedef std::chrono::microseconds us; std::chrono::time_point start_time = clock::now(); - for (auto idx = 0; idx < iter; ++idx) { - bool pre_sampled = false; - if (at::shouldRunRecordFunction(&pre_sampled)) { - at::RecordFunction guard(at::RecordScope::USER_SCOPE, pre_sampled); - if (C10_UNLIKELY(guard.isActive())) { - guard.before("Test", -1); - } - } + for (auto n = 0; n < outer_iter; ++n) { + RECORD_USER_SCOPE("test"); } auto duration = static_cast( std::chrono::duration_cast(clock::now() - start_time).count()); @@ -65,19 +71,18 @@ float runPureRecordFunctionBench(int iter) { void runBenchmark() { float duration = 0; for (auto tensor_size : std::set({kSmallTensorSize, kTensorSize})) { - duration = runTensorGEMMBench(tensor_size, FLAGS_iter); - std::cout << "Tensor GEMM benchmark (" + duration = runTensorBench(tensor_size, FLAGS_iter); + std::cout << "Running tensor benchmark, time per iteration (" << tensor_size << "x" 
<< tensor_size - << ", " << FLAGS_iter << "): " << duration + << "): " << (duration/FLAGS_iter) << " us." << std::endl; } - duration = runPureRecordFunctionBench(FLAGS_iter); - std::cout << "Pure RecordFunction benchmark (" - << FLAGS_iter << "): " - << duration - << " us." << std::endl; + duration = runPureRecordFunctionBench(FLAGS_iter * 100); + std::cout << "Running pure RecordFunction benchmark, time per iteration: " + << (duration/FLAGS_iter) + << " us." << std::endl; } int main(int argc, char** argv) { @@ -86,38 +91,32 @@ int main(int argc, char** argv) { return -1; } - at::enableRecordFunction(); - at::clearCallbacks(); - - std::cout << "Warm up" << std::endl; - runBenchmark(); + auto duration = runTensorBench(kSmallTensorSize, FLAGS_warmup_iter); + std::cout << "Warmup time: " << duration << " us." << std::endl; - std::cout << "Running without observers" << std::endl; + setupBenchmarkCallbacks(); + std::cout << "Running with empty observers" << std::endl; runBenchmark(); - addTestCallback(); - std::cout << "Running with empty non-sampled observer" << std::endl; - runBenchmark(); at::clearCallbacks(); - - addTestCallback(kLowSamplingProb); - std::cout << "Running with empty sampled observer" << std::endl; + std::cout << "Running without observers" << std::endl; runBenchmark(); - at::clearCallbacks(); - std::cout << "Checking number of sampled observer invocations" << std::endl; + std::cout << "Running sampled observer benchmark" << std::endl; int cb_count = 0; - addTestCallback( - kLowSamplingProb, + at::addGlobalCallback(at::RecordFunctionCallback( [&](const at::RecordFunction& fn) { ++cb_count; - } + }, + [](const at::RecordFunction&) {}) + .needsInputs(true) + .samplingProb(kLowSamplingProb) ); - auto duration = runPureRecordFunctionBench(FLAGS_sampled_iter); + runPureRecordFunctionBench(FLAGS_sampled_iter); std::cout << "Pure RecordFunction runtime of " << FLAGS_sampled_iter - << " iterations: " << duration + << " iterations " << duration << " us, number of callback invocations: " << cb_count << ", expected number: ~" << (int)(FLAGS_sampled_iter * kLowSamplingProb) << " invocations" << std::endl; diff --git a/torch/csrc/autograd/function.h b/torch/csrc/autograd/function.h index 44171e1a3b1b..09dc048f214b 100644 --- a/torch/csrc/autograd/function.h +++ b/torch/csrc/autograd/function.h @@ -133,33 +133,26 @@ struct TORCH_API Node : std::enable_shared_from_this { /// Evaluates the function on the given inputs and returns the result of the /// function call. variable_list operator()(variable_list&& inputs) { + // Using RecordFunction to trogger observers in the backward pass + at::RecordFunction guard(at::RecordScope::BACKWARD_FUNCTION); + if (guard.isActive()) { + // Using sequence number and thread id to correlate with + // the forward pass function + guard.setForwardThreadId(thread_id_); + if (guard.needsInputs()) { + guard.before( + name(), + std::vector(inputs.begin(), inputs.end()), + sequence_nr()); + } else { + guard.before(name(), sequence_nr()); + } + } // In the first iteration of named tensors, autograd ignores names and // operates on unnamed tensors. In the long term, autograd should // probably operate with names. 
at::NoNamesGuard no_names_guard; - - bool pre_sampled = false; - if (at::shouldRunRecordFunction(&pre_sampled)) { - // Using RecordFunction to trogger observers in the backward pass - at::RecordFunction guard(at::RecordScope::BACKWARD_FUNCTION, pre_sampled); - if (guard.isActive()) { - // Using sequence number and thread id to correlate with - // the forward pass function - guard.setForwardThreadId(thread_id_); - if (guard.needsInputs()) { - guard.before( - name(), - std::vector(inputs.begin(), inputs.end()), - sequence_nr()); - } else { - guard.before(name(), sequence_nr()); - } - } - // keeping stack guard object alive during the call - return apply(std::move(inputs)); - } else { - return apply(std::move(inputs)); - } + return apply(std::move(inputs)); } // Graph Connectivity API diff --git a/torch/csrc/jit/runtime/interpreter.cpp b/torch/csrc/jit/runtime/interpreter.cpp index d07da80e4cd3..ef0f2dae9e0e 100644 --- a/torch/csrc/jit/runtime/interpreter.cpp +++ b/torch/csrc/jit/runtime/interpreter.cpp @@ -1607,11 +1607,10 @@ struct InterpreterStateImpl : c10::intrusive_ptr_target { } static void checkAndStartRecordFunction(Frame& frame, Stack& stack) { - bool pre_sampled = false; if (!frame.record_function && at::hasCallbacks() && - at::shouldRunRecordFunction(&pre_sampled)) { + at::isRecordFunctionEnabled()) { auto rec_fn = std::make_unique( - at::RecordScope::TORCHSCRIPT_FUNCTION, pre_sampled); + at::RecordScope::TORCHSCRIPT_FUNCTION); if (rec_fn->isActive()) { if (rec_fn->needsInputs()) { rec_fn->before( From 492580b855a1e2c5d339a7468314e30bb6c81537 Mon Sep 17 00:00:00 2001 From: Bert Maher Date: Wed, 9 Dec 2020 10:08:09 -0800 Subject: [PATCH 073/250] [te] Remove vestigial __init__.py from test/cpp/tensorexpr (#49061) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49061 We don't use the python harness for cpp tests anymore. ghstack-source-id: 118140485 Test Plan: Careful thinking. 
Reviewed By: navahgar Differential Revision: D25410290 fbshipit-source-id: 879e3c6fb296298d567e1d70b18bde96b5cac90d --- test/cpp/tensorexpr/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 test/cpp/tensorexpr/__init__.py diff --git a/test/cpp/tensorexpr/__init__.py b/test/cpp/tensorexpr/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 From b3ab25aefaffa3f14031a9ddf92e6a0451ca70e7 Mon Sep 17 00:00:00 2001 From: kshitij12345 Date: Wed, 9 Dec 2020 10:08:24 -0800 Subject: [PATCH 074/250] [numpy] `torch.cosh`: promote integer inputs to float (#48923) Summary: Reference: https://github.com/pytorch/pytorch/issues/42515 Pull Request resolved: https://github.com/pytorch/pytorch/pull/48923 Reviewed By: zhangguanheng66 Differential Revision: D25393679 Pulled By: mruberry fbshipit-source-id: 2151ee0467b50175f84ac492c219a46ef6bd66c3 --- aten/src/ATen/native/UnaryOps.cpp | 4 ++-- aten/src/ATen/native/cuda/UnaryGeometricKernels.cu | 2 +- torch/csrc/jit/tensorexpr/kernel.cpp | 5 +++-- torch/testing/_internal/common_methods_invocations.py | 9 +++++++-- 4 files changed, 13 insertions(+), 7 deletions(-) diff --git a/aten/src/ATen/native/UnaryOps.cpp b/aten/src/ATen/native/UnaryOps.cpp index 900f5ee72f7a..9522d2a1e271 100644 --- a/aten/src/ATen/native/UnaryOps.cpp +++ b/aten/src/ATen/native/UnaryOps.cpp @@ -347,8 +347,8 @@ Tensor& sinh_out(Tensor& result, const Tensor& self) { return unary_op_impl_floa Tensor sinh(const Tensor& self) { return unary_op_impl_float(self, sinh_stub); } Tensor& sinh_(Tensor& self) { return unary_op_impl_(self, at::sinh_out); } -Tensor& cosh_out(Tensor& result, const Tensor& self) { return unary_op_impl_out(result, self, cosh_stub); } -Tensor cosh(const Tensor& self) { return unary_op_impl(self, at::cosh_out); } +Tensor& cosh_out(Tensor& result, const Tensor& self) { return unary_op_impl_float_out(result, self, cosh_stub); } +Tensor cosh(const Tensor& self) { return unary_op_impl_float(self, cosh_stub); } Tensor& cosh_(Tensor& self) { return unary_op_impl_(self, at::cosh_out); } Tensor& acosh_out(Tensor& result, const Tensor& self) { return unary_op_impl_float_out(result, self, acosh_stub); } diff --git a/aten/src/ATen/native/cuda/UnaryGeometricKernels.cu b/aten/src/ATen/native/cuda/UnaryGeometricKernels.cu index 2488528f5e2c..867855217092 100644 --- a/aten/src/ATen/native/cuda/UnaryGeometricKernels.cu +++ b/aten/src/ATen/native/cuda/UnaryGeometricKernels.cu @@ -59,7 +59,7 @@ void sinh_kernel_cuda(TensorIterator& iter) { } void cosh_kernel_cuda(TensorIterator& iter) { - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(ScalarType::Half, iter.dtype(), "cosh_cuda", [&]() { + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(ScalarType::Half, iter.common_dtype(), "cosh_cuda", [&]() { gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { return ::cosh(a); }); diff --git a/torch/csrc/jit/tensorexpr/kernel.cpp b/torch/csrc/jit/tensorexpr/kernel.cpp index 50f285104d95..ea28264246ec 100644 --- a/torch/csrc/jit/tensorexpr/kernel.cpp +++ b/torch/csrc/jit/tensorexpr/kernel.cpp @@ -1143,8 +1143,9 @@ Tensor* TensorExprKernel::computeValue(const torch::jit::Value* v) { } break; case aten::cosh: { - return computeOneOperand( - "aten_cosh", v, [](const ExprHandle& a) { return cosh(a); }); + return computeOneOperand("aten_cosh", v, [](const ExprHandle& a) { + return cosh(promoteIntegerToFloat(a)); + }); } break; case aten::sinh: { diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py 
index 26be9c9fde3a..9fc31f6caed4 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -518,10 +518,15 @@ def sample_inputs(self, device, dtype, requires_grad=False): dtypes=[torch.float], active_if=TEST_WITH_ROCM), )), UnaryUfuncInfo('cosh', - ref=np.cosh, - dtypesIfCPU=floating_and_complex_types(), + ref=np_unary_ufunc_integer_promotion_wrapper(np.cosh), + dtypesIfCPU=all_types_and_complex_and(torch.bool), + dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.half), + promotes_integers_to_float=True, assert_autodiffed=True, skips=( + # Reference: https://github.com/pytorch/pytorch/issues/48641 + SkipInfo('TestUnaryUfuncs', 'test_reference_numerics', + device_type='cpu', dtypes=[torch.int8]), SkipInfo('TestUnaryUfuncs', 'test_reference_numerics', dtypes=[torch.cfloat, torch.cdouble], active_if=IS_WINDOWS), SkipInfo('TestUnaryUfuncs', 'test_reference_numerics', device_type='cpu', From 41fd51d7d8292a5a4941a33eea7e2ea70467e86a Mon Sep 17 00:00:00 2001 From: Martin Yuan Date: Wed, 9 Dec 2020 10:36:03 -0800 Subject: [PATCH 075/250] [PyTorch] Reference to c10::GetCPUAllocator() directly (#49068) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49068 TH folder has some kernal implementations referenced by ATen/native. It goes with ATen/native in the follow-up diff for per-app selective build. ATen/Context.cpp stays in the lib level and should not reference to symbols in TH directly. It's a simple change in this diff, as ```getTHDefaultAllocator()``` did nothing but returns ```c10::GetCPUAllocator()```. Use ```c10::GetCPUAllocator()``` instead of going extra route through ```getTHDefaultAllocator()```. ghstack-source-id: 118151905 Test Plan: CI Reviewed By: dhruvbird Differential Revision: D24147914 fbshipit-source-id: 37efb43adc9b491c365df0910234fa6a8a34ec25 --- aten/src/ATen/Context.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp index 1977f945a0fb..e17322e1681d 100644 --- a/aten/src/ATen/Context.cpp +++ b/aten/src/ATen/Context.cpp @@ -3,6 +3,7 @@ #include #include +#include #include #include @@ -232,7 +233,7 @@ bool Context::setFlushDenormal(bool on) { } Allocator* getCPUAllocator() { - return getTHDefaultAllocator(); + return c10::GetCPUAllocator(); } // override_allow_tf32_flag = true From e5a98c5ab03ba168ccb37edb2039a7da85c71620 Mon Sep 17 00:00:00 2001 From: BowenBao Date: Wed, 9 Dec 2020 11:35:49 -0800 Subject: [PATCH 076/250] [ONNX] Remove usage of isCompleteTensor() in symbolic functions (#48162) Summary: `isCompleteTensor()` only returns true when both scalar type and shape is present. All dimensions in the shape must be static. This high requirement is unnecessary for many use cases such as when only rank or scalar type needs to be known. 
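As a rough illustration of the coverage this unlocks (a minimal standalone sketch, not part of the patch itself): the symbolic functions switch to the new `sym_help._get_tensor_rank` / `_get_tensor_sizes` / `_get_tensor_dim_size` helpers, so an export like the `test_prelu` case added in this PR can succeed even when individual dimensions are dynamic and `isCompleteTensor()` would return false. The `opset_version` and in-memory buffer below are arbitrary choices for the example.

```python
import io
import torch

class PReluModel(torch.nn.Module):
    def __init__(self):
        super(PReluModel, self).__init__()
        self.prelu = torch.nn.PReLU()

    def forward(self, x):
        return self.prelu(x)

x = torch.randn(2, 3, 4)
f = io.BytesIO()
# Dims 1 and 2 of `x` are marked dynamic, so only the rank (and dtype) of the
# input is known at export time; the prelu symbolic now only asks for the rank.
torch.onnx.export(PReluModel(), x, f,
                  input_names=['x'],
                  dynamic_axes={'x': [1, 2]},
                  opset_version=11)
```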
Pull Request resolved: https://github.com/pytorch/pytorch/pull/48162 Reviewed By: malfet Differential Revision: D25340823 Pulled By: bzinodev fbshipit-source-id: 1fef61f44918f4339dd6654fb725b18cd58d99cf --- test/onnx/test_pytorch_onnx_onnxruntime.py | 45 +++- torch/csrc/jit/python/python_ir.cpp | 10 + torch/onnx/symbolic_helper.py | 28 ++- torch/onnx/symbolic_opset10.py | 7 +- torch/onnx/symbolic_opset11.py | 60 +++--- torch/onnx/symbolic_opset8.py | 9 +- torch/onnx/symbolic_opset9.py | 235 +++++++++++++-------- 7 files changed, 262 insertions(+), 132 deletions(-) diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py index e2e12af88c1e..61c0fd9dc384 100644 --- a/test/onnx/test_pytorch_onnx_onnxruntime.py +++ b/test/onnx/test_pytorch_onnx_onnxruntime.py @@ -754,7 +754,10 @@ def forward(self, x): return x.transpose(0, 1) x = torch.randn(32, 3, 64, 64) - self.run_test(TransposeModule(), x) + y = torch.randn(16, 3, 8, 64) + self.run_test(TransposeModule(), x, input_names=['x'], + dynamic_axes={'x': [0, 2]}, + test_with_inputs=[y]) def squeeze_model_tests(self, d, x1, x2): class Squeeze(torch.nn.Module): @@ -841,7 +844,10 @@ def forward(self, x): def test_maxpool_adaptive(self): model = torch.nn.AdaptiveMaxPool1d((5), return_indices=False) x = torch.randn(20, 16, 50, requires_grad=True) - self.run_test(model, x) + y = torch.randn(32, 16, 50, requires_grad=True) + self.run_test(model, x, input_names=['x'], + dynamic_axes={'x' : [0]}, + test_with_inputs=[y]) def test_maxpool_2d(self): model = torch.nn.MaxPool2d(5, padding=(1, 2)) @@ -903,7 +909,10 @@ def test_avgpool_2d_ceil(self): def test_avgpool_3d_ceil(self): model = torch.nn.AvgPool3d(3, 2, ceil_mode=True) x = torch.randn(20, 16, 50, 44, 31) - self.run_test(model, x) + y = torch.randn(32, 8, 50, 44, 31) + self.run_test(model, x, input_names=['x'], + dynamic_axes={'x' : [0, 1]}, + test_with_inputs=[y]) @skipIfUnsupportedMinOpsetVersion(9) def test_floating_point(self): @@ -3809,7 +3818,11 @@ def forward(self, x): return x.unfold(dimension=2, size=2, step=2) x = torch.randn(4, 2, 3, requires_grad=True) - self.run_test(UnfoldModel(), x) + y = torch.randn(2, 1, 3, requires_grad=True) + self.run_test(UnfoldModel(), x, + dynamic_axes={'x': [0, 1]}, + input_names=['x'], + test_with_inputs=[y]) @skipIfONNXShapeInference(False) def test_unfold_infer_shape(self): @@ -3826,6 +3839,21 @@ def forward(self, x): x = torch.randn(32, 3, 64) self.run_test(UnfoldModule(), x) + def test_prelu(self): + class PReluModel(torch.nn.Module): + def __init__(self): + super(PReluModel, self).__init__() + self.prelu = torch.nn.PReLU() + + def forward(self, x): + return self.prelu(x) + + x = torch.randn(2, 3, 4) + y = torch.randn(2, 4, 5) + self.run_test(PReluModel(), x, input_names=['x'], + dynamic_axes={'x': [1, 2]}, + test_with_inputs=[y]) + def test_remainder(self): class RemainderModel(torch.nn.Module): def forward(self, input, other): @@ -3862,6 +3890,15 @@ def forward(self, input): x = torch.randint(10, (2, 3)) self.run_test(FModModel(), x) + @skipIfUnsupportedMinOpsetVersion(9) + def test_glu(self): + class GluModel(torch.nn.Module): + def forward(self, x): + return torch.nn.functional.glu(x) + + x = torch.randn(2, 4, 5, 6, requires_grad=True) + self.run_test(GluModel(), x) + @skipIfUnsupportedMinOpsetVersion(9) def test_gelu(self): class GeluModel(torch.nn.Module): diff --git a/torch/csrc/jit/python/python_ir.cpp b/torch/csrc/jit/python/python_ir.cpp index f5cdea1e7eb4..6e68fe9ebec3 100644 --- 
a/torch/csrc/jit/python/python_ir.cpp +++ b/torch/csrc/jit/python/python_ir.cpp @@ -697,6 +697,16 @@ void initPythonIRBindings(PyObject* module_) { } return py::none(); }) + .def( + "varyingSizes", + [](Type& t) -> py::object { + if (auto ptt = t.expect()) { + if (auto s = ptt->sizes().sizes()) { + return py::cast(s.value()); + } + } + return py::none(); + }) .def( "strides", [](Type& t) -> py::object { diff --git a/torch/onnx/symbolic_helper.py b/torch/onnx/symbolic_helper.py index 10250baf131a..4cc3f47a3541 100644 --- a/torch/onnx/symbolic_helper.py +++ b/torch/onnx/symbolic_helper.py @@ -178,6 +178,29 @@ def _is_tensor(x): def _is_tensor_list(x): return isinstance(x.type(), torch._C.ListType) and isinstance(x.type().getElementType(), torch._C.TensorType) +def _get_tensor_rank(x): + if not _is_tensor(x) or x.type() is None: + return None + return x.type().dim() + +def _get_tensor_sizes(x, allow_nonstatic=True): + if not _is_tensor(x) or x.type() is None: + return None + if allow_nonstatic: + # Each individual symbol is returned as None. + # e.g. [1, 'a', 'b'] -> [1, None, None] + return x.type().varyingSizes() + # returns None, if exists any symbol in sizes. + # e.g. [1, 'a', 'b'] -> None + return x.type().sizes() + +def _get_tensor_dim_size(x, dim): + try: + sizes = _get_tensor_sizes(x) + return sizes[dim] + except Exception: + pass + return None def _unimplemented(op, msg): warnings.warn("ONNX export failed on " + op + " because " + msg + " not supported") @@ -216,7 +239,7 @@ def _try_get_scalar_type(*args): def _select_helper(g, self, dim, index, apply_reshape=True): index_const = _maybe_get_scalar(index) - index_dim = index.type().dim() + index_dim = _get_tensor_rank(index) if not _is_value(index_const): # Index is a constant scalar. Make it a size 1 constant tensor. index = g.op("Constant", value_t=torch.LongTensor([index_const])) @@ -344,7 +367,8 @@ def _get_interpolate_attributes(g, mode, args): def _interpolate_get_scales(g, scale_factor, dim): offsets = g.op("Constant", value_t=torch.ones(2, dtype=torch.float32)) - if isinstance(scale_factor.type(), torch._C.ListType) or (scale_factor.isCompleteTensor() and scale_factor.type().dim() > 0): + scale_factor_rank = _get_tensor_rank(scale_factor) + if isinstance(scale_factor.type(), torch._C.ListType) or (scale_factor_rank is not None and scale_factor_rank > 0): return g.op("Concat", offsets, scale_factor, axis_i=0) else: scale_factor = _unsqueeze_helper(g, scale_factor, 0) diff --git a/torch/onnx/symbolic_opset10.py b/torch/onnx/symbolic_opset10.py index 718b30f8fde3..6558df6e3d4c 100644 --- a/torch/onnx/symbolic_opset10.py +++ b/torch/onnx/symbolic_opset10.py @@ -209,12 +209,13 @@ def embedding_bag(g, import warnings warnings.warn("Export of embedding_bag with dynamic input/offsets shape is not supported in opset 10. 
" "Please use opset 11 or higher to export model for dynamic input shape.'") - if offsets.type().sizes() is not None: + offsets_dim_0 = sym_help._get_tensor_dim_size(offsets, 0) + if offsets_dim_0 is not None: if include_last_offset: - offset_len = offsets.type().sizes()[0] - 1 + offset_len = offsets_dim_0 - 1 offsets_extended = offsets else: - offset_len = offsets.type().sizes()[0] + offset_len = offsets_dim_0 offsets_extended = [offsets, g.op("Constant", value_t=torch.tensor([maxsize]))] offsets_extended = g.op("Concat", *offsets_extended, axis_i=0) list_ = [] diff --git a/torch/onnx/symbolic_opset11.py b/torch/onnx/symbolic_opset11.py index de2acf6085a0..6e9fe3f27060 100644 --- a/torch/onnx/symbolic_opset11.py +++ b/torch/onnx/symbolic_opset11.py @@ -97,21 +97,21 @@ def index_put(g, self, indices_list_value, values, accumulate=False): # %28 : Long(requires_grad=0, device=cpu) = prim::Constant[value={0}]() # %29 : Long(requires_grad=0, device=cpu) = prim::Constant[value={0}]() # %15 : None = prim::Constant() - # %16 : Bool(2, 2, 2, strides=[4, 2, 1], requires_grad=0, device=cpu) = + # %16 : Bool(2, 2, 2, strides=[4, 2, 1], requires_grad=0, device=cpu) = # aten::to(%8, %26, %27, %11, %12, %28, %29, %15) # %18 : Float(requires_grad=0, device=cpu) = prim::Constant[value={1}]() # %30 : Long(requires_grad=0, device=cpu) = prim::Constant[value={0}]() # %22 : int[] = prim::Constant[value=[-1]]() # %23 : Tensor = aten::view(%16, %22) # %24 : Tensor?[] = prim::ListConstruct(%23) - # %25 : Float(2, 2, 2, strides=[4, 2, 1], requires_grad=0, device=cpu) = + # %25 : Float(2, 2, 2, strides=[4, 2, 1], requires_grad=0, device=cpu) = # aten::index_put(%mask, %24, %18, %30) # return (%25) # # after graph(%0 : Float(2, 2, 2, strides=[4, 2, 1], requires_grad=0, device=cpu), # %some_const : Float(requires_grad=0, device=cpu)): # %3 : Tensor = onnx::Equal(%0, %some_const) - # %4 : Bool(2, 2, 2, strides=[4, 2, 1], requires_grad=0, device=cpu) = onnx::Not(%3) + # %4 : Bool(2, 2, 2, strides=[4, 2, 1], requires_grad=0, device=cpu) = onnx::Not(%3) # %12 : Bool(2, 2, 2, strides=[4, 2, 1], requires_grad=0, device=cpu) = onnx::Cast[to=9](%4) # %19 : Tensor = onnx::Cast[to=9](%12) # %20 : Tensor = onnx::Constant[value={1}]() @@ -137,7 +137,7 @@ def index_put(g, self, indices_list_value, values, accumulate=False): # %37 : Long(requires_grad=0, device=cpu) = prim::Constant[value={0}]() # %22 : None = prim::Constant() # %23 : Bool(2, 2, 2, strides=[4, 2, 1], requires_grad=0, device=cpu) - # = aten::to(%15, %34, %35, %18, %19, %36, %37, %22) + # = aten::to(%15, %34, %35, %18, %19, %36, %37, %22) # %38 : Long(requires_grad=0, device=cpu) = prim::Constant[value={0}]() # %30 : int[] = prim::Constant[value=[-1]]() # %31 : Tensor = aten::view(%23, %30) @@ -148,7 +148,7 @@ def index_put(g, self, indices_list_value, values, accumulate=False): # # after graph(%0 : Float(2, 2, 2, strides=[4, 2, 1], requires_grad=0, device=cpu), # %some_const : Float(requires_grad=0, device=cpu)): - # %3 : Float(8, strides=[1], requires_grad=0, device=cpu) + # %3 : Float(8, strides=[1], requires_grad=0, device=cpu) # = onnx::Constant[value= 1 1 1 1 1 1 1 1 [ CPUFloatType{8} ]]() # %4 : Tensor = onnx::Equal(%0, %some_const) # %5 : Bool(2, 2, 2, strides=[4, 2, 1], requires_grad=0, device=cpu) = onnx::Not(%4) @@ -168,17 +168,17 @@ def index_put(g, self, indices_list_value, values, accumulate=False): # %32 : Tensor = onnx::Constant[value={0}]() # %33 : Tensor = onnx::Unsqueeze[axes=[0]](%32) # %34 : Tensor = onnx::Slice(%24, %30, %31, %33) - # %35 : 
Float(2, 2, 2, strides=[4, 2, 1], requires_grad=0, device=cpu) + # %35 : Float(2, 2, 2, strides=[4, 2, 1], requires_grad=0, device=cpu) # = onnx::ScatterND(%0, %22, %34) # return (%35) bool_inp = list(index.node().inputs())[0] if bool_inp.type() is not None and bool_inp.type().scalarType() == 'Bool': - if values.type() is not None: - if values.type().dim() == 0: - from torch.onnx.symbolic_opset9 import masked_fill - return masked_fill(g, self, bool_inp, values) - return masked_scatter(g, self, bool_inp, values) + rank = sym_help._get_tensor_rank(values) + if rank is not None and rank == 0: + from torch.onnx.symbolic_opset9 import masked_fill + return masked_fill(g, self, bool_inp, values) + return masked_scatter(g, self, bool_inp, values) broadcast_index_shape = g.op("Shape", index) index = g.op("Unsqueeze", index, axes_i=[-1]) sub_data_shape = sym_help._slice_helper( @@ -201,8 +201,8 @@ def index_put(g, self, indices_list_value, values, accumulate=False): @parse_args('v', 'i') def pixel_shuffle(g, self, upscale_factor): - dims = self.type().sizes() - if len(dims) != 4: + rank = sym_help._get_tensor_rank(self) + if rank is not None and rank != 4: return _unimplemented("pixel_shuffle", "only support 4d input") return g.op("DepthToSpace", self, blocksize_i=upscale_factor, mode_s="CRD") @@ -280,11 +280,12 @@ def __interpolate(g, input, size, scale_factor, mode, align_corners, recompute_s "while exporting interpolate. Assuming that it is not a scalar.") if is_scalar: - if not input.type().dim(): + rank = sym_help._get_tensor_rank(input) + if rank is None: return sym_help._unimplemented("interpolate (with a scalar output_size)", "missing input shape (try giving an array of output_size values)") size = unsqueeze(g, size, 0) - size = [size for i in range(input.type().dim() - 2)] + size = [size for i in range(rank - 2)] size = g.op("Concat", *size, axis_i=0) size = g.op("Cast", size, to_i=sym_help.cast_pytorch_to_onnx['Long']) size = g.op("Concat", input_size, size, axis_i=0) @@ -299,9 +300,10 @@ def __interpolate(g, input, size, scale_factor, mode, align_corners, recompute_s mode_s=mode, # nearest, linear, or cubic nearest_mode_s="floor") else: # if not sym_help._is_none(scales) - if not input.type().dim(): + rank = sym_help._get_tensor_rank(input) + if rank is None: return sym_help._unimplemented("interpolate (with scales)", "missing input shape") - scales = sym_help._interpolate_get_scales(g, scale_factor, input.type().dim()) + scales = sym_help._interpolate_get_scales(g, scale_factor, rank) return g.op("Resize", input, roi, @@ -549,19 +551,19 @@ def constant_pad_nd(g, input, padding, value=None): mode = "constant" value = sym_help._maybe_get_scalar(value) value = sym_help._if_scalar_type_as(g, value, input) - pad = _prepare_onnx_paddings(g, input.type().dim(), padding) + pad = _prepare_onnx_paddings(g, sym_help._get_tensor_rank(input), padding) return g.op("Pad", input, pad, value, mode_s=mode) def reflection_pad(g, input, padding): mode = "reflect" - paddings = _prepare_onnx_paddings(g, input.type().dim(), padding) + paddings = _prepare_onnx_paddings(g, sym_help._get_tensor_rank(input), padding) return g.op("Pad", input, paddings, mode_s=mode) def replication_pad(g, input, padding): mode = "edge" - paddings = _prepare_onnx_paddings(g, input.type().dim(), padding) + paddings = _prepare_onnx_paddings(g, sym_help._get_tensor_rank(input), padding) return g.op("Pad", input, paddings, mode_s=mode) @@ -639,9 +641,12 @@ def squeeze(g, self, dim=None): dim = sym_help._get_const(dim, 'i', 'dim') - 
input_shape = self.type().sizes() - from torch.onnx.symbolic_helper import _onnx_shape_inference - if input_shape is None or not _onnx_shape_inference: + input_rank = sym_help._get_tensor_rank(self) + adjusted_dim = dim + if input_rank is not None and dim < 0: + adjusted_dim += input_rank + dim_size = sym_help._get_tensor_dim_size(self, adjusted_dim) + if (dim < 0 and input_rank is None) or dim_size is None: # If onnx shape inference is not on, export always as dynamic. # Because we cannot tell if observed static shape is also static at runtime. # create 'cond' node (condition is shape[i]==1) @@ -661,11 +666,10 @@ def squeeze(g, self, dim=None): return if_node_outputs # For static input shape - if dim < 0: - dim += self.type().dim() - if input_shape[dim] > 1: + dim = adjusted_dim + if dim_size > 1: warnings.warn("This model contains a squeeze operation on dimension " + str(dim) + ". The size of " + - "this dimension in the given input is " + str(input_shape[dim]) + ". The model will " + + "this dimension in the given input is " + str(dim_size) + ". The model will " + "be exported without the squeeze node. If the model is intended to be used with dynamic " + "input shapes, please export with dynamic_axes argument.") return self @@ -861,7 +865,7 @@ def narrow(g, input, dim, start, length): @parse_args('v', 'i', 'i') def flatten(g, input, start_dim, end_dim): - dim = input.type().dim() + dim = sym_help._get_tensor_rank(input) # use ONNX's Flatten operator for cases where the output shape is 2D if start_dim == 1: if (end_dim == -1 or (dim is not None and end_dim == dim - 1)): diff --git a/torch/onnx/symbolic_opset8.py b/torch/onnx/symbolic_opset8.py index e4023dab2320..1fa9fa5e985b 100644 --- a/torch/onnx/symbolic_opset8.py +++ b/torch/onnx/symbolic_opset8.py @@ -148,10 +148,9 @@ def matmul(g, self, other): def prelu(g, self, weight): - if self.isCompleteTensor(): - self_sizes = self.type().sizes() - if self_sizes and len(self_sizes) > 2: - weight = g.op("Unsqueeze", weight, axes_i=list(range(1, len(self_sizes) - 1))) + self_rank = sym_help._get_tensor_rank(self) + if self_rank is not None and self_rank > 2: + weight = g.op("Unsqueeze", weight, axes_i=list(range(1, self_rank - 1))) if _try_get_scalar_type(self): old_type, self, weight = _try_cast_integer_to_float(g, self, weight) return _cast_to_type(g, g.op("PRelu", self, weight), old_type) @@ -267,7 +266,7 @@ def full_like(g, input, fill_value, dtype, layout, device, pin_memory=False, mem def repeat(g, self, repeats): if not sym_help._is_value(repeats): repeats = g.op("Constant", value_t=torch.LongTensor(repeats)) - if sym_help._is_packed_list(repeats): + if sym_help._is_packed_list(repeats): repeat_size_len = len(sym_help._unpack_list(repeats)) else: const_repeats = sym_help._maybe_get_const(repeats, 'is') diff --git a/torch/onnx/symbolic_opset9.py b/torch/onnx/symbolic_opset9.py index 8630f48a62ad..bda62b638d22 100644 --- a/torch/onnx/symbolic_opset9.py +++ b/torch/onnx/symbolic_opset9.py @@ -212,9 +212,9 @@ def matmul(g, self, other): @parse_args('v', 'v', 'v', 't', 't') def addmm(g, self, mat1, mat2, beta, alpha): dtype = None - self_dtype = self.type().scalarType() - mat1_dtype = mat1.type().scalarType() - mat2_dtype = mat2.type().scalarType() + self_dtype = sym_help._try_get_scalar_type(self) + mat1_dtype = sym_help._try_get_scalar_type(mat1) + mat2_dtype = sym_help._try_get_scalar_type(mat2) if self_dtype is not None: dtype = self_dtype elif mat1_dtype is not None: @@ -222,8 +222,8 @@ def addmm(g, self, mat1, mat2, beta, alpha): elif 
mat2_dtype is not None: dtype = mat2_dtype - mat1_rank = mat1.type().dim() - mat2_rank = mat2.type().dim() + mat1_rank = sym_help._get_tensor_rank(mat1) + mat2_rank = sym_help._get_tensor_rank(mat2) def isNotNoneAnd(v, u): return v is not None and v != u @@ -463,8 +463,8 @@ def size(g, self, dim=None): if dim is None: return g.op("Shape", self) if sym_help._maybe_get_const(dim, 'i') < 0: - rank = self.type().dim() - if rank: + rank = sym_help._get_tensor_rank(self) + if rank is not None: dim = sym_help._maybe_get_const(dim, 'i') + rank dim = g.op("Constant", value_t=torch.tensor(dim)) return sym_help._size_helper(g, self, dim) @@ -476,8 +476,9 @@ def transpose(g, self, dim0, dim1): return self # NB: Transpose in ONNX is actually a Permute - if self.isCompleteTensor(): - axes = list(range(self.type().dim())) + rank = sym_help._get_tensor_rank(self) + if rank is not None: + axes = list(range(rank)) axes[dim0], axes[dim1] = axes[dim1], axes[dim0] return g.op("Transpose", self, perm_i=axes) else: @@ -512,7 +513,9 @@ def view_as(g, self, other): def prim_ConstantSplit(g, self, split_size, dim): - size = self.type().sizes()[dim] + size = sym_help._get_tensor_dim_size(self, dim) + if size is None: + return _unimplemented('prim::ConstantSplit', 'unknown dimension size') splits = [split_size] * (size // split_size) leftover = size % split_size if leftover: @@ -525,7 +528,10 @@ def prim_ConstantSplit(g, self, split_size, dim): # TODO: Once we have proper scoping, stop reimplementing chunk, delete this # method, and use the desugared version def prim_ConstantChunk(g, self, chunks, dim): - split_size = (self.type().sizes()[dim] + chunks - 1) // chunks + dim_size = sym_help._get_tensor_dim_size(self, dim) + if dim_size is None: + return _unimplemented('prim::ConstantChunk', 'unknown dimension size') + split_size = (dim_size + chunks - 1) // chunks return prim_ConstantSplit(g, self, split_size, dim) @@ -533,8 +539,10 @@ def prim_ConstantChunk(g, self, chunks, dim): def unsafe_chunk(g, self, chunks, dim, _outputs=None): if _outputs is None: return sym_help._onnx_opset_unsupported_detailed('unsafe_chunk', 9, 11, 'Dynamic number of outputs not supported') - split_size = (self.type().sizes()[dim] + chunks - 1) // chunks - size = self.type().sizes()[dim] + size = sym_help._get_tensor_dim_size(self, dim) + if size is None: + return _unimplemented('unsafe_chunk', 'unknown dimension size') + split_size = (size + chunks - 1) // chunks splits = [split_size] * (size // split_size) leftover = size % split_size if leftover: @@ -552,7 +560,9 @@ def split(g, self, split_size_or_sizes, dim, _outputs=None): split_size = sym_help._get_const(split_size_or_sizes, 'i', 'split_size') dim = sym_help._get_const(dim, 'i', 'dim') - size = self.type().sizes()[dim] + size = sym_help._get_tensor_dim_size(self, dim) + if size is None: + return sym_help._onnx_opset_unsupported_detailed('split', 9, 11, 'Unknown dimension size not supported') splits = [split_size] * (size // split_size) leftover = size % split_size if leftover: @@ -607,8 +617,8 @@ def squeeze(g, self, dim=None): squeeze_dim = sym_help._get_const(dim, 'i', 'dim') # Handle negative dims if squeeze_dim < 0: - rank = self.type().dim() - if rank: + rank = sym_help._get_tensor_rank(self) + if rank is not None: warnings.warn("ONNX export squeeze with negative axis " + str(squeeze_dim) + " might cause the onnx model to be incorrect. " + "Negative axis is not supported in ONNX. 
" + @@ -619,17 +629,17 @@ def squeeze(g, self, dim=None): else: return _unimplemented('squeeze', 'negative axis with unknown input rank') - input_shape = self.type().sizes() - if input_shape is None: + dim_size = sym_help._get_tensor_dim_size(self, squeeze_dim) + if dim_size is None: warnings.warn("This model contains a squeeze operation on dimension " + str(squeeze_dim) + " on an input " + "with unknown shape. Note that if the size of dimension " + str(squeeze_dim) + " of the input " + "is not 1, the ONNX model will return an error. Opset version 11 supports squeezing on " + "non-singleton dimensions, it is recommended to export this model using opset " + "version 11 or higher.") return g.op("Squeeze", self, axes_i=[squeeze_dim]) - if input_shape[squeeze_dim] > 1: + if dim_size > 1: warnings.warn("This model contains a squeeze operation on dimension " + str(squeeze_dim) + ". The size of " + - "this dimension in the given input is " + str(input_shape[squeeze_dim]) + ". The model will " + + "this dimension in the given input is " + str(dim_size) + ". The model will " + "be exported without the squeeze node. If the model is intended to be used with dynamic " + "input shapes, please use opset version 11 to " + "export the model.") @@ -640,10 +650,9 @@ def squeeze(g, self, dim=None): return g.op("Squeeze", self, axes_i=[squeeze_dim]) def prelu(g, self, weight): - if self.isCompleteTensor(): - self_sizes = self.type().sizes() - if self_sizes and len(self_sizes) > 2: - weight = g.op("Unsqueeze", weight, axes_i=list(range(1, len(self_sizes) - 1))) + self_rank = sym_help._get_tensor_rank(self) + if self_rank is not None and self_rank > 2: + weight = g.op("Unsqueeze", weight, axes_i=list(range(1, self_rank - 1))) return g.op("PRelu", self, weight) @@ -683,7 +692,9 @@ def leaky_relu(g, input, negative_slope, inplace=False): @parse_args('v', 'i') def glu(g, input, dim): - assert input.type().sizes()[dim] % 2 == 0 + dim_size = sym_help._get_tensor_dim_size(input, dim) + if dim_size is not None: + assert dim_size % 2 == 0 first, second = g.op('Split', input, axis_i=dim, outputs=2) return g.op('Mul', first, g.op('Sigmoid', second)) @@ -711,7 +722,7 @@ def softmax(g, input, dim, dtype=None): # otherwise transpose the input to put the vectors to be normalized to the last dimension. 
# When input rank is not known at export time we compute softmax using a subgraph # with other operators - input_dim = input.type().dim() + input_dim = sym_help._get_tensor_rank(input) if input_dim is not None: # TODO: remove this as onnx opset 11 spec allows negative axes if dim < 0: @@ -753,7 +764,10 @@ def softplus(g, self, beta, threshold): def get_pool_ceil_padding(input, kernel_size, stride, padding): - dim = input.type().sizes()[-len(padding):] + sizes = sym_help._get_tensor_sizes(input) + dim = sizes[-len(padding):] if sizes is not None else None + if dim is None or any([i is None for i in dim]): + return _unimplemented(name, "input size not accessible") ceiled_output_dim = [int(math.ceil((dim[i] + 2 * padding[i] - kernel_size[i]) / float(stride[i]))) + 1 for i in range(0, len(padding))] # ensure last pooling starts inside @@ -778,8 +792,6 @@ def get_pool_ceil_padding(input, kernel_size, stride, padding): def _max_pool(name, tuple_fn, ndims, return_indices): @parse_args('v', 'is', 'is', 'is', 'is', 'i') def symbolic_fn(g, input, kernel_size, stride, padding, dilation, ceil_mode): - if ceil_mode and not input.isCompleteTensor(): - return _unimplemented(name, "input size not accessible") if set(tuple_fn(dilation)) != {1}: return _unimplemented(name, "dilation") if not stride: @@ -836,8 +848,6 @@ def symbolic_fn(g, input, kernel_size, stride, padding, dilation, ceil_mode): def _avg_pool(name, tuple_fn): @parse_args('v', 'is', 'is', 'is', 'i', 'i', 'none') def symbolic_fn(g, input, kernel_size, stride, padding, ceil_mode, count_include_pad, divisor_override=None): - if ceil_mode and not input.isCompleteTensor(): - return _unimplemented(name, "input size not accessible") if not stride: stride = kernel_size padding = sym_help._avgpool_helper(tuple_fn, padding, kernel_size, stride, divisor_override, name) @@ -883,11 +893,15 @@ def symbolic_fn(g, input, output_size): return sym_help._onnx_unsupported('adaptive pooling, since output_size is not constant.') if output_size == [1] * len(output_size) and type == "AveragePool": return g.op("GlobalAveragePool", input) - if not input.isCompleteTensor(): + sizes = sym_help._get_tensor_sizes(input) + try: + dim = sizes[2:] + except Exception: + dim = None + if dim is None or any([i is None for i in dim]): if output_size == [1] * len(output_size): return g.op("GlobalMaxPool", input), None return _unimplemented(name, 'input size not accessible') - dim = input.type().sizes()[2:] # verify if output size % input size = 0 for all dim mod = [dim[i] % output_size[i] for i in range(0, len(dim))] if mod != [0] * len(mod): @@ -951,21 +965,21 @@ def constant_pad_nd(g, input, padding, value): return sym_help._onnx_opset_unsupported_detailed('Pad', 9, 11, 'The value for the padding must be constant') padding = _convert_padding_node(padding) - paddings = _prepare_onnx_paddings(input.type().dim(), padding) + paddings = _prepare_onnx_paddings(sym_help._get_tensor_rank(input), padding) return g.op("Pad", input, pads_i=paddings, mode_s=mode, value_f=value) def reflection_pad(g, input, padding): mode = "reflect" padding = _convert_padding_node(padding) - paddings = _prepare_onnx_paddings(input.type().dim(), padding) + paddings = _prepare_onnx_paddings(sym_help._get_tensor_rank(input), padding) return g.op("Pad", input, pads_i=paddings, mode_s=mode) def replication_pad(g, input, padding): mode = "edge" padding = _convert_padding_node(padding) - paddings = _prepare_onnx_paddings(input.type().dim(), padding) + paddings = 
_prepare_onnx_paddings(sym_help._get_tensor_rank(input), padding) return g.op("Pad", input, pads_i=paddings, mode_s=mode) @@ -1135,7 +1149,7 @@ def log_softmax(g, input, dim, dtype=None): # PyTorch dim and ONNX axis have different meanings. # See Softmax comment for details. # TODO: remove this as onnx opset 11 spec allows negative axes - input_dim = input.type().dim() + input_dim = sym_help._get_tensor_rank(input) if input_dim is None: return _unimplemented("dim", "ONNX and PyTorch use different strategies to split the input. " @@ -1161,11 +1175,19 @@ def log_softmax(g, input, dim, dtype=None): @parse_args('v', 'v', 'v', 'is', 'is', 'is', 'i', 'is', 'i', 'i', 'i', 'i', 'i') def _convolution(g, input, weight, bias, stride, padding, dilation, transposed, output_padding, groups, benchmark, deterministic, cudnn_enabled, allow_tf32): - weight_size = weight.type().sizes() + weight_size = sym_help._get_tensor_sizes(weight) + try: + kernel_shape = weight_size[2:] + except Exception: + kernel_shape = None + + if kernel_shape is None or any([i is None for i in kernel_shape]): + raise RuntimeError('Unsupported: ONNX export of convolution for kernel ' + 'of unknown shape.') args = [input, weight] # ONNX only supports 1D bias - if not sym_help._is_none(bias) and bias.type().dim() == 1: + if not sym_help._is_none(bias) and sym_help._get_tensor_rank(bias) == 1: args.append(bias) kwargs = {"kernel_shape_i": weight_size[2:], @@ -1186,7 +1208,7 @@ def _convolution(g, input, weight, bias, stride, padding, dilation, n = g.op("ConvTranspose" if transposed else "Conv", *args, **kwargs) - if not sym_help._is_none(bias) and bias.type().dim() != 1: + if not sym_help._is_none(bias) and sym_help._get_tensor_rank(bias) != 1: return g.op("Add", n, bias) else: return n @@ -1225,26 +1247,31 @@ def conv_transpose3d(g, input, weight, bias, stride, padding, output_padding, gr @parse_args('v', 'v', 'v', 'v', 'v', 'i', 'f', 'f', 'i') def batch_norm(g, input, weight, bias, running_mean, running_var, training, momentum, eps, cudnn_enabled): sym_help.assert_training_mode(training, "batch_norm") - input_sizes = input.type().sizes() + batch_size = sym_help._get_tensor_dim_size(input, 0) + channel_size = sym_help._get_tensor_dim_size(input, 1) if weight is None or sym_help._is_none(weight): - assert len(input_sizes) > 1 - weight_value = torch.tensor([1.] * input_sizes[1]).type( + if channel_size is None: + raise RuntimeError('Unsupported: ONNX export of batch_norm for unknown ' + 'channel size.') + weight_value = torch.tensor([1.] * channel_size).type( 'torch.' + input.type().scalarType() + 'Tensor') weight = g.op("Constant", value_t=weight_value) if bias is None or sym_help._is_none(bias): - assert len(input_sizes) > 1 - bias_value = torch.tensor([0.] * input_sizes[1]).type( + if channel_size is None: + raise RuntimeError('Unsupported: ONNX export of batch_norm for unknown ' + 'channel size.') + bias_value = torch.tensor([0.] * channel_size).type( 'torch.' 
+ input.type().scalarType() + 'Tensor') bias = g.op("Constant", value_t=bias_value) - # If track_running_stats is set to False batch statistics are instead used during evaluation time + # If track_running_stats is set to False batch statistics are instead used during evaluation time if running_mean is None or sym_help._is_none(running_mean) or running_var is None or sym_help._is_none(running_var): - assert len(input_sizes) > 1 - reshape_in = g.op("Reshape", input, - g.op("Constant", value_t=torch.tensor([input_sizes[0], input_sizes[1], -1], dtype=torch.int64))) + assert batch_size is not None and channel_size is not None + reshape_in = g.op("Reshape", input, + g.op("Constant", value_t=torch.tensor([batch_size, channel_size, -1], dtype=torch.int64))) trans_in = g.op('Transpose', reshape_in, perm_i=[0, 2, 1]) - running_var, running_mean = _var_mean(g, trans_in, - g.op("Constant", value_t=torch.tensor([0, 1], dtype=torch.int64)), + running_var, running_mean = _var_mean(g, trans_in, + g.op("Constant", value_t=torch.tensor([0, 1], dtype=torch.int64)), False, False) out = g.op("BatchNormalization", input, weight, bias, running_mean, running_var, epsilon_f=eps, @@ -1290,15 +1317,19 @@ def layer_norm(g, input, normalized_shape, weight, bias, eps, cudnn_enable): @parse_args('v', 'v', 'v', 'v', 'v', 'i', 'f', 'f', 'i') def instance_norm(g, input, weight, bias, running_mean, running_var, use_input_stats, momentum, eps, cudnn_enabled): - input_sizes = input.type().sizes() + channel_size = sym_help._get_tensor_dim_size(input, 1) if weight is None or sym_help._is_none(weight): - assert len(input_sizes) > 1 - weight_value = torch.tensor([1.] * input_sizes[1]).type( + if channel_size is None: + raise RuntimeError('Unsupported: ONNX export of instance_norm for unknown ' + 'channel size.') + weight_value = torch.tensor([1.] * channel_size).type( 'torch.' + input.type().scalarType() + 'Tensor') weight = g.op("Constant", value_t=weight_value) if bias is None or sym_help._is_none(bias): - assert len(input_sizes) > 1 - bias_value = torch.tensor([0.] * input_sizes[1]).type( + if channel_size is None: + raise RuntimeError('Unsupported: ONNX export of instance_norm for unknown ' + 'channel size.') + bias_value = torch.tensor([0.] * channel_size).type( 'torch.' 
+ input.type().scalarType() + 'Tensor') bias = g.op("Constant", value_t=bias_value) return g.op("InstanceNormalization", input, weight, bias, epsilon_f=eps) @@ -1308,13 +1339,17 @@ def instance_norm(g, input, weight, bias, running_mean, running_var, use_input_s def unfold(g, input, dimension, size, step): if sym_help._operator_export_type == torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK: return g.op("ATen", input, operator_s="unfold", dimension_i=dimension, size_i=size, step_i=step) - if input.isCompleteTensor(): - sizedim = input.type().sizes()[dimension] + sizes = sym_help._get_tensor_sizes(input) + try: + sizedim = sizes[dimension] + except Exception: + sizedim = None + if sizedim is not None: low_indices = range(0, sizedim, step) hi_indices = range(size, sizedim + 1, step) stack = [sym_help._slice_helper(g, input, axes=[dimension], starts=[low], ends=[hi]) for low, hi in zip(low_indices, hi_indices)] - ndim = input.type().dim() + ndim = len(sizes) perm = list(range(0, ndim)) perm.append(perm.pop(dimension)) unsqueeze = [g.op("Unsqueeze", g.op("Transpose", t, perm_i=perm), axes_i=[dimension]) for t in stack] @@ -1375,11 +1410,12 @@ def index_copy(g, self, dim, index, source): def type_as(g, self, other): - if self.isCompleteTensor() and other.isCompleteTensor() and self.type().scalarType() == other.type().scalarType(): + self_dtype = sym_help._try_get_scalar_type(self) + other_dtype = sym_help._try_get_scalar_type(other) + if self_dtype == other_dtype and self_dtype is not None: return self - if other.isCompleteTensor(): - other_type_name = other.type().scalarType() - return g.op("Cast", self, to_i=sym_help.cast_pytorch_to_onnx[other_type_name]) + if other_dtype is not None: + return g.op("Cast", self, to_i=sym_help.cast_pytorch_to_onnx[other_dtype]) else: if sym_help._operator_export_type == torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK: # We don't know the type of other, bail by emitting ATen @@ -1575,8 +1611,9 @@ def empty_like(g, input, dtype=None, layout=None, device=None, pin_memory=False, def new_empty(g, self, sizes, dtype, layout, device, pin_memory=False): - if dtype is None and self.isCompleteTensor(): - dtype = self.type().scalarType() + self_dtype = sym_help._try_get_scalar_type(self) + if dtype is None and self_dtype is not None: + dtype = self_dtype dtype = sym_help.scalar_type_to_onnx.index(sym_help.cast_pytorch_to_onnx[dtype]) return empty(g, sizes, dtype, layout, device, pin_memory) @@ -1628,8 +1665,9 @@ def zeros_like(g, input, dtype=None, layout=None, device=None, pin_memory=False, def new_zeros(g, self, sizes, dtype, layout, device, pin_memory=False): - if dtype is None and self.isCompleteTensor(): - dtype = self.type().scalarType() + self_dtype = sym_help._try_get_scalar_type(self) + if dtype is None and self_dtype is not None: + dtype = self_dtype dtype = sym_help.scalar_type_to_onnx.index(sym_help.cast_pytorch_to_onnx[dtype]) return zeros(g, sizes, dtype, layout, device, pin_memory) @@ -1679,8 +1717,9 @@ def full_like(g, input, fill_value, dtype=None, layout=None, device=None, pin_me def new_full(g, self, size, fill_value, dtype, layout, device, pin_memory=False): - if dtype is None and self.isCompleteTensor(): - dtype = self.type().scalarType() + self_dtype = sym_help._try_get_scalar_type(self) + if dtype is None and self_dtype is not None: + dtype = self_dtype dtype = sym_help.scalar_type_to_onnx.index(sym_help.cast_pytorch_to_onnx[dtype]) return full(g, size, fill_value, dtype, layout, device, pin_memory) @@ -1745,8 +1784,8 @@ def hardtanh(g, self, 
min_val, max_val): @parse_args('v') def hardswish(g, self): input = g.op("Add", self, g.op('Constant', value_t=torch.tensor(3, dtype=torch.float))) - hardtanh_ = sym_help._hardtanh_helper(g, input, - g.op('Constant', value_t=torch.tensor(0, dtype=torch.float)), + hardtanh_ = sym_help._hardtanh_helper(g, input, + g.op('Constant', value_t=torch.tensor(0, dtype=torch.float)), g.op('Constant', value_t=torch.tensor(6, dtype=torch.float))) hardtanh_ = g.op("Div", hardtanh_, g.op('Constant', value_t=torch.tensor(6, dtype=torch.float))) return g.op("Mul", self, hardtanh_) @@ -1759,8 +1798,8 @@ def alias(g, self): def unsqueeze(g, self, dim): # Handle negative dim if dim < 0: - rank = self.type().dim() - if rank: + rank = sym_help._get_tensor_rank(self) + if rank is not None: warnings.warn("ONNX export unsqueeze with negative axis " + str(dim) + " might cause the onnx model to be incorrect. " + "Negative axis is not supported in ONNX. " + @@ -1778,10 +1817,16 @@ def unsqueeze(g, self, dim): def sort(g, self, dim, decending, out=None): if out is not None: _unimplemented("Sort", "Out parameter is not supported for sort") - if not self.isCompleteTensor(): + self_sizes = sym_help._get_tensor_sizes(self) + try: + dim_size = self_sizes[dim] + except Exception: + dim_size = None + + if dim_size is None: return _unimplemented("Sort", "input size not accessible") - return g.op("TopK", self, k_i=self.type().sizes()[dim], axis_i=dim, outputs=2) + return g.op("TopK", self, k_i=dim_size, axis_i=dim, outputs=2) def numel(g, self): @@ -1844,9 +1889,11 @@ def repeat(g, self, repeats): @parse_args('v', 'i') def pixel_shuffle(g, self, upscale_factor): - dims = self.type().sizes() + dims = sym_help._get_tensor_sizes(self) if len(dims) != 4: return _unimplemented("pixel_shuffle", "only support 4d input") + if any([i is None for i in dims[1:]]): + return _unimplemented("pixel_shuffle", "only support static input shape, except for batch size") output_channel = dims[1] // upscale_factor // upscale_factor after_view = view(g, self, g.op("Constant", value_t=torch.tensor([-1, output_channel, upscale_factor, upscale_factor, dims[2], dims[3]]))) @@ -1882,7 +1929,9 @@ def _generic_rnn(g, variant, input, initial_states, all_weights, has_biases, variant = 'RNN' w_hh = all_weights[1] - hidden_size = w_hh.type().sizes()[1] + hidden_size = sym_help._get_tensor_dim_size(w_hh, 1) + if hidden_size is None: + return _unimplemented("RNN/GRU/LSTM", "unknown hidden size") unidirectional = not bidirectional @@ -2166,7 +2215,7 @@ def erf(g, input): @parse_args('v', 'i', 'i') def flatten(g, input, start_dim, end_dim): - dim = input.type().dim() + dim = sym_help._get_tensor_rank(input) if dim is None: return _unimplemented("dim", "ONNX and PyTorch use different strategies to split the input. 
" @@ -2241,13 +2290,16 @@ def scatter(g, self, dim, index, src): @parse_args('v', 'i', 'v', 'v') def scatter_add(g, self, dim, index, src): - if not self.isCompleteTensor(): - return _unimplemented("scatter_add", "input size not accessible") - dtype = self.type().scalarType() + dtype = sym_help._try_get_scalar_type(self) + if dtype is None: + return _unimplemented("scatter_add", "input dtype not accessible") dtype = sym_help.scalar_type_to_onnx.index(sym_help.cast_pytorch_to_onnx[dtype]) dtype = sym_help.scalar_type_to_pytorch_type[dtype] - sizes = self.type().sizes() - to_add = g.op("Constant", value_t=torch.zeros(sizes, dtype=dtype)) + sizes = sym_help._get_tensor_sizes(self, allow_nonstatic=False) + if sizes: + to_add = g.op("Constant", value_t=torch.zeros(sizes, dtype=dtype)) + else: + to_add = zeros_like(self, dtype) to_add = sym_help._scatter_helper(g, to_add, dim, index, src) return add(g, self, to_add) @@ -2491,7 +2543,7 @@ def try_mask_to_index(index): elif len(adv_idx_indices) == 1: return index_select(g, self, adv_idx_indices[0], indices[adv_idx_indices[0]]) else: - rank = self.type().dim() + rank = sym_help._get_tensor_rank(self) if rank is None: raise NotImplementedError("Unsupported aten::index operator of advanced indexing on tensor of unknown rank, " + "try turning on shape and type propagate during export: " + @@ -2503,7 +2555,6 @@ def try_mask_to_index(index): " is achieved by combination of multiple ONNX operators, " + "including Reshape, Transpose, Concat, and Gather. " + "If indices include negative values, the exported graph will produce incorrect results.") - rank = self.type().dim() adv_idx_count = len(adv_idx_indices) shape_tensor = _shape_as_tensor(g, self) dim_tensor_list = [ @@ -2622,8 +2673,12 @@ def group_norm(g, input, num_groups, weight, bias, eps, cudnn_enabled): return g.op("ATen", input, weight, bias, num_groups_i=num_groups, eps_f=eps, cudnn_enabled_i=cudnn_enabled, operator_s="group_norm") - input_sizes = input.type().sizes() - assert input_sizes[1] % num_groups == 0 + channel_size = sym_help._get_tensor_dim_size(input, 1) + if channel_size is not None: + assert channel_size % num_groups == 0 + input_rank = sym_help._get_tensor_rank(input) + if input_rank is None: + return _unimplemented("group_norm", "unknown input rank") # 0 in the shape list keeps dimension value unchanged. shape = [0, num_groups, -1] input_reshaped = g.op('Reshape', input, g.op('Constant', value_t=torch.LongTensor(shape))) @@ -2649,14 +2704,14 @@ def group_norm(g, input, num_groups, weight, bias, eps, cudnn_enabled): bias = g.op("Constant", value_t=bias_value) # Norm has shape [N, C, *] so we reshape weight and bias to [C, *] - axes = list(range(1, len(input_sizes) - 1)) + axes = list(range(1, input_rank - 1)) return add(g, mul(g, norm, g.op("Unsqueeze", weight, axes_i=axes)), g.op("Unsqueeze", bias, axes_i=axes)) @parse_args('v', 'v', 'i') def _weight_norm(g, weight_v, weight_g, dim): - rank = weight_v.type().dim() - if rank: + rank = sym_help._get_tensor_rank(weight_v) + if rank is not None: # W = g * ((v) / ||v||) # Compute norm_except_dim for l2 norm. dim = None means over all dims # torch's weight_norm module sets dim = -1 if it's None. 
From 44f33596d315ba1c2356fd5363bddd7c97191cf9 Mon Sep 17 00:00:00 2001 From: Bert Maher Date: Wed, 9 Dec 2020 12:12:41 -0800 Subject: [PATCH 077/250] [pe] Add gflags for num_profiled_runs and bailout_depth, laint (#49059) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49059 We'd like to be able to change these defaults without rebuilding the library. ghstack-source-id: 118140486 Test Plan: `buck build //caffe2/test:jit` Reviewed By: eellison Differential Revision: D25405568 fbshipit-source-id: 5d0561a64127adc44753e48d3b6c7f560c8b5820 --- .../runtime/profiling_graph_executor_impl.cpp | 27 +++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp b/torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp index dc6f50350bd0..31750636d762 100644 --- a/torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp +++ b/torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp @@ -35,6 +35,18 @@ C10_DEFINE_bool( true, "If this flag is set to false TorchScript will be using the legacy/original executor"); +constexpr size_t kDefaultNumProfiledRuns = 1; +constexpr size_t kDefaultBailoutDepth = 20; + +C10_DEFINE_int64( + torch_jit_num_profiled_runs, + kDefaultNumProfiledRuns, + "Number of profiling runs"); +C10_DEFINE_int64( + torch_jit_bailout_depth, + kDefaultBailoutDepth, + "Number of re-specializations"); + namespace torch { namespace jit { @@ -46,21 +58,32 @@ static std::atomic executor_mode{true}; static std::atomic profiling_mode{true}; #endif -static std::atomic num_profiled_runs{1}; -static std::atomic bailout_depth{20}; // NOLINT +static std::atomic num_profiled_runs{kDefaultNumProfiledRuns}; +static std::atomic bailout_depth{kDefaultBailoutDepth}; std::atomic& getProfilingMode() { return profiling_mode; } + std::atomic& getExecutorMode() { return executor_mode; } std::atomic& getNumProfiledRuns() { + // Initialize num_profiled_runs from command-line flag. + static const size_t init = []() { + return num_profiled_runs = FLAGS_torch_jit_num_profiled_runs; + }(); + (void)init; // Silence clang-tidy. return num_profiled_runs; } std::atomic& getBailoutDepth() { + // Initialize bailout_depth from command-line flag. + static const size_t init = []() { + return bailout_depth = FLAGS_torch_jit_bailout_depth; + }(); + (void)init; // Silence clang-tidy. return bailout_depth; } From b98e62f8ebd5e3a8f6956972afcb6847030351cf Mon Sep 17 00:00:00 2001 From: Bert Maher Date: Wed, 9 Dec 2020 12:12:41 -0800 Subject: [PATCH 078/250] [te] Add gflag for fast intrinsic expansion (#49060) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49060 TE contains a fast tanh/sigmoid implementation that may be slightly less precise than the eager implementation (I measured 1 ulp in some test cases). We disabled it by default using an #ifdef but that may be too conservative. Adding a gflag allows more testing without recompilation. 
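(On the previous patch: the `num_profiled_runs` and `bailout_depth` atomics that `torch_jit_num_profiled_runs` and `torch_jit_bailout_depth` now seed are read from the flags only once, on first use, so the flags set process-wide defaults. For quick experiments in a Python session the same counters can also be changed through the private JIT bindings. The binding names below are an assumption about this revision; they are private, unstable APIs, so this is a sketch, not a supported interface.)

```
# Sketch only: assumes torch._C._jit_set_num_profiled_runs and
# torch._C._jit_set_bailout_depth exist in this build (private APIs).
import torch

torch._C._jit_set_num_profiled_runs(2)
torch._C._jit_set_bailout_depth(1)

@torch.jit.script
def f(x):
    return torch.relu(x) * 2

x = torch.randn(8)
for _ in range(5):
    f(x)   # the first runs are profiled, later runs take the specialized plan

# Restore the defaults named in the patch above
# (kDefaultNumProfiledRuns = 1, kDefaultBailoutDepth = 20).
torch._C._jit_set_num_profiled_runs(1)
torch._C._jit_set_bailout_depth(20)
```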
ghstack-source-id: 118140487 Test Plan: `buck test //caffe2/test:jit` Reviewed By: eellison Differential Revision: D25406421 fbshipit-source-id: 252b64091edfff878d2585e77b0a6896aa096ea5 --- torch/csrc/jit/tensorexpr/llvm_codegen.cpp | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp index 509015f7ffa5..cb14b9ef4c07 100644 --- a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp +++ b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp @@ -35,6 +35,11 @@ using namespace torch::jit::tensorexpr; +C10_DEFINE_bool( + torch_jit_llvm_use_fast_intrinsics, + false, + "Use fast (but slightly less accurate) implementations of tanh and sigmoid"); + DEFINE_TRIGGER(llvm_codegen_created); DEFINE_TRIGGER(llvm_codegen_executed); @@ -496,12 +501,13 @@ void LLVMCodeGenImpl::emitKernel( irb_.SetInsertPoint(bb_); // Maybe expand some of the intrinsics. -#ifdef USE_FAST_CPU_INTRINSICS - LLVMIntrinsicsExpander intrinsics_expander; -#else - GenericIntrinsicsExpander intrinsics_expander; -#endif - stmt = stmt->accept_mutator(&intrinsics_expander); + if (FLAGS_torch_jit_llvm_use_fast_intrinsics) { + LLVMIntrinsicsExpander intrinsics_expander; + stmt = stmt->accept_mutator(&intrinsics_expander); + } else { + GenericIntrinsicsExpander intrinsics_expander; + stmt = stmt->accept_mutator(&intrinsics_expander); + } // Compile the kernel. stmt->accept(this); From c62f3fc40b22351b075004e9ef474c573bef4dea Mon Sep 17 00:00:00 2001 From: Brian Hirsh Date: Wed, 9 Dec 2020 12:20:56 -0800 Subject: [PATCH 079/250] fix clang-tidy warning - make global TorchLibraryInit objects const (#48956) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48956 ghstack-source-id: 118140666 Test Plan: GitHub CI Reviewed By: ezyang Differential Revision: D25381418 fbshipit-source-id: 1726ed233b809054cb9e5ba89e02c84fb868c1eb --- torch/library.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/torch/library.h b/torch/library.h index 41178fd10e07..ac936d29c520 100644 --- a/torch/library.h +++ b/torch/library.h @@ -643,7 +643,7 @@ class TorchLibraryInit final { /// for any given namespace. #define TORCH_LIBRARY(ns, m) \ static void TORCH_LIBRARY_init_ ## ns (torch::Library&); \ - static torch::detail::TorchLibraryInit TORCH_LIBRARY_static_init_ ## ns ( \ + static const torch::detail::TorchLibraryInit TORCH_LIBRARY_static_init_ ## ns ( \ torch::Library::DEF, \ &TORCH_LIBRARY_init_ ## ns, \ #ns, c10::nullopt, __FILE__, __LINE__ \ @@ -669,7 +669,7 @@ class TorchLibraryInit final { /// that it can only be called once for a given namespace. #define _TORCH_LIBRARY_FRAGMENT(ns, m, uid) \ static void C10_CONCATENATE(TORCH_LIBRARY_FRAGMENT_init_ ## ns ## _, uid) (torch::Library&); \ - static torch::detail::TorchLibraryInit C10_CONCATENATE(TORCH_LIBRARY_FRAGMENT_static_init_ ## ns ## _, uid) ( \ + static const torch::detail::TorchLibraryInit C10_CONCATENATE(TORCH_LIBRARY_FRAGMENT_static_init_ ## ns ## _, uid) ( \ torch::Library::FRAGMENT, \ &C10_CONCATENATE(TORCH_LIBRARY_FRAGMENT_init_ ## ns ## _, uid), \ #ns, c10::nullopt, __FILE__, __LINE__ \ @@ -725,7 +725,7 @@ class TorchLibraryInit final { /// and dispatch key in the same translation unit. 
#define _TORCH_LIBRARY_IMPL(ns, k, m, uid) \ static void C10_CONCATENATE(TORCH_LIBRARY_IMPL_init_ ## ns ## _ ## k ## _, uid) (torch::Library&); \ - static torch::detail::TorchLibraryInit C10_CONCATENATE(TORCH_LIBRARY_IMPL_static_init_ ## ns ## _ ## k ## _, uid) ( \ + static const torch::detail::TorchLibraryInit C10_CONCATENATE(TORCH_LIBRARY_IMPL_static_init_ ## ns ## _ ## k ## _, uid) ( \ torch::Library::IMPL, \ c10::guts::if_constexpr( \ []() { return & C10_CONCATENATE(TORCH_LIBRARY_IMPL_init_ ## ns ## _ ## k ## _, uid); }, \ From 7584161dfaa607e4f9e370e42e9045a036dc83c0 Mon Sep 17 00:00:00 2001 From: Pritam Damania Date: Wed, 9 Dec 2020 12:27:05 -0800 Subject: [PATCH 080/250] Enhance `new_group` doc to mention using NCCL concurrently. (#48872) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48872 Using NCCL communicators concurrently is not safe and this is documented in NCCL docs. However, this is not documented in PyTorch and we should add documentation for ProcessGroupNCCL so that users are aware of this limitation. ghstack-source-id: 118148014 Test Plan: waitforbuildbot Reviewed By: rohan-varma Differential Revision: D25351778 fbshipit-source-id: f7f448dc834c47cc1244f821362f5437dd17ce77 --- torch/distributed/distributed_c10d.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py index 1081c6ee0e44..83260ec8dbdf 100644 --- a/torch/distributed/distributed_c10d.py +++ b/torch/distributed/distributed_c10d.py @@ -2349,6 +2349,17 @@ def new_group(ranks=None, timeout=default_pg_timeout, backend=None): if they are not going to be members of the group. Additionally, groups should be created in the same order in all processes. + .. warning:: + Using multiple process groups with the ``NCCL`` backend concurrently + is not safe and the user should perform explicit synchronization in + their application to ensure only one process group is used at a time. + This means collectives from one process group should have completed + execution on the device (not just enqueued since CUDA execution is + async) before collectives from another process group are enqueued. + See `Using multiple NCCL communicators concurrently `_ for more details. + Arguments: ranks (list[int]): List of ranks of group members. If ``None``, will be set to all ranks. Default is ``None``. From f5e9ffbc279626ad8cabda49eed91dbe6399d3c4 Mon Sep 17 00:00:00 2001 From: Yixin Bao Date: Wed, 9 Dec 2020 12:33:15 -0800 Subject: [PATCH 081/250] Check CUDA kernel launches (/fbcode/caffe2/) (#49105) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49105 (1) Add a safety check `C10_CUDA_KERNEL_LAUNCH_CHECK()` after each kernel launch. This diff only changes the files inside the directory /fbsource/fbcode/caffe2/modules/, /fbsource/fbcode/caffe2/fb/, /fbsource/fbcode/caffe2/test/. (2) Get rid of old check `AT_CUDA_CHECK(cudaGetLastError())` when necessary. Test Plan: Test build: ``` buck build //caffe2/modules/detectron: buck build //caffe2/torch/fb/: ``` To check for launches without checks: ``` python3 caffe2/torch/testing/check_kernel_launches.py ``` Make sure none of the updated files are in the returned list. 
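The check script invoked in the test plan above is not reproduced here. The following is a deliberately simplified, self-contained approximation of the scan it performs, included only to make the convention concrete: every `<<<...>>>` kernel launch should be followed closely by `C10_CUDA_KERNEL_LAUNCH_CHECK();`. The regex, search window, and file discovery below are simplifications and assumptions, not the real script's logic.

```
# Simplified approximation; not the real caffe2/torch/testing/check_kernel_launches.py.
# Flags any <<<...>>>(...) launch that is not followed by the check macro
# within a short window of text after the launch statement.
import re
import sys
from pathlib import Path

LAUNCH = re.compile(r">>>\s*\([^;]*;")   # end of a kernel launch statement
CHECK = "C10_CUDA_KERNEL_LAUNCH_CHECK();"
WINDOW = 200                             # chars after the launch to search

def unchecked_launches(source):
    for match in LAUNCH.finditer(source):
        if CHECK not in source[match.end():match.end() + WINDOW]:
            yield source.count("\n", 0, match.start()) + 1   # 1-based line

def main(root):
    bad = 0
    for path in Path(root).rglob("*.cu"):
        for line in unchecked_launches(path.read_text(errors="ignore")):
            print("%s:%d: kernel launch without %s" % (path, line, CHECK))
            bad += 1
    return bad

if __name__ == "__main__":
    sys.exit(1 if main(sys.argv[1] if len(sys.argv) > 1 else ".") else 0)
```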
Reviewed By: r-barnes Differential Revision: D25325039 fbshipit-source-id: 2043d6e63c7d029c35576d3101c18247ffe92f01 --- modules/detectron/group_spatial_softmax_op.cu | 3 +++ modules/detectron/ps_roi_pool_op.cu | 2 ++ modules/detectron/roi_pool_f_op.cu | 2 ++ modules/detectron/select_smooth_l1_loss_op.cu | 2 ++ modules/detectron/sigmoid_cross_entropy_loss_op.cu | 5 +++++ modules/detectron/sigmoid_focal_loss_op.cu | 2 ++ modules/detectron/smooth_l1_loss_op.cu | 3 +++ modules/detectron/softmax_focal_loss_op.cu | 5 +++++ modules/detectron/spatial_narrow_as_op.cu | 2 ++ modules/detectron/upsample_nearest_op.cu | 3 +++ test/cpp_extensions/cuda_extension.cu | 1 + test/cpp_extensions/cuda_extension_kernel.cu | 1 + test/cpp_extensions/cuda_extension_kernel2.cu | 1 + torch/lib/c10d/test/CUDATest.cu | 1 + 14 files changed, 33 insertions(+) diff --git a/modules/detectron/group_spatial_softmax_op.cu b/modules/detectron/group_spatial_softmax_op.cu index 92e89ae5acc2..a37a3fba55a7 100644 --- a/modules/detectron/group_spatial_softmax_op.cu +++ b/modules/detectron/group_spatial_softmax_op.cu @@ -112,6 +112,7 @@ bool GroupSpatialSoftmaxOp::RunOnDevice() { GroupSpatialSoftmaxKernel<<>>( N, A, W, H, Xdata, Pdata, num_classes_); + C10_CUDA_KERNEL_LAUNCH_CHECK(); return true; } @@ -158,11 +159,13 @@ bool GroupSpatialSoftmaxGradientOp::RunOnDevice() { SumProbsKernel<<>>( N, A, W, H, Ydata, dYdata, sum_probs_data, num_classes_); + C10_CUDA_KERNEL_LAUNCH_CHECK(); // Step 2: dX[i] = dX[i] - s SubSumKernel<<>>( N, A, W, H, sum_probs_.data(), dXdata, num_classes_); + C10_CUDA_KERNEL_LAUNCH_CHECK(); // Step 3: dX[i] = Y[i] * dX[i] math::Mul(Y.size(), dXdata, Ydata, dXdata, &context_); diff --git a/modules/detectron/ps_roi_pool_op.cu b/modules/detectron/ps_roi_pool_op.cu index 1ba418be5c99..68e4ec377d62 100644 --- a/modules/detectron/ps_roi_pool_op.cu +++ b/modules/detectron/ps_roi_pool_op.cu @@ -253,6 +253,7 @@ bool PSRoIPoolOp::RunOnDevice() { output_size, X.data(), spatial_scale_, X.dim32(1), X.dim32(2), X.dim32(3), pooled_height_, pooled_width_, R.data(), output_dim_, group_size_, Y->mutable_data(), A->mutable_data()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); return true; } @@ -276,6 +277,7 @@ bool PSRoIPoolGradientOp::RunOnDevice() { dY.size(), dY.data(), A.data(), R.dim32(0), spatial_scale_, X.dim32(1), X.dim32(2), X.dim32(3), pooled_height_, pooled_width_, output_dim_, dX->mutable_data(), R.data()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); return true; } diff --git a/modules/detectron/roi_pool_f_op.cu b/modules/detectron/roi_pool_f_op.cu index 62948f7eacbe..b261911b95a1 100644 --- a/modules/detectron/roi_pool_f_op.cu +++ b/modules/detectron/roi_pool_f_op.cu @@ -149,6 +149,7 @@ bool RoIPoolFOp::RunOnDevice() { output_size, X.data(), spatial_scale_, X.dim32(1), X.dim32(2), X.dim32(3), pooled_height_, pooled_width_, R.data(), Y->mutable_data(), A->mutable_data()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); return true; } @@ -173,6 +174,7 @@ bool RoIPoolFGradientOp::RunOnDevice() { dY.size(), dY.data(), A.data(), R.dim32(0), spatial_scale_, X.dim32(1), X.dim32(2), X.dim32(3), pooled_height_, pooled_width_, dX->mutable_data(), R.data()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } return true; } diff --git a/modules/detectron/select_smooth_l1_loss_op.cu b/modules/detectron/select_smooth_l1_loss_op.cu index 9065bfc7afbe..ce68fcff634d 100644 --- a/modules/detectron/select_smooth_l1_loss_op.cu +++ b/modules/detectron/select_smooth_l1_loss_op.cu @@ -129,6 +129,7 @@ bool SelectSmoothL1LossOp::RunOnDevice() { M, Y_hat.data(), Y.data(), L.data(), 
buff_.mutable_data(), S.data(), beta_); + C10_CUDA_KERNEL_LAUNCH_CHECK(); // Sum of all losses // al := sum_i l_i @@ -175,6 +176,7 @@ bool SelectSmoothL1LossGradientOp::RunOnDevice() { D, H, W, M, Y_hat.data(), Y.data(), L.data(), d_Y_hat->mutable_data(), d_avg_loss.data(), scale_, S.data(), beta_); + C10_CUDA_KERNEL_LAUNCH_CHECK(); return true; } diff --git a/modules/detectron/sigmoid_cross_entropy_loss_op.cu b/modules/detectron/sigmoid_cross_entropy_loss_op.cu index d69a7b41dc33..bb86560fcb01 100644 --- a/modules/detectron/sigmoid_cross_entropy_loss_op.cu +++ b/modules/detectron/sigmoid_cross_entropy_loss_op.cu @@ -93,6 +93,8 @@ bool SigmoidCrossEntropyLossOp::RunOnDevice() { T.data(), losses_.mutable_data(), counts_.mutable_data()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + float* avg_loss_data = avg_loss->mutable_data(); math::Sum( losses_.size(), losses_.data(), avg_loss_data, &context_); @@ -106,6 +108,7 @@ bool SigmoidCrossEntropyLossOp::RunOnDevice() { CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>(normalizer_.size(), normalizer_data, 1e-5); + C10_CUDA_KERNEL_LAUNCH_CHECK(); math::Div( 1, avg_loss_data, normalizer_data, avg_loss_data, &context_); } @@ -135,6 +138,7 @@ bool SigmoidCrossEntropyLossGradientOp::RunOnDevice() { T.data(), dX->mutable_data(), counts_.mutable_data()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); if (normalize_) { float* normalizer_data = normalizer_.mutable_data(); math::Sum( @@ -145,6 +149,7 @@ bool SigmoidCrossEntropyLossGradientOp::RunOnDevice() { CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>(normalizer_.size(), normalizer_data, 1e-5); + C10_CUDA_KERNEL_LAUNCH_CHECK(); math::Div( 1, d_avg_loss.data(), diff --git a/modules/detectron/sigmoid_focal_loss_op.cu b/modules/detectron/sigmoid_focal_loss_op.cu index 5b130c8dfc1f..e6f2dea21b5d 100644 --- a/modules/detectron/sigmoid_focal_loss_op.cu +++ b/modules/detectron/sigmoid_focal_loss_op.cu @@ -134,6 +134,7 @@ bool SigmoidFocalLossOp::RunOnDevice() { N, D, H, W, X.data(), T.data(), wp.data(), gamma_, alpha_, num_classes_, losses_.mutable_data()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); math::Sum( losses_.size(), losses_.data(), avg_loss_data, &context_); @@ -165,6 +166,7 @@ bool SigmoidFocalLossGradientOp::RunOnDevice() { N, D, H, W, X.data(), T.data(), dX->mutable_data(), wp.data(), gamma_, alpha_, num_classes_, d_avg_loss.data()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); math::Scale( dX->size(), scale_, diff --git a/modules/detectron/smooth_l1_loss_op.cu b/modules/detectron/smooth_l1_loss_op.cu index 1a3e8b78b53f..ea835a4bc2b9 100644 --- a/modules/detectron/smooth_l1_loss_op.cu +++ b/modules/detectron/smooth_l1_loss_op.cu @@ -102,6 +102,7 @@ bool SmoothL1LossOp::RunOnDevice() { context_.cuda_stream()>>>( buff_.size(), buff_.data(), buff_.mutable_data(), beta_); + C10_CUDA_KERNEL_LAUNCH_CHECK(); // Element-wise weighted smooth l1 loss (can be used to specify a per-element // loss weight) @@ -164,6 +165,8 @@ bool SmoothL1LossGradientOp::RunOnDevice() { context_.cuda_stream()>>>( buff_.size(), buff_.data(), d_Y_hat->mutable_data(), d_avg_loss.data(), scale_ / N, beta_); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + // Element-wise scale by alpha_in and alpha_out math::Mul( d_Y_hat->size(), d_Y_hat->data(), alpha_in.data(), diff --git a/modules/detectron/softmax_focal_loss_op.cu b/modules/detectron/softmax_focal_loss_op.cu index 93635269f176..b7f8d2423ebc 100644 --- a/modules/detectron/softmax_focal_loss_op.cu +++ b/modules/detectron/softmax_focal_loss_op.cu @@ -176,6 +176,7 @@ bool SoftmaxFocalLossOp::RunOnDevice() { <<>>( N, A, H, W, 
Xdata, P->mutable_data(), num_classes_); + C10_CUDA_KERNEL_LAUNCH_CHECK(); // Compute loss for each x,y location const int* Tdata = T.data(); @@ -184,6 +185,7 @@ bool SoftmaxFocalLossOp::RunOnDevice() { 0, context_.cuda_stream()>>>( N, A, H, W, P->data(), Tdata, losses_.mutable_data(), Wdata, gamma_, alpha_, num_classes_); + C10_CUDA_KERNEL_LAUNCH_CHECK(); // sum the losses float* avg_loss_data = avg_loss->mutable_data(); @@ -227,6 +229,8 @@ bool SoftmaxFocalLossGradientOp::RunOnDevice() { 0, context_.cuda_stream()>>>( N, A, H, W, Pdata, Tdata, buff_.mutable_data(), Wdata, gamma_, alpha_, num_classes_); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + // Compute the gradient with the weights const float* Bdata = buff_.data(); SoftmaxFocalLossGradientKernel @@ -234,6 +238,7 @@ bool SoftmaxFocalLossGradientOp::RunOnDevice() { 0, context_.cuda_stream()>>>( N, D, H, W, Pdata, Tdata, Bdata, d_avg_loss.data(), dX->mutable_data(), num_classes_); + C10_CUDA_KERNEL_LAUNCH_CHECK(); math::Scale( dX->size(), scale_, diff --git a/modules/detectron/spatial_narrow_as_op.cu b/modules/detectron/spatial_narrow_as_op.cu index 97ddc492eb07..ff8b5632e80a 100644 --- a/modules/detectron/spatial_narrow_as_op.cu +++ b/modules/detectron/spatial_narrow_as_op.cu @@ -115,6 +115,7 @@ bool SpatialNarrowAsOp::DoRunWithType() { out_width, A.template data(), C->template mutable_data()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); return true; } @@ -152,6 +153,7 @@ bool SpatialNarrowAsGradientOp::DoRunWithType() { out_width, dC.template data(), dA->template mutable_data()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); return true; } diff --git a/modules/detectron/upsample_nearest_op.cu b/modules/detectron/upsample_nearest_op.cu index 38af4254f922..0ea32e348c0b 100644 --- a/modules/detectron/upsample_nearest_op.cu +++ b/modules/detectron/upsample_nearest_op.cu @@ -164,6 +164,8 @@ bool UpsampleNearestOp::RunOnDevice() { upscale<<>>( input_data, output_data, no_elements, scale_, d1, d2, d3); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + return true; } @@ -209,6 +211,7 @@ bool UpsampleNearestGradientOp::RunOnDevice() { math::Set(no_elements, 0.f, gradInput_data, &context_); downscale<<>>( gradInput_data, gradOutput_data, no_elements, scale_, d1, d2, d3); + C10_CUDA_KERNEL_LAUNCH_CHECK(); return true; } diff --git a/test/cpp_extensions/cuda_extension.cu b/test/cpp_extensions/cuda_extension.cu index 29511af8a0ed..fb3bbd178c07 100644 --- a/test/cpp_extensions/cuda_extension.cu +++ b/test/cpp_extensions/cuda_extension.cu @@ -26,4 +26,5 @@ void sigmoid_add_cuda(const float* x, const float* y, float* output, int size) { const int threads = 1024; const int blocks = (size + threads - 1) / threads; sigmoid_add_kernel<<>>(x, y, output, size); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } diff --git a/test/cpp_extensions/cuda_extension_kernel.cu b/test/cpp_extensions/cuda_extension_kernel.cu index 660219989863..c8dce124f9df 100644 --- a/test/cpp_extensions/cuda_extension_kernel.cu +++ b/test/cpp_extensions/cuda_extension_kernel.cu @@ -20,4 +20,5 @@ void sigmoid_add_cuda(const float* x, const float* y, float* output, int size) { const int threads = 1024; const int blocks = (size + threads - 1) / threads; sigmoid_add_kernel<<>>(x, y, output, size); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } diff --git a/test/cpp_extensions/cuda_extension_kernel2.cu b/test/cpp_extensions/cuda_extension_kernel2.cu index 817bdf64ac8e..4cdc25cc0110 100644 --- a/test/cpp_extensions/cuda_extension_kernel2.cu +++ b/test/cpp_extensions/cuda_extension_kernel2.cu @@ -20,4 +20,5 @@ void tanh_add_cuda(const float* x, const 
float* y, float* output, int size) { const int threads = 1024; const int blocks = (size + threads - 1) / threads; tanh_add_kernel<<>>(x, y, output, size); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } diff --git a/torch/lib/c10d/test/CUDATest.cu b/torch/lib/c10d/test/CUDATest.cu index c47b29ea536d..88f87492206c 100644 --- a/torch/lib/c10d/test/CUDATest.cu +++ b/torch/lib/c10d/test/CUDATest.cu @@ -17,6 +17,7 @@ __global__ void waitClocks(const uint64_t count) { void cudaSleep(at::cuda::CUDAStream& stream, uint64_t clocks) { waitClocks<<<1, 1, 0, stream.stream()>>>(clocks); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } int cudaNumDevices() { From fc0a3a1787ce3fcc7846665ce12805b006c18231 Mon Sep 17 00:00:00 2001 From: Peter Bell Date: Wed, 9 Dec 2020 12:36:34 -0800 Subject: [PATCH 082/250] Improve torch.fft n-dimensional transforms (#46911) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/46911 Test Plan: Imported from OSS Reviewed By: ngimel Differential Revision: D25420647 Pulled By: mruberry fbshipit-source-id: bf7e6a2ec41f9f95ffb05c128ee0f3297e34aae2 --- aten/src/ATen/native/SpectralOps.cpp | 126 ++-------- aten/src/ATen/native/cuda/SpectralOps.cu | 254 +++++++++++++++++++++ aten/src/ATen/native/mkl/SpectralOps.cpp | 152 +++++++++++- aten/src/ATen/native/native_functions.yaml | 24 ++ test/test_spectral_ops.py | 6 +- tools/autograd/derivatives.yaml | 9 + tools/autograd/gen_variable_type.py | 1 + torch/csrc/autograd/FunctionsManual.cpp | 66 ++++++ torch/csrc/autograd/FunctionsManual.h | 3 + 9 files changed, 535 insertions(+), 106 deletions(-) diff --git a/aten/src/ATen/native/SpectralOps.cpp b/aten/src/ATen/native/SpectralOps.cpp index d1fabaec6093..23e2caa71509 100644 --- a/aten/src/ATen/native/SpectralOps.cpp +++ b/aten/src/ATen/native/SpectralOps.cpp @@ -119,25 +119,12 @@ Tensor fft_c2r(Tensor input, c10::optional n_opt, if (n_opt) { input = resize_fft_input(input, dim, n/2 + 1); } - // _fft only operates on the last dim, so transpose the selected dim to the end - const bool must_transpose = (dim != input_dim - 1); - if (must_transpose) { - input = at::transpose(input, -1, dim); - } const auto norm = norm_from_string(norm_str, forward); if (forward) { // FIXME: _fft does not support complex_output=false with inverse=false input = at::conj(input); } - auto out = _fft(at::view_as_real(input), - /*signal_ndim=*/1, /*complex_input=*/true, - /*complex_output=*/false, /*inverse=*/true, - /*signal_sizes=*/{n}, /*normalization=*/norm, - /*onesided=*/true); - if (must_transpose) { - out = at::transpose(out, -1, dim); - } - return out; + return at::_fft_c2r(input, dim, static_cast(norm), n); } // Real to complex FFT @@ -153,22 +140,11 @@ Tensor fft_r2c(Tensor input, c10::optional n_opt, if (n_opt) { input = resize_fft_input(input, dim, n); } - // _fft only operates on the last dim, so transpose the selected dim to the end - const bool must_transpose = (dim != input_dim - 1); - if (must_transpose) { - input = at::transpose(input, -1, dim); - } + const auto norm = norm_from_string(norm_str, forward); - auto out = _fft(input, /*signal_ndim=*/1, /*complex_input=*/false, - /*complex_output=*/true, /*inverse=*/false, - /*signal_sizes=*/{n}, /*normalization=*/norm, - /*onesided=*/onesided); - out = at::view_as_complex(out); - if (must_transpose) { - out = at::transpose(out, -1, dim); - } + auto out = at::_fft_r2c(input, dim, static_cast(norm), onesided); if (!forward) { - // FIXME: _fft does not support complex_input=false with inverse=true + // FIXME: _fft_r2c doesn't support native r2c IFFT out = 
at::conj(out); } return out; @@ -186,22 +162,8 @@ Tensor fft_c2c(Tensor input, c10::optional n_opt, if (n_opt) { input = resize_fft_input(input, dim, n); } - // _fft only operates on the last dim, so transpose the selected dim to the end - const bool must_transpose = (dim != input_dim - 1); - if (must_transpose) { - input = at::transpose(input, -1, dim); - } const auto norm = norm_from_string(norm_str, forward); - auto out = _fft(at::view_as_real(input), - /*signal_ndim=*/1, /*complex_input=*/true, - /*complex_output=*/true, /*inverse=*/!forward, - /*signal_sizes=*/{}, /*normalization=*/norm, - /*onesided=*/false); - out = at::view_as_complex(out); - if (must_transpose) { - out = at::transpose(out, -1, dim); - } - return out; + return at::_fft_c2c(input, dim, static_cast(norm), forward); } // Dimensions to transform, and the signal shape in those dimensions @@ -277,44 +239,12 @@ Tensor fftn_c2c( const Tensor& input, IntArrayRef shape, IntArrayRef dim, c10::optional norm_str, bool forward) { TORCH_CHECK(input.is_complex(), "Expected a complex input tensor to FFT"); - const auto input_dim = input.dim(); - Tensor x = resize_fft_input(input, dim, shape); - x = at::view_as_real(x); - - const int64_t transform_ndim = dim.size(); const auto norm = norm_from_string(norm_str, forward); - // _fft_with_size only supports 3 dimensions being transformed at a time. - // This limit is inherited from cuFFT. - constexpr int64_t max_signal_ndim = 3; - - // Transform n dimensions, up to 3 at a time - // TODO: rewrite _fft_with_size to transform more than 3 dimensions at once. - for (int64_t i = 0; i < transform_ndim; i += max_signal_ndim) { - const int64_t signal_ndim = std::min(transform_ndim - i, max_signal_ndim); - DimVector source_dim(signal_ndim); - DimVector dest_dim(signal_ndim); - - for (int64_t j = 0; j < signal_ndim; ++j) { - source_dim[j] = dim[i + j]; - dest_dim[j] = j + (input_dim - signal_ndim); - } - - // _fft operates on up-to the last 3 dims, so move selected dims to the end - x = at::movedim(x, source_dim, dest_dim); - - x = _fft(x, signal_ndim, /*complex_input=*/true, /*complex_output=*/true, - /*inverse=*/!forward, /*signal_sizes=*/{}, /*normalization=*/norm, - /*onesided=*/false); - - // Move transform dims back to their original order - x = at::movedim(x, dest_dim, source_dim); - } - - return at::view_as_complex(x); + return at::_fft_c2c(x, dim, static_cast(norm), forward); } -} +} // namespace (anonymous) // torch.fft.fft, analogous to NumPy's numpy.fft.fft Tensor fft_fft(const Tensor& self, c10::optional n, int64_t dim, @@ -370,44 +300,36 @@ Tensor fft_ifftn(const Tensor& self, c10::optional s, Tensor fft_rfftn(const Tensor& self, c10::optional s, c10::optional dim, - c10::optional norm) { + c10::optional norm_str) { + TORCH_CHECK(!self.is_complex(), "rfftn expects a real-valued input tensor, but got ", self.scalar_type()); auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim); TORCH_CHECK(desc.shape.size() > 0, "rfftn must transform at least one axis"); - - const auto last_dim = desc.dim.back(); - const auto last_shape = desc.shape.back(); - desc.shape.pop_back(); - desc.dim.pop_back(); - - // rfft on last dim to get hermitian complex shape - auto x = native::fft_rfft(self, last_shape, last_dim, norm); - // Normal fft on remaining dims - return fftn_c2c(x, desc.shape, desc.dim, norm, /*forward=*/true); + Tensor input = promote_tensor_fft(self, /*require_complex=*/false); + Tensor x = resize_fft_input(input, desc.dim, desc.shape); + const auto norm = norm_from_string(norm_str, 
/*forward=*/true); + return at::_fft_r2c(x, desc.dim, static_cast(norm), /*onesided=*/true); } Tensor fft_irfftn(const Tensor& self, c10::optional s, c10::optional dim, - c10::optional norm) { + c10::optional norm_str) { auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim); TORCH_CHECK(desc.shape.size() > 0, "irfftn must transform at least one axis"); - const auto last_dim = desc.dim.back(); - const auto last_shape = [&]() -> c10::optional { - // If shape is defaulted in the last dimension, - // pass nullopt to irfft and let it calculate the default size + const auto last_dim_size = [&] { + // Fixup default shape handling in the last dimension, if (!s.has_value() || (s->back() == -1)) { - return c10::nullopt; + const auto last_dim = desc.dim.back(); + return 2 * (self.sizes()[last_dim] - 1); } return desc.shape.back(); }(); - desc.shape.pop_back(); - desc.dim.pop_back(); - - // Normal ifft for all but last dim - Tensor x = promote_tensor_fft(self, /*require_complex=*/true); - x = fftn_c2c(x, desc.shape, desc.dim, norm, /*forward=*/false); - // Then 1d irfft on last dim to get real output - return native::fft_irfft(x, last_shape, last_dim, norm); + desc.shape.back() = last_dim_size / 2 + 1; + + Tensor input = promote_tensor_fft(self, /*require_complex=*/true); + Tensor x = resize_fft_input(input, desc.dim, desc.shape); + const auto norm = norm_from_string(norm_str, /*forward=*/false); + return at::_fft_c2r(x, desc.dim, static_cast(norm), last_dim_size); } Tensor fft_fft2(const Tensor& self, c10::optional s, diff --git a/aten/src/ATen/native/cuda/SpectralOps.cu b/aten/src/ATen/native/cuda/SpectralOps.cu index 15f0eec4d7a2..de807c8c5300 100644 --- a/aten/src/ATen/native/cuda/SpectralOps.cu +++ b/aten/src/ATen/native/cuda/SpectralOps.cu @@ -335,6 +335,260 @@ void cufft_clear_plan_cache_impl(int64_t device_index) { } // namespace at::native::detail +namespace { +constexpr int64_t cufft_max_ndim = 3; + +// Execute a general fft operation (can be c2c, onesided r2c or onesided c2r) +static Tensor& _exec_fft(Tensor& out, const Tensor& self, IntArrayRef out_sizes, + IntArrayRef dim, bool forward) { + const auto ndim = self.dim(); + const int64_t signal_ndim = dim.size(); + const auto batch_dims = ndim - signal_ndim; + + // Permute dimensions so batch dimensions come first, and in stride order + // This maximizes data locality when collapsing to a single batch dimension + DimVector dim_permute(ndim); + std::iota(dim_permute.begin(), dim_permute.end(), int64_t{0}); + + c10::SmallVector is_transformed_dim(ndim); + for (const auto& d : dim) { + is_transformed_dim[d] = true; + } + auto batch_end = std::partition(dim_permute.begin(), dim_permute.end(), + [&](int64_t d) {return !is_transformed_dim[d]; }); + auto self_strides = self.strides(); + std::sort(dim_permute.begin(), batch_end, + [&](int64_t a, int64_t b) { return self_strides[a] > self_strides[b]; }); + std::copy(dim.cbegin(), dim.cend(), batch_end); + auto input = self.permute(dim_permute); + + // Collapse batch dimensions into a single dimension + DimVector batched_sizes(signal_ndim + 1); + batched_sizes[0] = -1; + std::copy(input.sizes().cbegin() + batch_dims, input.sizes().cend(), batched_sizes.begin() + 1); + input = input.reshape(batched_sizes); + + const auto batch_size = input.sizes()[0]; + DimVector signal_size(signal_ndim + 1); + signal_size[0] = batch_size; + for (int64_t i = 0; i < signal_ndim; ++i) { + auto in_size = input.sizes()[i + 1]; + auto out_size = out_sizes[dim[i]]; + signal_size[i + 1] = std::max(in_size, out_size); 
+ TORCH_INTERNAL_ASSERT(in_size == signal_size[i + 1] || + in_size == (signal_size[i + 1] / 2) + 1); + TORCH_INTERNAL_ASSERT(out_size == signal_size[i + 1] || + out_size == (signal_size[i + 1] / 2) + 1); + } + + batched_sizes[0] = batch_size; + DimVector batched_out_sizes(batched_sizes.begin(), batched_sizes.end()); + for (size_t i = 0; i < dim.size(); ++i) { + batched_out_sizes[i + 1] = out_sizes[dim[i]]; + } + out.resize_(batched_out_sizes, MemoryFormat::Contiguous); + + // Create the transform plan (either from cache or locally) + const auto value_type = c10::toValueType(input.scalar_type()); + auto fft_type = GetCuFFTTransformType(input.is_complex(), out.is_complex()); + CuFFTParams Params(input.strides(), out.strides(), signal_size, fft_type, value_type); + CuFFTParamsLRUCache& plan_cache = cufft_get_plan_cache(input.device().index()); + std::unique_lock guard(plan_cache.mutex, std::defer_lock); + c10::optional uncached_plan; + const CuFFTConfig * config = nullptr; + + if (plan_cache.max_size() > 0) { + guard.lock(); + if (plan_cache.max_size() > 0) { // check again after acquiring the lock + config = &plan_cache.lookup(Params); + } + } + + if (config == nullptr) { + uncached_plan.emplace(Params); + config = &uncached_plan.value(); + } + + auto & plan = config->plan(); + + if (config->should_clone_input()) { + input = input.clone(MemoryFormat::Contiguous); + } + + // prepare cufft for execution + CUFFT_CHECK(cufftSetStream(plan, at::cuda::getCurrentCUDAStream())); + auto workspace = at::empty({ config->workspace_size() }, at::device(at::kCUDA).dtype(at::kByte)); + CUFFT_CHECK(cufftSetWorkArea(plan, workspace.data_ptr())); + + // execute transform plan + exec_cufft_plan(*config, input.data_ptr(), out.data_ptr(), forward); + + // Inplace reshaping to original batch shape and inverting the dimension permutation + DimVector out_strides(ndim); + int64_t batch_numel = 1; + for (int64_t i = batch_dims - 1; i >= 0; --i) { + out_strides[dim_permute[i]] = batch_numel * out.strides()[0]; + batch_numel *= out_sizes[dim_permute[i]]; + } + for (int64_t i = batch_dims; i < ndim; ++i) { + out_strides[dim_permute[i]] = out.strides()[1 + (i - batch_dims)]; + } + return out.as_strided_(out_sizes, out_strides, out.storage_offset()); +} + +// Calculates the normalization constant and applies it in-place to self +// sizes is the sizes of a twosided tensor and dims are all transformed dims +void _fft_apply_normalization(const Tensor& self, int64_t normalization, IntArrayRef sizes, IntArrayRef dims) { + auto norm = static_cast(normalization); + if (norm == fft_norm_mode::none) { + return; + } + + int64_t signal_numel = 1; + for (auto dim : dims) { + signal_numel *= sizes[dim]; + } + const double scale_denom = (norm == fft_norm_mode::by_root_n) ? + std::sqrt(signal_numel) : static_cast(signal_numel); + self.div_(scale_denom); +} + +} // namespace (anonymous) + +// n-dimensional real to complex FFT +Tensor _fft_r2c_cufft(const Tensor& self, IntArrayRef dim, int64_t normalization, bool onesided) { + TORCH_CHECK(self.is_floating_point()); + auto input_sizes = self.sizes(); + DimVector onesided_sizes(input_sizes.begin(), input_sizes.end()); + auto last_dim = dim.back(); + auto last_dim_halfsize = (input_sizes[last_dim]) / 2 + 1; + onesided_sizes[last_dim] = last_dim_halfsize; + IntArrayRef out_sizes = onesided ? 
onesided_sizes : input_sizes; + + const auto out_options = self.options().dtype(c10::toComplexType(self.scalar_type())); + auto output = at::empty(out_sizes, out_options); + + // CuFFT requires real input to be over-aligned, as if it were complex + const auto complex_size = 2 * self.element_size(); + const bool complex_aligned = ( + reinterpret_cast(self.data_ptr()) % complex_size == 0); + auto working_tensor = self; + if (!complex_aligned) { + working_tensor = self.movedim(last_dim, -1) + .clone(MemoryFormat::Contiguous) + .movedim(-1, last_dim); + } + + // First do the R2C transform on the last dimension + { + auto target_sizes = dim.size() == 1 ? out_sizes : onesided_sizes; + _exec_fft(output, working_tensor, target_sizes, last_dim, /*forward=*/true); + if (dim.size() > 1) { + working_tensor = at::empty(out_sizes, out_options); + } + } + + // Then any remaining C2C transforms + DimVector sorted_dims(dim.begin(), dim.end() - 1); + while (!sorted_dims.empty()) { + std::swap(output, working_tensor); + + // Resort dimensions every time as _exec_fft re-strides the output + auto strides = working_tensor.strides(); + std::sort(sorted_dims.begin(), sorted_dims.end(), + [&](int64_t a, int64_t b) { return strides[a] > strides[b]; }); + + const auto max_dims = std::min(static_cast(cufft_max_ndim), sorted_dims.size()); + auto last_dims = IntArrayRef(sorted_dims).slice(sorted_dims.size() - max_dims, max_dims); + + // Intermediate results are always onesided + _exec_fft(output, working_tensor, onesided_sizes, last_dims, /*forward=*/true); + sorted_dims.resize(sorted_dims.size() - max_dims); + } + + // Only need to normalize the onesided slice since data in the other half is overwritten + auto out_slice = output.slice(last_dim, 0, last_dim_halfsize); + _fft_apply_normalization(out_slice, normalization, input_sizes, dim); + + if (!onesided) { + if (output.sizes()[last_dim] != out_sizes[last_dim]) { + working_tensor.resize_(out_sizes, MemoryFormat::Contiguous); + working_tensor.slice(last_dim, 0, last_dim_halfsize).copy_(output); + output = std::move(working_tensor); + } + at::native::_fft_fill_with_conjugate_symmetry_(output, dim); + } + return output; +} + +// n-dimensional complex to real IFFT +Tensor _fft_c2r_cufft(const Tensor& self, IntArrayRef dim, int64_t normalization, int64_t lastdim) { + TORCH_CHECK(self.is_complex()); + auto in_sizes = self.sizes(); + DimVector out_sizes(in_sizes.begin(), in_sizes.end()); + out_sizes[dim.back()] = lastdim; + + // First complete any C2C transforms + Tensor temp; + if (dim.size() > 1) { + temp = _fft_c2c_cufft( + self, dim.slice(0, dim.size() - 1), + static_cast(fft_norm_mode::none), /*forward=*/false); + } else { + // Complex to real FFTs may overwrite the input buffer, so must always clone (gh-34551) + temp = self.clone(MemoryFormat::Contiguous); + } + + // Finally, do a 1D C2R transform + // TODO: could transform up to 2 other dims in the same cuFFT operation + auto output = at::empty(out_sizes, self.options().dtype(c10::toValueType(self.scalar_type()))); + _exec_fft(output, temp, out_sizes, dim.back(), /*forward=*/false); + _fft_apply_normalization(output, normalization, out_sizes, dim); + return output; +} + +// n-dimensional complex to complex FFT/IFFT +Tensor _fft_c2c_cufft(const Tensor& self, IntArrayRef dim, int64_t normalization, bool forward) { + TORCH_CHECK(self.is_complex()); + if (dim.empty()) { + return self.clone(); + } + + auto out_sizes = self.sizes(); + auto output = at::empty(out_sizes, self.options()); + + // Perform any number of C2C 
transforms + DimVector sorted_dims(dim.begin(), dim.end()); + auto self_strides = self.strides(); + auto working_tensor = self; + while (true) { + // Sort dimensions every time as _exec_fft re-strides the output + auto strides = working_tensor.strides(); + std::sort(sorted_dims.begin(), sorted_dims.end(), + [&](int64_t a, int64_t b) { return strides[a] > strides[b]; }); + + const auto max_dims = std::min(static_cast(cufft_max_ndim), sorted_dims.size()); + auto first_dims = IntArrayRef(sorted_dims).slice(sorted_dims.size() - max_dims, max_dims); + + _exec_fft(output, working_tensor, out_sizes, first_dims, forward); + sorted_dims.resize(sorted_dims.size() - max_dims); + + if (sorted_dims.empty()) { + break; + } + + if (working_tensor.is_same(self)) { + working_tensor = std::move(output); + output = at::empty(out_sizes, self.options()); + } else { + std::swap(output, working_tensor); + } + } + + _fft_apply_normalization(output, normalization, out_sizes, dim); + return output; +} + // cuFFT // Currently not utilizing multi GPUs so this can be potentially sped up. Tensor _fft_cufft(const Tensor& self, int64_t signal_ndim, diff --git a/aten/src/ATen/native/mkl/SpectralOps.cpp b/aten/src/ATen/native/mkl/SpectralOps.cpp index 612be3b00f80..9584fafcea4b 100644 --- a/aten/src/ATen/native/mkl/SpectralOps.cpp +++ b/aten/src/ATen/native/mkl/SpectralOps.cpp @@ -17,6 +17,18 @@ Tensor _fft_mkl(const Tensor& input, int64_t signal_ndim, AT_ERROR("fft: ATen not compiled with MKL support"); } +Tensor _fft_c2r_mkl(const Tensor& self, IntArrayRef dim, int64_t normalization, int64_t last_dim_size) { + AT_ERROR("fft: ATen not compiled with MKL support"); +} + +Tensor _fft_r2c_mkl(const Tensor& self, IntArrayRef dim, int64_t normalization, bool onesided) { + AT_ERROR("fft: ATen not compiled with MKL support"); +} + +Tensor _fft_c2c_mkl(const Tensor& self, IntArrayRef dim, int64_t normalization, bool forward) { + AT_ERROR("fft: ATen not compiled with MKL support"); +} + }} #else // AT_MKL_ENABLED @@ -61,7 +73,7 @@ void _fft_fill_with_conjugate_symmetry_slice( // We explicitly loop over one row, then use this lambda to iterate over // n-dimensions. This advances iter_index by one row, while updating in_ptr // and out_ptr to point to the new row of data. 
- auto advance_index = [&] { + auto advance_index = [&] () __ubsan_ignore_undefined__ { for (size_t i = 1; i < iter_index.size(); ++i) { if (iter_index[i] + 1 < signal_half_sizes[i]) { ++iter_index[i]; @@ -359,6 +371,144 @@ Tensor _fft_mkl(const Tensor& self, int64_t signal_ndim, return output; } +// Execute a general fft operation (can be c2c, onesided r2c or onesided c2r) +static Tensor& _exec_fft(Tensor& out, const Tensor& self, IntArrayRef out_sizes, + IntArrayRef dim, int64_t normalization, bool forward) { + const auto ndim = self.dim(); + const int64_t signal_ndim = dim.size(); + const auto batch_dims = ndim - signal_ndim; + + // Permute dimensions so batch dimensions come first, and in stride order + // This maximizes data locality when collapsing to a single batch dimension + DimVector dim_permute(ndim); + std::iota(dim_permute.begin(), dim_permute.end(), int64_t{0}); + + c10::SmallVector is_transformed_dim(ndim); + for (const auto& d : dim) { + is_transformed_dim[d] = true; + } + auto batch_end = std::partition(dim_permute.begin(), dim_permute.end(), + [&](int64_t d) {return !is_transformed_dim[d]; }); + auto self_strides = self.strides(); + std::sort(dim_permute.begin(), batch_end, + [&](int64_t a, int64_t b) { return self_strides[a] > self_strides[b]; }); + std::copy(dim.cbegin(), dim.cend(), batch_end); + auto input = self.permute(dim_permute); + + // Collapse batch dimensions into a single dimension + DimVector batched_sizes(signal_ndim + 1); + batched_sizes[0] = -1; + std::copy(input.sizes().cbegin() + batch_dims, input.sizes().cend(), batched_sizes.begin() + 1); + input = input.reshape(batched_sizes); + + const auto batch_size = input.sizes()[0]; + DimVector signal_size(signal_ndim + 1); + signal_size[0] = batch_size; + for (int64_t i = 0; i < signal_ndim; ++i) { + auto in_size = input.sizes()[i + 1]; + auto out_size = out_sizes[dim[i]]; + signal_size[i + 1] = std::max(in_size, out_size); + TORCH_INTERNAL_ASSERT(in_size == signal_size[i + 1] || + in_size == (signal_size[i + 1] / 2) + 1); + TORCH_INTERNAL_ASSERT(out_size == signal_size[i + 1] || + out_size == (signal_size[i + 1] / 2) + 1); + } + + batched_sizes[0] = batch_size; + DimVector batched_out_sizes(batched_sizes.begin(), batched_sizes.end()); + for (size_t i = 0; i < dim.size(); ++i) { + batched_out_sizes[i + 1] = out_sizes[dim[i]]; + } + + const auto value_type = c10::toValueType(input.scalar_type()); + out.resize_(batched_out_sizes, MemoryFormat::Contiguous); + + auto descriptor = _plan_mkl_fft( + input.strides(), out.strides(), signal_size, input.is_complex(), + out.is_complex(), normalization, forward, value_type); + + // run the FFT + if (forward) { + MKL_DFTI_CHECK(DftiComputeForward(descriptor.get(), input.data_ptr(), out.data_ptr())); + } else { + MKL_DFTI_CHECK(DftiComputeBackward(descriptor.get(), input.data_ptr(), out.data_ptr())); + } + + // Inplace reshaping to original batch shape and inverting the dimension permutation + DimVector out_strides(ndim); + int64_t batch_numel = 1; + for (int64_t i = batch_dims - 1; i >= 0; --i) { + out_strides[dim_permute[i]] = batch_numel * out.strides()[0]; + batch_numel *= out_sizes[dim_permute[i]]; + } + for (int64_t i = batch_dims; i < ndim; ++i) { + out_strides[dim_permute[i]] = out.strides()[1 + (i - batch_dims)]; + } + return out.as_strided_(out_sizes, out_strides, out.storage_offset()); +} + +// Sort transform dimensions by input layout, for best performance +// exclude_last is for onesided transforms where the last dimension cannot be reordered +static DimVector 
_sort_dims(const Tensor& self, IntArrayRef dim, bool exclude_last=false) { + DimVector sorted_dims(dim.begin(), dim.end()); + auto self_strides = self.strides(); + std::sort(sorted_dims.begin(), sorted_dims.end() - exclude_last, + [&](int64_t a, int64_t b) { return self_strides[a] > self_strides[b]; }); + return sorted_dims; +} + +// n-dimensional complex to real IFFT +Tensor _fft_c2r_mkl(const Tensor& self, IntArrayRef dim, int64_t normalization, int64_t last_dim_size) { + TORCH_CHECK(self.is_complex()); + // NOTE: Multi-dimensional C2R transforms don't agree with numpy in cases + // where the input isn't strictly Hermitian-symmetric. Instead, we use a + // multi-dim C2C transform followed by a 1D C2R transform. + // + // Such inputs are technically out of contract though, so maybe a disagreement + // is okay. + auto input = self; + if (dim.size() > 1) { + auto c2c_dims = dim.slice(0, dim.size() - 1); + input = _fft_c2c_mkl(self, c2c_dims, normalization, /*foward=*/false); + dim = dim.slice(dim.size() - 1); + } + + auto in_sizes = input.sizes(); + DimVector out_sizes(in_sizes.begin(), in_sizes.end()); + out_sizes[dim.back()] = last_dim_size; + auto out = at::empty(out_sizes, self.options().dtype(c10::toValueType(self.scalar_type()))); + return _exec_fft(out, input, out_sizes, dim, normalization, /*forward=*/false); +} + +// n-dimensional real to complex FFT +Tensor _fft_r2c_mkl(const Tensor& self, IntArrayRef dim, int64_t normalization, bool onesided) { + TORCH_CHECK(self.is_floating_point()); + auto input_sizes = self.sizes(); + DimVector out_sizes(input_sizes.begin(), input_sizes.end()); + auto last_dim = dim.back(); + auto last_dim_halfsize = (input_sizes[last_dim]) / 2 + 1; + if (onesided) { + out_sizes[last_dim] = last_dim_halfsize; + } + + auto sorted_dims = _sort_dims(self, dim, /*exclude_last=*/true); + auto out = at::empty(out_sizes, self.options().dtype(c10::toComplexType(self.scalar_type()))); + _exec_fft(out, self, out_sizes, sorted_dims, normalization, /*forward=*/true); + + if (!onesided) { + at::native::_fft_fill_with_conjugate_symmetry_(out, dim); + } + return out; +} + +// n-dimensional complex to complex FFT/IFFT +Tensor _fft_c2c_mkl(const Tensor& self, IntArrayRef dim, int64_t normalization, bool forward) { + TORCH_CHECK(self.is_complex()); + const auto sorted_dims = _sort_dims(self, dim); + auto out = at::empty(self.sizes(), self.options()); + return _exec_fft(out, self, self.sizes(), sorted_dims, normalization, forward); +} + }} // namespace at::native #endif diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index e7ac20599214..a8bae757ab42 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -2058,6 +2058,30 @@ CPU: _fft_mkl CUDA: _fft_cufft +# Real to complex forward FFT +- func: _fft_r2c(Tensor self, int[] dim, int normalization, bool onesided) -> Tensor + use_c10_dispatcher: full + variants: function + dispatch: + CPU: _fft_r2c_mkl + CUDA: _fft_r2c_cufft + +# Complex to real inverse FFT +- func: _fft_c2r(Tensor self, int[] dim, int normalization, int last_dim_size) -> Tensor + use_c10_dispatcher: full + variants: function + dispatch: + CPU: _fft_c2r_mkl + CUDA: _fft_c2r_cufft + +# Standard complex to complex FFT (forward or backward) +- func: _fft_c2c(Tensor self, int[] dim, int normalization, bool forward) -> Tensor + use_c10_dispatcher: full + variants: function + dispatch: + CPU: _fft_c2c_mkl + CUDA: _fft_c2c_cufft + - func: _cufft_get_plan_cache_size(int 
device_index) -> int use_c10_dispatcher: full diff --git a/test/test_spectral_ops.py b/test/test_spectral_ops.py index 9310a6448ef5..04365a5828d4 100644 --- a/test/test_spectral_ops.py +++ b/test/test_spectral_ops.py @@ -228,7 +228,7 @@ def test_fft_invalid_dtypes(self, device): with self.assertRaisesRegex(RuntimeError, "Expected a real input tensor"): torch.fft.rfft(t) - with self.assertRaisesRegex(RuntimeError, "Expected a real input tensor"): + with self.assertRaisesRegex(RuntimeError, "rfftn expects a real-valued input tensor"): torch.fft.rfftn(t) with self.assertRaisesRegex(RuntimeError, "Expected a real input tensor"): @@ -479,7 +479,7 @@ def test_fftn_invalid(self, device): func(a, s=(10, 10, 10, 10)) c = torch.complex(a, a) - with self.assertRaisesRegex(RuntimeError, "Expected a real input"): + with self.assertRaisesRegex(RuntimeError, "rfftn expects a real-valued input"): torch.fft.rfftn(c) # 2d-fft tests @@ -591,7 +591,7 @@ def test_fft2_invalid(self, device): func(a, dim=(2, 3)) c = torch.complex(a, a) - with self.assertRaisesRegex(RuntimeError, "Expected a real input"): + with self.assertRaisesRegex(RuntimeError, "rfftn expects a real-valued input"): torch.fft.rfft2(c) # Helper functions diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index dadfe6018939..b88596c2b609 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -1832,6 +1832,15 @@ - name: _fft_with_size.norm_modes(Tensor self, int signal_ndim, bool complex_input, bool complex_output, bool inverse, int[] checked_signal_sizes, int normalization, bool onesided, int[] output_sizes) -> Tensor self: fft_backward(self, grad, signal_ndim, complex_input, complex_output, inverse, checked_signal_sizes, normalization, onesided, output_sizes) +- name: _fft_r2c(Tensor self, int[] dim, int normalization, bool onesided) -> Tensor + self: fft_r2c_backward(grad, dim, normalization, onesided, self.size(dim.back())) + +- name: _fft_c2r(Tensor self, int[] dim, int normalization, int last_dim_size) -> Tensor + self: fft_c2r_backward(grad, dim, normalization) + +- name: _fft_c2c(Tensor self, int[] dim, int normalization, bool forward) -> Tensor + self: _fft_c2c(grad, dim, normalization, !forward) + - name: unbind.int(Tensor(a) self, int dim=0) -> Tensor(a)[] self: unbind_backward(grads, dim) diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index 4948ac3af0dc..5431980a51c3 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -79,6 +79,7 @@ 'tan', 'pow', 'rsqrt', 'tanh', 'tanh_backward', 'asinh', 'acosh', 'take', 'fill_', 'exp', 'nonzero', 'mean', 'inverse', 'solve', 'linalg_cholesky', 'addcmul', 'addcdiv', 'matrix_exp', 'linalg_eigh', 'cholesky_solve', + '_fft_c2c', '_fft_r2c', } # Some operators invalidate the grad_accumulator. Let's reset it. diff --git a/torch/csrc/autograd/FunctionsManual.cpp b/torch/csrc/autograd/FunctionsManual.cpp index 91bad195f47e..e46c08cfecc7 100644 --- a/torch/csrc/autograd/FunctionsManual.cpp +++ b/torch/csrc/autograd/FunctionsManual.cpp @@ -2412,6 +2412,72 @@ Tensor fft_backward(const Tensor& self, const Tensor& grad, int64_t signal_ndim, return gI; } +Tensor fft_c2r_backward(const Tensor& grad, IntArrayRef dim, int64_t normalization) { + // Forward is C2R (onesided) + // Think of onesided C2R irfft as + // 1. fill the other half by conjugate symmetry + // 2. inverse C2C ifft + // 3. discard the complex dimension + // So backward is + // 1. 
R2C rfft (essentially add dummy complex dimension, and dft) + // 2. accumulate gradient by conjugate symmetry + // since rfft results follow conjugate symmetry, we only need to + // double some entries from onesided rfft results, i.e., the ones with + // their reflected indices also landing out of the onesided range. So + // consider the index of last dim: + // i. idx = 0. + // Reflected to (N - 0) % N = 0. Not doubled. + // ii 0 < idx < floor(N/2) (last). + // N > N - idx > ceil(N/2) + // Reflected to () + // iii. idx = floor(N/2) = N/2 (last) when N even. + // Reflected to (N - N/2) % N = N/2. Not doubled. + // iv. idx = floor(N/2) = (N-1)/2 (last) when N odd. + // Reflected to (N - (N-1)/2) % N = (N+1)/2. Doubled. + // Therefore, needs to double + // idx = 1, 2, ..., N/2 - 1 when N even + // idx = 1, 2, ..., (N-1)/2 when N odd + // that is + // idx = 1, 2, ..., N - (floor(N/2) + 1) + // = 1, 2, ..., N - onesided_length + auto gI = at::_fft_r2c(grad, dim, normalization, /*onesided=*/true); + + auto double_length = grad.size(dim.back()) - gI.size(dim.back()); + if (double_length > 0) { // also covers case when signal size is zero + gI.narrow(dim.back(), 1, double_length).mul_(2); + } + return gI; +} + +Tensor fft_r2c_backward(const Tensor& grad, IntArrayRef dim, int64_t normalization, + bool onesided, int64_t last_dim_size) { + if (!onesided) { + return at::real(at::_fft_c2c(grad, dim, normalization, /*forward=*/false)); + } + + // Forward is R2C (onesided) + // Think of onesided R2C rfft as + // 1. view as complex numbers (fill complex dim with zeros) + // 2. C2C fft + // 3. discard half of results + // So backward is + // 1. fill the other half with zeros (with `zero_grad_shape` below) + // (C2C ifft only take twosided inputs so we need to fill here) + // 2. inverse C2C ifft + // 3. discard the complex dim + auto half_sizes = grad.sizes(); + at::DimVector new_grad_shape(half_sizes.begin(), half_sizes.end()); + const auto last_dim = at::maybe_wrap_dim(dim.back(), half_sizes.size()); + new_grad_shape[last_dim] = last_dim_size; + + const auto zero_length = last_dim_size - grad.size(dim.back()); + auto complex_full_grad = zero_length > 0 ? 
at::zeros(new_grad_shape, grad.options()) : grad; + if (zero_length > 0) { + complex_full_grad.slice(last_dim, 0, half_sizes[last_dim]).copy_(grad); + } + return at::real(at::_fft_c2c(complex_full_grad, dim, normalization, /*forward=*/false)); +} + // Helper for batchnorm_double_backward Tensor sum_exclude_dim1(const Tensor& to_sum, bool keepdim=true) { auto r = to_sum.sum(0, keepdim); diff --git a/torch/csrc/autograd/FunctionsManual.h b/torch/csrc/autograd/FunctionsManual.h index 0fba31bdd894..73d0789dd3e6 100644 --- a/torch/csrc/autograd/FunctionsManual.h +++ b/torch/csrc/autograd/FunctionsManual.h @@ -160,6 +160,9 @@ Tensor fft_backward(const Tensor& self, const Tensor& grad, int64_t signal_ndim, bool inverse, IntArrayRef checked_signal_sizes, int64_t normalization, bool onesided, IntArrayRef output_sizes); +Tensor fft_r2c_backward(const Tensor& grad, IntArrayRef dim, int64_t normalization, + bool onesided, int64_t last_dim_size); +Tensor fft_c2r_backward(const Tensor& grad, IntArrayRef dim, int64_t normalization); Tensor constant_pad_nd_backward(const Tensor& grad, IntArrayRef pad); std::tuple cholesky_solve_backward( const Tensor& grad_x, const Tensor& self, From 7a4a2df2254b78d8c8d42b9f81b5b261a617466e Mon Sep 17 00:00:00 2001 From: Supriya Rao Date: Wed, 9 Dec 2020 13:55:56 -0800 Subject: [PATCH 083/250] Revert D25003113: make validate debug-only in Device copy ctr Test Plan: revert-hammer Differential Revision: D25003113 (https://github.com/pytorch/pytorch/commit/4b26cafb8fa7eef7cbfdc0327f85f30e0a38e8ec) Original commit changeset: e17e6495db65 fbshipit-source-id: fd636c954a97bd80892464feb974a11b9dd96899 --- c10/core/Device.h | 8 ++------ test/test_torch.py | 4 ++++ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/c10/core/Device.h b/c10/core/Device.h index 04cd711c37b2..7827119bb0ac 100644 --- a/c10/core/Device.h +++ b/c10/core/Device.h @@ -93,13 +93,9 @@ struct C10_API Device final { DeviceType type_; DeviceIndex index_ = -1; void validate() { - // Removing these checks in release builds noticeably improves - // performance in micro-benchmarks. - // This is safe to do, because backends that use the DeviceIndex - // have a later check when we actually try to switch to that device. 
- TORCH_INTERNAL_ASSERT_DEBUG_ONLY(index_ == -1 || index_ >= 0, + TORCH_CHECK(index_ == -1 || index_ >= 0, "Device index must be -1 or non-negative, got ", (int)index_); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!is_cpu() || index_ <= 0, + TORCH_CHECK(!is_cpu() || index_ <= 0, "CPU device index must be -1 or zero, got ", (int)index_); } }; diff --git a/test/test_torch.py b/test/test_torch.py index 4b4e28583d02..ad88128617c9 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -341,6 +341,9 @@ def test_device(self): self.assertEqual(90, cuda90.index) self.assertRaises(RuntimeError, lambda: torch.device('cpu:-1')) + self.assertRaises(RuntimeError, lambda: torch.device('cpu:1')) + self.assertRaises(RuntimeError, lambda: torch.device('cpu', -1)) + self.assertRaises(RuntimeError, lambda: torch.device('cpu', 1)) self.assertRaises(RuntimeError, lambda: torch.device('cuda:-1')) self.assertRaises(RuntimeError, lambda: torch.device('cuda:2 ')) self.assertRaises(RuntimeError, lambda: torch.device('cuda: 2')) @@ -353,6 +356,7 @@ def test_device(self): self.assertRaises(RuntimeError, lambda: torch.device('cuda:2 cuda:3')) self.assertRaises(RuntimeError, lambda: torch.device('cuda:2+cuda:3')) self.assertRaises(RuntimeError, lambda: torch.device('cuda:2cuda:3')) + self.assertRaises(RuntimeError, lambda: torch.device('cuda', -1)) self.assertRaises(RuntimeError, lambda: torch.device(-1)) self.assertRaises(RuntimeError, lambda: torch.device('other')) From bfa95f90a039a125c55305c7ac49e76620fc9983 Mon Sep 17 00:00:00 2001 From: Supriya Rao Date: Wed, 9 Dec 2020 14:05:11 -0800 Subject: [PATCH 084/250] Revert D25325039: Check CUDA kernel launches (/fbcode/caffe2/) Test Plan: revert-hammer Differential Revision: D25325039 (https://github.com/pytorch/pytorch/commit/f5e9ffbc279626ad8cabda49eed91dbe6399d3c4) Original commit changeset: 2043d6e63c7d fbshipit-source-id: 5377dd2aa7c6f58c8641c956b7642c7c559bbc40 --- modules/detectron/group_spatial_softmax_op.cu | 3 --- modules/detectron/ps_roi_pool_op.cu | 2 -- modules/detectron/roi_pool_f_op.cu | 2 -- modules/detectron/select_smooth_l1_loss_op.cu | 2 -- modules/detectron/sigmoid_cross_entropy_loss_op.cu | 5 ----- modules/detectron/sigmoid_focal_loss_op.cu | 2 -- modules/detectron/smooth_l1_loss_op.cu | 3 --- modules/detectron/softmax_focal_loss_op.cu | 5 ----- modules/detectron/spatial_narrow_as_op.cu | 2 -- modules/detectron/upsample_nearest_op.cu | 3 --- test/cpp_extensions/cuda_extension.cu | 1 - test/cpp_extensions/cuda_extension_kernel.cu | 1 - test/cpp_extensions/cuda_extension_kernel2.cu | 1 - torch/lib/c10d/test/CUDATest.cu | 1 - 14 files changed, 33 deletions(-) diff --git a/modules/detectron/group_spatial_softmax_op.cu b/modules/detectron/group_spatial_softmax_op.cu index a37a3fba55a7..92e89ae5acc2 100644 --- a/modules/detectron/group_spatial_softmax_op.cu +++ b/modules/detectron/group_spatial_softmax_op.cu @@ -112,7 +112,6 @@ bool GroupSpatialSoftmaxOp::RunOnDevice() { GroupSpatialSoftmaxKernel<<>>( N, A, W, H, Xdata, Pdata, num_classes_); - C10_CUDA_KERNEL_LAUNCH_CHECK(); return true; } @@ -159,13 +158,11 @@ bool GroupSpatialSoftmaxGradientOp::RunOnDevice() { SumProbsKernel<<>>( N, A, W, H, Ydata, dYdata, sum_probs_data, num_classes_); - C10_CUDA_KERNEL_LAUNCH_CHECK(); // Step 2: dX[i] = dX[i] - s SubSumKernel<<>>( N, A, W, H, sum_probs_.data(), dXdata, num_classes_); - C10_CUDA_KERNEL_LAUNCH_CHECK(); // Step 3: dX[i] = Y[i] * dX[i] math::Mul(Y.size(), dXdata, Ydata, dXdata, &context_); diff --git a/modules/detectron/ps_roi_pool_op.cu 
b/modules/detectron/ps_roi_pool_op.cu index 68e4ec377d62..1ba418be5c99 100644 --- a/modules/detectron/ps_roi_pool_op.cu +++ b/modules/detectron/ps_roi_pool_op.cu @@ -253,7 +253,6 @@ bool PSRoIPoolOp::RunOnDevice() { output_size, X.data(), spatial_scale_, X.dim32(1), X.dim32(2), X.dim32(3), pooled_height_, pooled_width_, R.data(), output_dim_, group_size_, Y->mutable_data(), A->mutable_data()); - C10_CUDA_KERNEL_LAUNCH_CHECK(); return true; } @@ -277,7 +276,6 @@ bool PSRoIPoolGradientOp::RunOnDevice() { dY.size(), dY.data(), A.data(), R.dim32(0), spatial_scale_, X.dim32(1), X.dim32(2), X.dim32(3), pooled_height_, pooled_width_, output_dim_, dX->mutable_data(), R.data()); - C10_CUDA_KERNEL_LAUNCH_CHECK(); return true; } diff --git a/modules/detectron/roi_pool_f_op.cu b/modules/detectron/roi_pool_f_op.cu index b261911b95a1..62948f7eacbe 100644 --- a/modules/detectron/roi_pool_f_op.cu +++ b/modules/detectron/roi_pool_f_op.cu @@ -149,7 +149,6 @@ bool RoIPoolFOp::RunOnDevice() { output_size, X.data(), spatial_scale_, X.dim32(1), X.dim32(2), X.dim32(3), pooled_height_, pooled_width_, R.data(), Y->mutable_data(), A->mutable_data()); - C10_CUDA_KERNEL_LAUNCH_CHECK(); return true; } @@ -174,7 +173,6 @@ bool RoIPoolFGradientOp::RunOnDevice() { dY.size(), dY.data(), A.data(), R.dim32(0), spatial_scale_, X.dim32(1), X.dim32(2), X.dim32(3), pooled_height_, pooled_width_, dX->mutable_data(), R.data()); - C10_CUDA_KERNEL_LAUNCH_CHECK(); } return true; } diff --git a/modules/detectron/select_smooth_l1_loss_op.cu b/modules/detectron/select_smooth_l1_loss_op.cu index ce68fcff634d..9065bfc7afbe 100644 --- a/modules/detectron/select_smooth_l1_loss_op.cu +++ b/modules/detectron/select_smooth_l1_loss_op.cu @@ -129,7 +129,6 @@ bool SelectSmoothL1LossOp::RunOnDevice() { M, Y_hat.data(), Y.data(), L.data(), buff_.mutable_data(), S.data(), beta_); - C10_CUDA_KERNEL_LAUNCH_CHECK(); // Sum of all losses // al := sum_i l_i @@ -176,7 +175,6 @@ bool SelectSmoothL1LossGradientOp::RunOnDevice() { D, H, W, M, Y_hat.data(), Y.data(), L.data(), d_Y_hat->mutable_data(), d_avg_loss.data(), scale_, S.data(), beta_); - C10_CUDA_KERNEL_LAUNCH_CHECK(); return true; } diff --git a/modules/detectron/sigmoid_cross_entropy_loss_op.cu b/modules/detectron/sigmoid_cross_entropy_loss_op.cu index bb86560fcb01..d69a7b41dc33 100644 --- a/modules/detectron/sigmoid_cross_entropy_loss_op.cu +++ b/modules/detectron/sigmoid_cross_entropy_loss_op.cu @@ -93,8 +93,6 @@ bool SigmoidCrossEntropyLossOp::RunOnDevice() { T.data(), losses_.mutable_data(), counts_.mutable_data()); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - float* avg_loss_data = avg_loss->mutable_data(); math::Sum( losses_.size(), losses_.data(), avg_loss_data, &context_); @@ -108,7 +106,6 @@ bool SigmoidCrossEntropyLossOp::RunOnDevice() { CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>(normalizer_.size(), normalizer_data, 1e-5); - C10_CUDA_KERNEL_LAUNCH_CHECK(); math::Div( 1, avg_loss_data, normalizer_data, avg_loss_data, &context_); } @@ -138,7 +135,6 @@ bool SigmoidCrossEntropyLossGradientOp::RunOnDevice() { T.data(), dX->mutable_data(), counts_.mutable_data()); - C10_CUDA_KERNEL_LAUNCH_CHECK(); if (normalize_) { float* normalizer_data = normalizer_.mutable_data(); math::Sum( @@ -149,7 +145,6 @@ bool SigmoidCrossEntropyLossGradientOp::RunOnDevice() { CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>(normalizer_.size(), normalizer_data, 1e-5); - C10_CUDA_KERNEL_LAUNCH_CHECK(); math::Div( 1, d_avg_loss.data(), diff --git a/modules/detectron/sigmoid_focal_loss_op.cu 
b/modules/detectron/sigmoid_focal_loss_op.cu index e6f2dea21b5d..5b130c8dfc1f 100644 --- a/modules/detectron/sigmoid_focal_loss_op.cu +++ b/modules/detectron/sigmoid_focal_loss_op.cu @@ -134,7 +134,6 @@ bool SigmoidFocalLossOp::RunOnDevice() { N, D, H, W, X.data(), T.data(), wp.data(), gamma_, alpha_, num_classes_, losses_.mutable_data()); - C10_CUDA_KERNEL_LAUNCH_CHECK(); math::Sum( losses_.size(), losses_.data(), avg_loss_data, &context_); @@ -166,7 +165,6 @@ bool SigmoidFocalLossGradientOp::RunOnDevice() { N, D, H, W, X.data(), T.data(), dX->mutable_data(), wp.data(), gamma_, alpha_, num_classes_, d_avg_loss.data()); - C10_CUDA_KERNEL_LAUNCH_CHECK(); math::Scale( dX->size(), scale_, diff --git a/modules/detectron/smooth_l1_loss_op.cu b/modules/detectron/smooth_l1_loss_op.cu index ea835a4bc2b9..1a3e8b78b53f 100644 --- a/modules/detectron/smooth_l1_loss_op.cu +++ b/modules/detectron/smooth_l1_loss_op.cu @@ -102,7 +102,6 @@ bool SmoothL1LossOp::RunOnDevice() { context_.cuda_stream()>>>( buff_.size(), buff_.data(), buff_.mutable_data(), beta_); - C10_CUDA_KERNEL_LAUNCH_CHECK(); // Element-wise weighted smooth l1 loss (can be used to specify a per-element // loss weight) @@ -165,8 +164,6 @@ bool SmoothL1LossGradientOp::RunOnDevice() { context_.cuda_stream()>>>( buff_.size(), buff_.data(), d_Y_hat->mutable_data(), d_avg_loss.data(), scale_ / N, beta_); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - // Element-wise scale by alpha_in and alpha_out math::Mul( d_Y_hat->size(), d_Y_hat->data(), alpha_in.data(), diff --git a/modules/detectron/softmax_focal_loss_op.cu b/modules/detectron/softmax_focal_loss_op.cu index b7f8d2423ebc..93635269f176 100644 --- a/modules/detectron/softmax_focal_loss_op.cu +++ b/modules/detectron/softmax_focal_loss_op.cu @@ -176,7 +176,6 @@ bool SoftmaxFocalLossOp::RunOnDevice() { <<>>( N, A, H, W, Xdata, P->mutable_data(), num_classes_); - C10_CUDA_KERNEL_LAUNCH_CHECK(); // Compute loss for each x,y location const int* Tdata = T.data(); @@ -185,7 +184,6 @@ bool SoftmaxFocalLossOp::RunOnDevice() { 0, context_.cuda_stream()>>>( N, A, H, W, P->data(), Tdata, losses_.mutable_data(), Wdata, gamma_, alpha_, num_classes_); - C10_CUDA_KERNEL_LAUNCH_CHECK(); // sum the losses float* avg_loss_data = avg_loss->mutable_data(); @@ -229,8 +227,6 @@ bool SoftmaxFocalLossGradientOp::RunOnDevice() { 0, context_.cuda_stream()>>>( N, A, H, W, Pdata, Tdata, buff_.mutable_data(), Wdata, gamma_, alpha_, num_classes_); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - // Compute the gradient with the weights const float* Bdata = buff_.data(); SoftmaxFocalLossGradientKernel @@ -238,7 +234,6 @@ bool SoftmaxFocalLossGradientOp::RunOnDevice() { 0, context_.cuda_stream()>>>( N, D, H, W, Pdata, Tdata, Bdata, d_avg_loss.data(), dX->mutable_data(), num_classes_); - C10_CUDA_KERNEL_LAUNCH_CHECK(); math::Scale( dX->size(), scale_, diff --git a/modules/detectron/spatial_narrow_as_op.cu b/modules/detectron/spatial_narrow_as_op.cu index ff8b5632e80a..97ddc492eb07 100644 --- a/modules/detectron/spatial_narrow_as_op.cu +++ b/modules/detectron/spatial_narrow_as_op.cu @@ -115,7 +115,6 @@ bool SpatialNarrowAsOp::DoRunWithType() { out_width, A.template data(), C->template mutable_data()); - C10_CUDA_KERNEL_LAUNCH_CHECK(); return true; } @@ -153,7 +152,6 @@ bool SpatialNarrowAsGradientOp::DoRunWithType() { out_width, dC.template data(), dA->template mutable_data()); - C10_CUDA_KERNEL_LAUNCH_CHECK(); return true; } diff --git a/modules/detectron/upsample_nearest_op.cu b/modules/detectron/upsample_nearest_op.cu index 
0ea32e348c0b..38af4254f922 100644 --- a/modules/detectron/upsample_nearest_op.cu +++ b/modules/detectron/upsample_nearest_op.cu @@ -164,8 +164,6 @@ bool UpsampleNearestOp::RunOnDevice() { upscale<<>>( input_data, output_data, no_elements, scale_, d1, d2, d3); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - return true; } @@ -211,7 +209,6 @@ bool UpsampleNearestGradientOp::RunOnDevice() { math::Set(no_elements, 0.f, gradInput_data, &context_); downscale<<>>( gradInput_data, gradOutput_data, no_elements, scale_, d1, d2, d3); - C10_CUDA_KERNEL_LAUNCH_CHECK(); return true; } diff --git a/test/cpp_extensions/cuda_extension.cu b/test/cpp_extensions/cuda_extension.cu index fb3bbd178c07..29511af8a0ed 100644 --- a/test/cpp_extensions/cuda_extension.cu +++ b/test/cpp_extensions/cuda_extension.cu @@ -26,5 +26,4 @@ void sigmoid_add_cuda(const float* x, const float* y, float* output, int size) { const int threads = 1024; const int blocks = (size + threads - 1) / threads; sigmoid_add_kernel<<>>(x, y, output, size); - C10_CUDA_KERNEL_LAUNCH_CHECK(); } diff --git a/test/cpp_extensions/cuda_extension_kernel.cu b/test/cpp_extensions/cuda_extension_kernel.cu index c8dce124f9df..660219989863 100644 --- a/test/cpp_extensions/cuda_extension_kernel.cu +++ b/test/cpp_extensions/cuda_extension_kernel.cu @@ -20,5 +20,4 @@ void sigmoid_add_cuda(const float* x, const float* y, float* output, int size) { const int threads = 1024; const int blocks = (size + threads - 1) / threads; sigmoid_add_kernel<<>>(x, y, output, size); - C10_CUDA_KERNEL_LAUNCH_CHECK(); } diff --git a/test/cpp_extensions/cuda_extension_kernel2.cu b/test/cpp_extensions/cuda_extension_kernel2.cu index 4cdc25cc0110..817bdf64ac8e 100644 --- a/test/cpp_extensions/cuda_extension_kernel2.cu +++ b/test/cpp_extensions/cuda_extension_kernel2.cu @@ -20,5 +20,4 @@ void tanh_add_cuda(const float* x, const float* y, float* output, int size) { const int threads = 1024; const int blocks = (size + threads - 1) / threads; tanh_add_kernel<<>>(x, y, output, size); - C10_CUDA_KERNEL_LAUNCH_CHECK(); } diff --git a/torch/lib/c10d/test/CUDATest.cu b/torch/lib/c10d/test/CUDATest.cu index 88f87492206c..c47b29ea536d 100644 --- a/torch/lib/c10d/test/CUDATest.cu +++ b/torch/lib/c10d/test/CUDATest.cu @@ -17,7 +17,6 @@ __global__ void waitClocks(const uint64_t count) { void cudaSleep(at::cuda::CUDAStream& stream, uint64_t clocks) { waitClocks<<<1, 1, 0, stream.stream()>>>(clocks); - C10_CUDA_KERNEL_LAUNCH_CHECK(); } int cudaNumDevices() { From 67d12c9582fad58820f142e84de1fe2899051d6f Mon Sep 17 00:00:00 2001 From: Oleg Khabinov Date: Wed, 9 Dec 2020 14:06:38 -0800 Subject: [PATCH 085/250] Pass shape hints for AOT case (#48989) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48989 1. Pass shape hints at model export time. 2. A bit of logging to show if passed shape hints are loaded by OnnxifiOp. From jfix71: > for AOT we skip onnxifi on the predictor side. We do onnxifi at model export time Test Plan: Temporarily added extra logging to verify that we use passed shape hints for AOT scenario. Here are the test results: 1. AOT model generation https://fburl.com/paste/1dtxrdsr shows that pybind_state.cc is called. 2. Running predictor service https://fburl.com/paste/d4qcizya with more logging in onnxifi_op.cc D25344546 shows that we use provided shape hints instead of doing shape inference every time. 
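For readers skimming the diff below: the hints the transformer consumes are keyed first by batch size and then by output blob name. The sketch below uses plain std containers and made-up names/dims purely to show that data shape; the real code uses Caffe2's ShapeInfo types.

```cpp
// Illustrative only -- not the transformer code. The real map uses Caffe2's
// ShapeInfo types; plain std containers are used here just to show the shape
// of the data behind the "output_shapes_bs_<bs>" arguments added below.
#include <cstdint>
#include <string>
#include <unordered_map>
#include <vector>

// batch size -> (output blob name -> dims at that batch size)
using OutputShapeHints =
    std::unordered_map<int, std::unordered_map<std::string, std::vector<int64_t>>>;

OutputShapeHints example_hints() {
  OutputShapeHints hints;
  hints[1]["prediction"] = {1, 16};  // hypothetical output at batch size 1
  hints[2]["prediction"] = {2, 16};  // same output at batch size 2
  return hints;
}
```

If any batch size in [1, max_batch_size) or any op output is missing from this map, the transformer sets `use_passed_output_shapes` to 0 and OnnxifiOp falls back to shape inference.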
Reviewed By: jfix71 Differential Revision: D25344546 fbshipit-source-id: 799ca4baea23ed4d81d89d00cb3a52a1cbf69a44 --- caffe2/opt/onnxifi_op.h | 7 +++- caffe2/opt/onnxifi_transformer.cc | 53 +++++++++++++++++++++++-------- caffe2/opt/onnxifi_transformer.h | 6 ++++ 3 files changed, 51 insertions(+), 15 deletions(-) diff --git a/caffe2/opt/onnxifi_op.h b/caffe2/opt/onnxifi_op.h index eeb93c51e6f8..ce732f7604bc 100644 --- a/caffe2/opt/onnxifi_op.h +++ b/caffe2/opt/onnxifi_op.h @@ -128,6 +128,10 @@ class OnnxifiOp final : public Operator { adjust_quantized_offset_ = 0; } + LOG(INFO) << "use_onnx_=" << use_onnx_ + << ", use_glow_aot_=" << use_glow_aot_ + << ", use_passed_output_shapes_=" << use_passed_output_shapes_; + if (use_passed_output_shapes_) { // Populate output_shapes_per_bs_ for (int bs = 1; bs < max_batch_size_; ++bs) { @@ -145,6 +149,7 @@ class OnnxifiOp final : public Operator { for (output_idx = 0; output_idx < output_names_.size(); ++output_idx) { auto it = name_to_shape.find(output_names_[output_idx]); + CAFFE_ENFORCE(it != name_to_shape.end()); output_shapes_per_bs_[bs].push_back({}); auto &output_shapes = output_shapes_per_bs_[bs].back(); std::copy(it->second.dims.cbegin(), it->second.dims.cend(), std::back_inserter(output_shapes)); @@ -486,7 +491,7 @@ class OnnxifiOp final : public Operator { std::unordered_map input_shape_info_; // Whether we should use passed output shape hints or do shape inference - bool use_passed_output_shapes_{false}; + const bool use_passed_output_shapes_{false}; // Whether we need to resize outputs or not bool adjust_output_batch_{false}; diff --git a/caffe2/opt/onnxifi_transformer.cc b/caffe2/opt/onnxifi_transformer.cc index 9ccc662d99a9..8089314c3100 100644 --- a/caffe2/opt/onnxifi_transformer.cc +++ b/caffe2/opt/onnxifi_transformer.cc @@ -506,6 +506,31 @@ OnnxifiTransformer::~OnnxifiTransformer() { } } +bool OnnxifiTransformer::canPassOutputShapeHintsPerBs( + const OperatorDef& op, + const std::unordered_map& shape_hints_per_bs) const { + if (shape_hints_per_bs.empty()) { + return false; + } + + for (int bs = 1; bs < opts_.bound_shape_spec.max_batch_size; ++bs) { + auto shape_hints_search = shape_hints_per_bs.find(bs); + if (shape_hints_search == shape_hints_per_bs.end()) { + return false; + } + const auto& shape_hints = shape_hints_search->second; + + for (int output_idx = 0; output_idx < op.output_size(); ++output_idx) { + auto shape_hint_search = shape_hints.find(op.output(output_idx)); + if (shape_hint_search == shape_hints.end()) { + return false; + } + } + } + + return true; +} + OperatorDef OnnxifiTransformer::buildOnnxifiOp( const std::string& onnx_model_str, const std::unordered_set& initialization_list, @@ -583,31 +608,31 @@ OperatorDef OnnxifiTransformer::buildOnnxifiOp( } } - // Add output size hints for per batch size - AddArgument("use_passed_output_shapes", shape_hints_per_bs.empty() ? 
0 : 1, &op); - if (!shape_hints_per_bs.empty()) { - for (int bs = 1; bs < opts_.bound_shape_spec.max_batch_size; ++bs) { - auto it = shape_hints_per_bs.find(bs); - CAFFE_ENFORCE(it != shape_hints_per_bs.end()); - const auto& shape_hints_current_bs = it->second; + // Add output size hints per batch size + if (canPassOutputShapeHintsPerBs(op, shape_hints_per_bs)) { + VLOG(2) << "Passing in output shape hints for batch sizes in [1, " << opts_.bound_shape_spec.max_batch_size << ")"; + AddArgument("use_passed_output_shapes", 1, &op); + for (int bs = 1; bs < opts_.bound_shape_spec.max_batch_size; ++bs) { auto* output_shape_arg = op.add_arg(); output_shape_arg->set_name("output_shapes_bs_" + caffe2::to_string(bs)); auto* output_qshape_arg = op.add_arg(); output_qshape_arg->set_name("output_qshapes_bs_" + caffe2::to_string(bs)); + const auto& shape_hints = shape_hints_per_bs.find(bs)->second; + for (int output_idx = 0; output_idx < op.output_size(); ++output_idx) { const auto& output_name = op.output(output_idx); - auto it_output = shape_hints_current_bs.find(output_name); - if (it_output != shape_hints_current_bs.end()) { - if (!it_output->second.is_quantized) { - output_shape_arg->mutable_tensors()->Add()->CopyFrom(wrapShapeInfoIntoTensorProto(output_name, it_output->second)); - } else { - output_shape_arg->mutable_qtensors()->Add()->CopyFrom(wrapShapeInfoIntoQTensorProto(output_name, it_output->second)); - } + const auto& shape_hint = shape_hints.find(output_name)->second; + if (!shape_hint.is_quantized) { + output_shape_arg->mutable_tensors()->Add()->CopyFrom(wrapShapeInfoIntoTensorProto(output_name, shape_hint)); + } else { + output_shape_arg->mutable_qtensors()->Add()->CopyFrom(wrapShapeInfoIntoQTensorProto(output_name, shape_hint)); } } } + } else { + AddArgument("use_passed_output_shapes", 0, &op); } // Tell Onnxifi op that the model is in onnx or c2 proto format diff --git a/caffe2/opt/onnxifi_transformer.h b/caffe2/opt/onnxifi_transformer.h index 5836486bfd31..d86f112dd485 100644 --- a/caffe2/opt/onnxifi_transformer.h +++ b/caffe2/opt/onnxifi_transformer.h @@ -82,6 +82,12 @@ class CAFFE2_API OnnxifiTransformer final : public BackendTransformerBase { const ShapeInfoMap& shape_hints_max_bs, const std::unordered_map &shape_hints_per_bs); + // Check that output shape hints are present to ensure we can pass them to + // OnnxifiOp + bool canPassOutputShapeHintsPerBs( + const OperatorDef& op, + const std::unordered_map& shape_hints_per_bs) const; + // We already have all the ops and external inputs and outputs! OperatorDef buildOnnxifiOp( const std::string& onnx_model_str, From c7cc8a48c05d651973aa96d08754104e0c6542da Mon Sep 17 00:00:00 2001 From: Brian Hirsh Date: Wed, 9 Dec 2020 14:40:53 -0800 Subject: [PATCH 086/250] migrating some straggler pytorch ops in fbcode to the new registration API (#48954) Summary: I already migrated the majority of fbcode ops to the new registration API, but there are a few stragglers (mostly new files that were created in the last two weeks). The goal is mostly to stamp out as much of the legacy registration API usage as possible, so that people only see the new API when they look around the code for examples of how to register their own ops. 
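For context, a minimal sketch of the new-style registration this change moves to; `myops` and `identity` are made-up names used only for illustration, and the real registrations are in the diff below.

```cpp
// Minimal sketch of the new registration API (made-up namespace/op; the real
// changes are in the diff below).
#include <ATen/ATen.h>
#include <torch/library.h>

namespace {
at::Tensor identity(const at::Tensor& x) {
  return x;  // trivial body, only here so there is something to register
}
} // namespace

// Legacy style being removed:
//   static auto registry =
//       torch::RegisterOperators().op("myops::identity", &identity);
//
// New style:
TORCH_LIBRARY_FRAGMENT(myops, m) {
  m.def("identity", &identity);
}
```

TORCH_LIBRARY_FRAGMENT (as opposed to TORCH_LIBRARY) lets multiple translation units contribute operators to the same namespace.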
Pull Request resolved: https://github.com/pytorch/pytorch/pull/48954 ghstack-source-id: 118140663 Test Plan: Ran buck targets for each file that I migrated Reviewed By: ezyang Differential Revision: D25380422 fbshipit-source-id: 268139a1d7b9ef14c07befdf9e5a31f15b96a48c --- benchmarks/operator_benchmark/pt_extension/extension.cpp | 7 ++++--- torch/csrc/autograd/record_function_ops.cpp | 8 ++++---- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/benchmarks/operator_benchmark/pt_extension/extension.cpp b/benchmarks/operator_benchmark/pt_extension/extension.cpp index 2e665604c6ed..2dbdfdd8b3e6 100644 --- a/benchmarks/operator_benchmark/pt_extension/extension.cpp +++ b/benchmarks/operator_benchmark/pt_extension/extension.cpp @@ -17,9 +17,10 @@ List consume_list(List a) { // That caused an issue for our op benchmark which needs to run an op // in a loop and report the execution time. This diff resolves that issue by // registering this consume op with correct alias information which is DEFAULT. -auto reg = torch::RegisterOperators() - .op("operator_benchmark::_consume", &consume) - .op("operator_benchmark::_consume.list", &consume_list); +TORCH_LIBRARY_FRAGMENT(operator_benchmark, m) { + m.def("_consume", &consume); + m.def("_consume.list", &consume_list); +} PYBIND11_MODULE(cpp_extension, m) { m.def("_consume", &consume, "consume"); diff --git a/torch/csrc/autograd/record_function_ops.cpp b/torch/csrc/autograd/record_function_ops.cpp index 633d0f177295..da8cd22fbbc9 100644 --- a/torch/csrc/autograd/record_function_ops.cpp +++ b/torch/csrc/autograd/record_function_ops.cpp @@ -65,10 +65,10 @@ c10::intrusive_ptr _call_end_callbacks_on_fut( } // Internal only, do not use directly, use Python's record_function() -static auto registry = - RegisterOperators() - .op("profiler::_record_function_enter", &record_function_enter) - .op("profiler::_record_function_exit", &record_function_exit); +TORCH_LIBRARY_FRAGMENT(profiler, m) { + m.def("_record_function_enter", &record_function_enter); + m.def("_record_function_exit", &record_function_exit); +} // Needed to register JIT operator in operator registry below c10::AliasAnalysisKind aliasAnalysisFromSchema() { From dfa38087046d60d6408198075921f87ffff1c651 Mon Sep 17 00:00:00 2001 From: Martin Yuan Date: Wed, 9 Dec 2020 14:44:44 -0800 Subject: [PATCH 087/250] [PyTorch] Remove aten::native::empty usage in TensorIndexing (#49074) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49074 Try to resolve part of the github issue of https://github.com/pytorch/pytorch/issues/48684 . ```aten::native::empty()``` is referenced in TensorIndexing.h. However, the definition of ```aten::native::empty()``` is nothing but checks and eventually calling ```at::empty()```. In this diff, ```at::empty()``` is directly used to avoid the reference to native symbols. 
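A small sketch of the distinction the change relies on (illustrative, not the diff itself): `at::empty()` is the dispatcher entry point, whereas `at::native::empty()` names a backend implementation, so calling the former from the header avoids the direct reference to native symbols.

```cpp
// Illustrative sketch: the dispatched call below does not reference any
// at::native:: symbol from the including header.
#include <ATen/ATen.h>

at::Tensor empty_long_index_like(const at::Tensor& self) {
  return at::empty({0}, self.options().dtype(at::kLong));
}
```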
ghstack-source-id: 118165999 Test Plan: CI Reviewed By: dhruvbird Differential Revision: D25417854 fbshipit-source-id: 7e4af411ae63642c8470e78cf8553400dc9a16c9 --- aten/src/ATen/TensorIndexing.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aten/src/ATen/TensorIndexing.h b/aten/src/ATen/TensorIndexing.h index a2bdc24ff51c..4b6f81bc4c21 100644 --- a/aten/src/ATen/TensorIndexing.h +++ b/aten/src/ATen/TensorIndexing.h @@ -228,7 +228,7 @@ static inline Tensor boolToIndexingTensorCPUOrCUDA(const Tensor& self, bool valu if (value) { return at::native::zeros({1}, {}, self.options().dtype(kLong)); } else { - return at::native::empty({0}, {}, self.options().dtype(kLong)); + return at::empty({0}, {}, self.options().dtype(kLong)); } } From 5765bbd78cd4db5deda4b6b3f4ecf06303aeca69 Mon Sep 17 00:00:00 2001 From: Peter Bell Date: Wed, 9 Dec 2020 15:09:23 -0800 Subject: [PATCH 088/250] Review memory overlap checks for advanced indexing operations (#48651) Summary: Fixes https://github.com/pytorch/pytorch/issues/45964 Indexing operators e.g. `scatter`/`gather` use tensor restriding so the `TensorIterator` built in overlap checking needs to be disabled. This adds the missing overlap checks for these operators. In addition, some indexing operators don't work will with `MemOverlapStatus::FULL` which is explicitly allowed by `assert_no_partial_overlap`. So, I've introduced `assert_no_overlap` that will raise an error on partial _or_ full overlap. Pull Request resolved: https://github.com/pytorch/pytorch/pull/48651 Reviewed By: zhangguanheng66 Differential Revision: D25401047 Pulled By: ngimel fbshipit-source-id: 53abb41ac63c4283f3f1b10a0abb037169f20b89 --- aten/src/ATen/MemoryOverlap.cpp | 12 +++ aten/src/ATen/MemoryOverlap.h | 3 + .../ATen/native/TensorAdvancedIndexing.cpp | 57 +++++++++--- aten/src/ATen/native/cuda/IndexKernel.cu | 5 ++ aten/src/ATen/native/cuda/Indexing.cu | 6 ++ aten/src/TH/generic/THTensorEvenMoreMath.cpp | 8 ++ aten/src/THC/generic/THCTensorIndex.cu | 8 ++ test/test_torch.py | 88 ++++++++++++++++++- 8 files changed, 170 insertions(+), 17 deletions(-) diff --git a/aten/src/ATen/MemoryOverlap.cpp b/aten/src/ATen/MemoryOverlap.cpp index 264271d35229..a9128e0e94ed 100644 --- a/aten/src/ATen/MemoryOverlap.cpp +++ b/aten/src/ATen/MemoryOverlap.cpp @@ -75,4 +75,16 @@ void assert_no_partial_overlap(TensorImpl* a, TensorImpl* b) { "Please clone() the tensor before performing the operation."); } +void assert_no_overlap(const Tensor& a, const Tensor& b) { + assert_no_overlap(a.unsafeGetTensorImpl(), b.unsafeGetTensorImpl()); +} + +void assert_no_overlap(TensorImpl* a, TensorImpl* b) { + const auto lap = get_overlap_status(a, b); + TORCH_CHECK(lap != MemOverlapStatus::PARTIAL && lap != MemOverlapStatus::FULL, + "unsupported operation: some elements of the input tensor and " + "the written-to tensor refer to a single memory location. 
" + "Please clone() the tensor before performing the operation."); +} + } diff --git a/aten/src/ATen/MemoryOverlap.h b/aten/src/ATen/MemoryOverlap.h index 67f63a64668c..5cd4eab2db9c 100644 --- a/aten/src/ATen/MemoryOverlap.h +++ b/aten/src/ATen/MemoryOverlap.h @@ -27,4 +27,7 @@ CAFFE2_API MemOverlapStatus get_overlap_status(TensorImpl* a, TensorImpl* b); CAFFE2_API void assert_no_partial_overlap(const Tensor& a, const Tensor& b); void assert_no_partial_overlap(TensorImpl* a, TensorImpl* b); +CAFFE2_API void assert_no_overlap(const Tensor& a, const Tensor& b); +CAFFE2_API void assert_no_overlap(TensorImpl* a, TensorImpl* b); + } diff --git a/aten/src/ATen/native/TensorAdvancedIndexing.cpp b/aten/src/ATen/native/TensorAdvancedIndexing.cpp index ddc3ca8c2b34..8b5fdd44d789 100644 --- a/aten/src/ATen/native/TensorAdvancedIndexing.cpp +++ b/aten/src/ATen/native/TensorAdvancedIndexing.cpp @@ -293,6 +293,10 @@ Tensor index(const Tensor & self, TensorList indices) { Tensor& index_out(Tensor& result, const Tensor & self, TensorList indices) { TORCH_CHECK_INDEX(indices.size() <= (size_t)self.dim(), "too many indices for tensor of dimension ", self.dim(), " (got ", indices.size(), ")"); at::assert_no_internal_overlap(result); + at::assert_no_overlap(result, self); + for (auto& index: indices) { + at::assert_no_overlap(result, index); + } auto info = make_info(self, indices); auto iter = make_index_out_iterator(info, result); @@ -305,21 +309,24 @@ Tensor index_put(const Tensor & self, TensorList indices, const Tensor & value, } Tensor & _index_put_impl_(Tensor & self, TensorList indices, const Tensor & value, const bool accumulate, const bool unsafe) { - TORCH_CHECK_INDEX(indices.size() <= (size_t)self.dim(), "too many indices for tensor of dimension ", self.dim(), " (got ", indices.size(), ")"); - if (accumulate && self.device().type() == kCUDA) { - TORCH_CHECK(value.device() == self.device(), "expected device ", self.device(), " but got device ", - value.device(), " for value tensor"); - index_put_accum_stub(self.device().type(), self, indices, value, unsafe); - return self; - } - + TORCH_CHECK_INDEX(indices.size() <= (size_t)self.dim(), "too many indices for tensor of dimension ", self.dim(), " (got ", indices.size(), ")"); if (at::has_internal_overlap(self) == MemOverlap::YES) { TORCH_WARN( "Use of index_put_ on expanded tensors is deprecated. " "Please clone() the tensor before performing this operation. " "This also applies to advanced indexing e.g. 
tensor[indices] = tensor"); } - at::assert_no_partial_overlap(self, value); + at::assert_no_overlap(self, value); + for (auto& index: indices) { + at::assert_no_overlap(self, index); + } + + if (accumulate && self.device().type() == kCUDA) { + TORCH_CHECK(value.device() == self.device(), "expected device ", self.device(), " but got device ", + value.device(), " for value tensor"); + index_put_accum_stub(self.device().type(), self, indices, value, unsafe); + return self; + } auto info = make_info(self, indices); auto iter = make_index_put_iterator(info, value); @@ -339,6 +346,9 @@ Tensor & index_copy_(Tensor & self, int64_t dim, const Tensor & index, const Ten dim = maybe_wrap_dim(dim, self.dim()); TORCH_CHECK_INDEX(index.dim() < 2, "index_copy_(): Index should have dimension 1 or 0 (got ", index.dim(), ")"); + at::assert_no_internal_overlap(self); + at::assert_no_overlap(self, index); + at::assert_no_overlap(self, source); int64_t numIndices = index.numel(); if (source.dim() == 0 && numIndices != 1) { @@ -394,8 +404,8 @@ Tensor& index_add_cpu_(Tensor & self, int64_t dim, const Tensor & index, const T "index_add_(): Number of indices should be equal to self.size(dim)"); at::assert_no_internal_overlap(self); - at::assert_no_partial_overlap(self, index); - at::assert_no_partial_overlap(self, source); + at::assert_no_overlap(self, index); + at::assert_no_overlap(self, source); auto index_contig = index.contiguous(); @@ -472,6 +482,8 @@ Tensor & index_select_out_cpu_(Tensor & result, const Tensor & self, int64_t dim TORCH_CHECK(dim == 0 || dim < self.dim(), "index_select(): Indexing dim ", dim, " is out of bounds of tensor"); at::assert_no_internal_overlap(result); + at::assert_no_overlap(result, self); + at::assert_no_overlap(result, index); auto result_size = self.sizes().vec(); if (self.dim() > 0) { @@ -608,6 +620,9 @@ Tensor index_fill(const Tensor & self, int64_t dim, const Tensor & index, const Tensor & gather_out_cpu_cuda(Tensor & result, const Tensor & self, int64_t dim, const Tensor & index, bool sparse_grad) { result.resize_(index.sizes()); + at::assert_no_internal_overlap(result); + at::assert_no_overlap(result, self); + at::assert_no_partial_overlap(result, index); gather_stub(result.device().type(), result, self, dim, index); return result; } @@ -627,6 +642,9 @@ Tensor gather_backward(const Tensor& grad, const Tensor& self, int64_t dim, cons Tensor & scatter_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & source) { TORCH_CHECK_INDEX(index.scalar_type() == ScalarType::Long, "scatter_(): Expected dtype int64 for index."); + at::assert_no_internal_overlap(self); + at::assert_no_overlap(self, source); + at::assert_no_overlap(self, index); scatter_stub(self.device().type(), self, dim, index, source); return self; } @@ -634,6 +652,8 @@ Tensor & scatter_(Tensor & self, int64_t dim, const Tensor & index, const Tensor Tensor & scatter_fill_(Tensor & self, int64_t dim, const Tensor & index, Scalar source) { TORCH_CHECK_INDEX(index.scalar_type() == ScalarType::Long, "scatter_(): Expected dtype int64 for index."); + at::assert_no_internal_overlap(self); + at::assert_no_overlap(self, index); scatter_fill_stub(self.device().type(), self, dim, index, source); return self; } @@ -657,6 +677,8 @@ Tensor& scatter_scalar_reduce_(Tensor& self, const int64_t dim, const Tensor& in "scatter_(): Expected dtype int64 for index."); TORCH_CHECK(at::isFloatingType(self.scalar_type()) || at::isComplexType(self.scalar_type()), "scatter_(): Expected floating or complex type for self."); + 
at::assert_no_internal_overlap(self); + at::assert_no_overlap(self, index); SCATTER_GATHER_OP op = get_operator_enum(reduce); scatter_scalar_reduce_stub(self.device().type(), self, dim, index, value, op); return self; @@ -668,6 +690,9 @@ Tensor & scatter_reduce_(Tensor & self, const int64_t dim, const Tensor & index, "scatter_(): Expected dtype int64 for index"); TORCH_CHECK(at::isFloatingType(self.scalar_type()) || at::isComplexType(self.scalar_type()), "scatter_(): Expected floating or complex type for self."); + at::assert_no_internal_overlap(self); + at::assert_no_overlap(self, index); + at::assert_no_overlap(self, src); SCATTER_GATHER_OP op = get_operator_enum(reduce); scatter_reduce_stub(self.device().type(), self, dim, index, src, op); return self; @@ -684,6 +709,9 @@ Tensor scatter(const Tensor & self, int64_t dim, const Tensor & index, Scalar so Tensor & scatter_add_(Tensor & self, int64_t dim, const Tensor & index, const Tensor & src) { TORCH_CHECK_INDEX(index.scalar_type() == ScalarType::Long, "scatter_(): Expected dtype int64 for index."); + at::assert_no_internal_overlap(self); + at::assert_no_overlap(self, index); + at::assert_no_overlap(self, src); scatter_add_stub(self.device().type(), self, dim, index, src); return self; } @@ -780,8 +808,8 @@ static Tensor & masked_select_out_impl_cpu(Tensor & result, const Tensor & self, "masked_select(): self and result must have the same scalar type"); at::assert_no_internal_overlap(result); - at::assert_no_partial_overlap(result, self); - at::assert_no_partial_overlap(result, mask); + at::assert_no_overlap(result, self); + at::assert_no_overlap(result, mask); if (mask.dtype() == at::ScalarType::Byte) { TORCH_WARN("masked_select received a mask with dtype torch.uint8, this behavior is now deprecated," \ @@ -895,6 +923,9 @@ void take_out_cpu_template( auto index_continuous = index.contiguous(); bool is_contiguous = input.is_contiguous(); auto input_size = input.numel(); + at::assert_no_internal_overlap(output); + at::assert_no_partial_overlap(output, index); + at::assert_no_overlap(output, input); AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Bool, at::ScalarType::Half, input.scalar_type(), "take_cpu", [&] { auto output_data = output_contiguous.data_ptr(); diff --git a/aten/src/ATen/native/cuda/IndexKernel.cu b/aten/src/ATen/native/cuda/IndexKernel.cu index 7d7a59b32406..cb4aa644fee2 100644 --- a/aten/src/ATen/native/cuda/IndexKernel.cu +++ b/aten/src/ATen/native/cuda/IndexKernel.cu @@ -9,6 +9,7 @@ #include #include #include +#include #include namespace at { namespace native { @@ -229,6 +230,10 @@ void take_out_cuda_template(Tensor& output, const Tensor& input, const Tensor& i TORCH_CHECK(!(input.numel() == 0 && index.numel() != 0), "tried to take from an empty tensor"); + at::assert_no_internal_overlap(output); + at::assert_no_partial_overlap(output, index); + at::assert_no_overlap(output, input); + output.resize_(index.sizes()); AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Bool, at::ScalarType::Half, input.scalar_type(), "take_cuda", [&] { diff --git a/aten/src/ATen/native/cuda/Indexing.cu b/aten/src/ATen/native/cuda/Indexing.cu index 4e88ee34a9b4..2b81460c1a4b 100644 --- a/aten/src/ATen/native/cuda/Indexing.cu +++ b/aten/src/ATen/native/cuda/Indexing.cu @@ -446,6 +446,10 @@ Tensor& index_add_cuda_(Tensor & self, int64_t dim, const Tensor & index, const TORCH_CHECK(index.numel() == (source.dim() == 0 ? 
1 : source.size(dim)), "index_add_(): Number of indices should be equal to self.size(dim)"); + at::assert_no_internal_overlap(self); + at::assert_no_overlap(self, index); + at::assert_no_overlap(self, source); + // Scalars are treated as 1-d tensor Tensor self_ = (self.dim() == 0) ? self.view(1) : self; Tensor source_ = (source.dim() == 0) ? source.view(1) : source; @@ -828,6 +832,8 @@ Tensor& index_select_out_cuda(Tensor& out, const Tensor& self, int64_t dim, TORCH_CHECK(at::cuda::check_device({out, self, index}), "Input, output and indices must be on the current device"); at::assert_no_internal_overlap(out); + at::assert_no_overlap(out, self); + at::assert_no_overlap(out, index); dim = at::maybe_wrap_dim(dim, self); TORCH_CHECK(self.dim() <= MAX_TENSORINFO_DIMS, DIM_WARNING); diff --git a/aten/src/TH/generic/THTensorEvenMoreMath.cpp b/aten/src/TH/generic/THTensorEvenMoreMath.cpp index 6a79f3e14c14..9c1eb3cdfe22 100644 --- a/aten/src/TH/generic/THTensorEvenMoreMath.cpp +++ b/aten/src/TH/generic/THTensorEvenMoreMath.cpp @@ -5,6 +5,7 @@ #include #include #include +#include // Finds non-zero elements of a tensor and returns their subscripts void THTensor_(nonzero)(THLongTensor *subscript, THTensor *tensor) @@ -254,6 +255,13 @@ void THTensor_(indexFill)(THTensor *tensor, int dim, THLongTensor *index, scalar numel = THLongTensor_nElement(index); THArgCheck(THTensor_nDimensionLegacyNoScalars(index) == 1, 3, "Index is supposed to be a vector"); THArgCheck(dim < THTensor_nDimensionLegacyNoScalars(tensor), 4,"Indexing dim %d is out of bounds of tensor", dim); + at::assert_no_overlap(tensor, index); + if (at::has_internal_overlap(tensor) == at::MemOverlap::YES) { + TORCH_WARN( + "Use of index_fill_ on expanded tensors is deprecated. " + "Please clone() the tensor before performing this operation. " + "This also applies to advanced indexing e.g. tensor[mask] = scalar"); + } index = THLongTensor_newContiguous(index); index_data = THLongTensor_data(index); diff --git a/aten/src/THC/generic/THCTensorIndex.cu b/aten/src/THC/generic/THCTensorIndex.cu index 07303fa47096..66ad275787f5 100644 --- a/aten/src/THC/generic/THCTensorIndex.cu +++ b/aten/src/THC/generic/THCTensorIndex.cu @@ -3,6 +3,7 @@ #else #include +#include // Check tensor dimensions for index operations, and return the slice size. // src can be nullptr in case of indexFill: in that case it is ignored. @@ -279,6 +280,13 @@ void THCTensor_(indexFill)(THCState *state, THCTensor *dst, int dim, THCudaLongT THArgCheck(dims <= MAX_CUTORCH_DIMS, 2, CUTORCH_DIM_WARNING); dims = THCudaLongTensor_nDimensionLegacyNoScalars(state, indices); THArgCheck(dims <= MAX_CUTORCH_DIMS, 4, CUTORCH_DIM_WARNING); + at::assert_no_overlap(dst, indices); + if (at::has_internal_overlap(dst) == at::MemOverlap::YES) { + TORCH_WARN( + "Use of index_fill_ on expanded tensors is deprecated. " + "Please clone() the tensor before performing this operation. " + "This also applies to advanced indexing e.g. 
tensor[mask] = scalar"); + } // The `src` is partitioned into two parts: // -the size of each slice we are indexing, which is the diff --git a/test/test_torch.py b/test/test_torch.py index ad88128617c9..855005ae03f7 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -4994,6 +4994,7 @@ def test_ternary_op_mem_overlap(self, device, dtype): expected_failure=not has_input_output_mem_overlap_check) @dtypes(torch.double) + @onlyOnCPUAndCUDA def test_copy_mem_overlap(self, device, dtype): self.check_internal_mem_overlap( torch.Tensor.copy_, num_inputs=2, dtype=dtype, device=device) @@ -5002,14 +5003,49 @@ def test_copy_mem_overlap(self, device, dtype): self.unary_check_input_output_mem_overlap( doubles, sz, lambda input, out: out.copy_(input)) + @onlyOnCPUAndCUDA def test_index_add_mem_overlap(self, device): x = torch.rand((1,), device=device).expand((6,)) y = torch.rand((6,), device=device) - ind = torch.tensor([0, 2, 3], device=device) + ind = torch.tensor([2, 1, 0], device=device) value = torch.rand((3,), device=device) with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): x.index_add_(0, ind, value) + with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): + y.index_add_(0, ind, y[:3]) + with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): + ind.index_add_(0, ind, ind.clone()) + with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): + ind.index_add_(0, ind.clone(), ind) + @onlyOnCPUAndCUDA + def test_index_copy_mem_overlap(self, device): + x = torch.rand((1,), device=device).expand((6,)) + y = torch.rand((6,), device=device) + ind = torch.tensor([2, 1, 0], device=device) + value = torch.rand((3,), device=device) + with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): + x.index_copy_(0, ind, value) + with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): + y.index_copy_(0, ind, y[:3]) + with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): + ind.index_copy_(0, ind, ind.clone()) + with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): + ind.index_copy_(0, ind.clone(), ind) + + @onlyOnCPUAndCUDA + def test_index_fill_mem_overlap(self, device): + x = torch.rand((1,), device=device).expand((6,)) + y = torch.rand((6,), device=device) + ind = torch.tensor([2, 1, 0], device=device) + value = torch.rand((3,), device=device) + + with self.assertWarnsRegex(UserWarning, "index_fill_ on expanded tensors"): + x.index_fill_(0, ind, 1.0) + with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): + ind.index_fill_(0, ind, 0) + + @onlyOnCPUAndCUDA def test_shift_mem_overlap(self, device): x = torch.rand(3, device=device) with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): @@ -5017,6 +5053,7 @@ def test_shift_mem_overlap(self, device): with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): x[:-1] >>= x[1:] + @onlyOnCPUAndCUDA def test_bernoulli_mem_overlap(self, device): x = torch.rand((1,), device=device).expand((6,)) @@ -5030,16 +5067,26 @@ def test_bernoulli_mem_overlap(self, device): with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): torch.bernoulli(torch.rand_like(x), out=x) + @onlyOnCPUAndCUDA def test_index_put_mem_overlap(self, device): x = torch.rand((1,), device=device).expand((6,)) y = torch.rand((6,), device=device) - ind = torch.tensor([0, 2, 3], device=device) + ind = torch.tensor([2, 1, 0], device=device) value = torch.rand((3,), device=device) with self.assertWarnsRegex(UserWarning, 'expanded tensors'): x.index_put_((ind,), 
value) with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): y.index_put_((ind,), y[0]) + with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): + ind.index_put_((ind,), ind) + with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): + y.index_put_((ind,), y[:3]) + with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): + ind.index_put_((ind,), ind.clone()) + with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): + ind.index_put_((ind.clone(),), ind) + @onlyOnCPUAndCUDA def test_masked_fill_mem_overlap(self, device): x = torch.rand((1,), device=device).expand((6,)) mask = torch.tensor([True, False, True, True, False, False], device=device) @@ -5050,13 +5097,22 @@ def test_masked_fill_mem_overlap(self, device): with self.assertWarnsRegex(UserWarning, 'expanded tensors'): x.masked_fill_(mask, fill_val) + with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): + mask[1:].masked_fill_(mask[:-1], False) + + @onlyOnCPUAndCUDA def test_masked_select_mem_overlap(self, device): x = torch.rand((1,), device=device).expand((3,)) y = torch.rand((6,), device=device) mask = torch.tensor([True, False, True, True, False, False], device=device) with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): torch.masked_select(y, mask, out=x) + with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): + torch.masked_select(y, mask, out=y) + with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): + torch.masked_select(mask.clone(), mask, out=mask) + @onlyOnCPUAndCUDA def test_masked_scatter_mem_overlap(self, device): x = torch.rand((1,), device=device).expand((6,)) src = torch.rand((3,), device=device) @@ -5065,6 +5121,7 @@ def test_masked_scatter_mem_overlap(self, device): with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): x.masked_scatter_(mask, src) + @onlyOnCPUAndCUDA def test_index_select_mem_overlap(self, device): x = torch.rand((1, 6), device=device).expand((2, 6)) y = torch.rand((3, 6), device=device) @@ -5072,20 +5129,43 @@ def test_index_select_mem_overlap(self, device): with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): torch.index_select(y, 1, ind, out=x) + @onlyOnCPUAndCUDA def test_scatter_mem_overlap(self, device): x = torch.rand((1,), device=device).expand((6,)) src = torch.rand((3,), device=device) - ind = torch.tensor([0, 2, 3], device=device, dtype=torch.int64) + ind = torch.tensor([2, 1, 0], device=device, dtype=torch.int64) with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): x.scatter_(0, ind, src) + with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): + src.scatter_(0, ind, src) + with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): + ind.scatter_(0, ind, ind.clone()) + @onlyOnCPUAndCUDA def test_gather_mem_overlap(self, device): x = torch.rand((1,), device=device).expand((3,)) src = torch.rand((6,), device=device) - ind = torch.tensor([0, 2, 3], device=device, dtype=torch.int64) + ind = torch.tensor([2, 1, 0], device=device, dtype=torch.int64) with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): torch.gather(src, 0, ind, out=x) + with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): + torch.gather(src, 0, ind, out=src) + with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): + torch.gather(ind.clone(), 0, ind[1:], out=ind[:1]) + + @onlyOnCPUAndCUDA + def test_take_mem_overlap(self, device): + x = torch.rand((1,), device=device).expand((3,)) + src = torch.rand((6,), 
device=device) + ind = torch.tensor([2, 1, 0], device=device, dtype=torch.int64) + with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): + torch.take(src, ind, out=x) + with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): + torch.take(src, ind, out=src) + with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): + torch.take(ind.clone(), ind[1:], out=ind[:-1]) + @onlyCUDA def test_multinomial_device_constrain(self, device): From f431e47a2eaffe33ec69f2a6330fa7228c084962 Mon Sep 17 00:00:00 2001 From: peterjc123 Date: Wed, 9 Dec 2020 15:20:50 -0800 Subject: [PATCH 089/250] [collect_env] Acquire windows encoding using OEMCP (#49020) Summary: Fixes https://github.com/pytorch/pytorch/issues/49010. Pull Request resolved: https://github.com/pytorch/pytorch/pull/49020 Reviewed By: zhangguanheng66 Differential Revision: D25398064 Pulled By: janeyx99 fbshipit-source-id: c7fd1e7d1f3dd82613d7f2031439503188b144fd --- torch/utils/collect_env.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/torch/utils/collect_env.py b/torch/utils/collect_env.py index 3fac12f60774..5b91c7a9a0fa 100644 --- a/torch/utils/collect_env.py +++ b/torch/utils/collect_env.py @@ -43,7 +43,10 @@ def run(command): stderr=subprocess.PIPE, shell=True) raw_output, raw_err = p.communicate() rc = p.returncode - enc = locale.getpreferredencoding() + if get_platform() == 'win32': + enc = 'oem' + else: + enc = locale.getpreferredencoding() output = raw_output.decode(enc) err = raw_err.decode(enc) return rc, output.strip(), err.strip() From a6fa3b26825fef139aecf7b931aefc7a4c1822ac Mon Sep 17 00:00:00 2001 From: jiej Date: Wed, 9 Dec 2020 15:27:23 -0800 Subject: [PATCH 090/250] adding profile_ivalue (#47666) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/47666 Test Plan: Imported from OSS Reviewed By: ZolotukhinM Differential Revision: D25255573 Pulled By: Krovatkin fbshipit-source-id: 5d8753e4040a3d96105d28d26728125947c7a638 --- aten/src/ATen/core/interned_strings.h | 1 + torch/csrc/jit/codegen/cuda/interface.cpp | 4 ++-- torch/csrc/jit/codegen/cuda/interface.h | 3 ++- .../jit/codegen/cuda/register_interface.cpp | 3 ++- torch/csrc/jit/ir/alias_analysis.cpp | 1 + torch/csrc/jit/ir/ir.cpp | 11 +++++++++ torch/csrc/jit/ir/ir.h | 24 ++++++++++++++++++- torch/csrc/jit/runtime/interpreter.cpp | 4 ++++ torch/csrc/jit/runtime/operator.cpp | 2 ++ .../jit/runtime/register_prim_ops_fulljit.cpp | 10 +++++++- 10 files changed, 57 insertions(+), 6 deletions(-) diff --git a/aten/src/ATen/core/interned_strings.h b/aten/src/ATen/core/interned_strings.h index 5a0efffea261..7a74ec3b1736 100644 --- a/aten/src/ATen/core/interned_strings.h +++ b/aten/src/ATen/core/interned_strings.h @@ -139,6 +139,7 @@ namespace c10 { _(prim, HasAttr) \ _(prim, profile) \ _(prim, profile_optional) \ + _(prim, profile_ivalue) \ _(prim, AddStatValue) \ _(prim, TimePoint) \ _(prim, CallFunction) \ diff --git a/torch/csrc/jit/codegen/cuda/interface.cpp b/torch/csrc/jit/codegen/cuda/interface.cpp index 8bc3ba3b4c6f..e3efd924efb6 100644 --- a/torch/csrc/jit/codegen/cuda/interface.cpp +++ b/torch/csrc/jit/codegen/cuda/interface.cpp @@ -36,9 +36,9 @@ void runFusionGroup(const Node* fusion_node, Stack& stack) { void fuseGraph(std::shared_ptr& graph) { TORCH_CHECK( - getFuserInterface()->fn_fuse_graph != nullptr, + getFuserInterface()->fn_fuse_graph_ != nullptr, "Running the CUDA fuser requires a CUDA build."); - getFuserInterface()->fn_fuse_graph(graph); + getFuserInterface()->fn_fuse_graph_(graph); } 
bool canFuseNode(const Node* node) { diff --git a/torch/csrc/jit/codegen/cuda/interface.h b/torch/csrc/jit/codegen/cuda/interface.h index 7c156b1dc7c9..00d94a9f12e0 100644 --- a/torch/csrc/jit/codegen/cuda/interface.h +++ b/torch/csrc/jit/codegen/cuda/interface.h @@ -2,6 +2,7 @@ #include #include +#include /* * This file contains APIs for cuda fuser; @@ -22,7 +23,7 @@ TORCH_API std::atomic& getCudaFusionGuardMode(); struct CudaFuserInterface { void (*fn_compile_n_)(Node*) = nullptr; void (*fn_run_n_s_)(const Node*, Stack&) = nullptr; - void (*fn_fuse_graph)(std::shared_ptr&) = nullptr; + void (*fn_fuse_graph_)(std::shared_ptr&) = nullptr; bool (*fn_can_fuse_n_)(const Node*) = nullptr; }; diff --git a/torch/csrc/jit/codegen/cuda/register_interface.cpp b/torch/csrc/jit/codegen/cuda/register_interface.cpp index f340a903131d..284ee05420a1 100644 --- a/torch/csrc/jit/codegen/cuda/register_interface.cpp +++ b/torch/csrc/jit/codegen/cuda/register_interface.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include @@ -20,7 +21,7 @@ class RegisterInterface { auto ptr = getFuserInterface(); ptr->fn_compile_n_ = &compileCudaFusionGroup; ptr->fn_run_n_s_ = &runCudaFusionGroup; - ptr->fn_fuse_graph = &CudaFuseGraph; + ptr->fn_fuse_graph_ = &CudaFuseGraph; ptr->fn_can_fuse_n_ = &isFusableCudaFusionGroup; RegisterProfilingNode(canFuseNode); diff --git a/torch/csrc/jit/ir/alias_analysis.cpp b/torch/csrc/jit/ir/alias_analysis.cpp index b055d29164a5..67dbe193f11c 100644 --- a/torch/csrc/jit/ir/alias_analysis.cpp +++ b/torch/csrc/jit/ir/alias_analysis.cpp @@ -524,6 +524,7 @@ void AliasDb::analyzeImpl(Node* node) { case prim::SetAttr: return analyzeSetAttr(node); case prim::profile_optional: + case prim::profile_ivalue: case prim::profile: makePointerTo(node->output(), node->inputs().at(0)); return; diff --git a/torch/csrc/jit/ir/ir.cpp b/torch/csrc/jit/ir/ir.cpp index ceb0fd1dbfcf..4714a6ae12f6 100644 --- a/torch/csrc/jit/ir/ir.cpp +++ b/torch/csrc/jit/ir/ir.cpp @@ -2053,6 +2053,16 @@ Node* ProfileOptionalOp::allocNewInstance(Graph* g) { return new ProfileOptionalOp(g, {nullptr}); } +void ProfileIValueOp::cloneFrom(Node* other_) { + Node::cloneFrom(other_); + auto other = other_->cast(); + this->callback_ = other->getCallback(); +} + +Node* ProfileIValueOp::allocNewInstance(Graph* g) { + return new ProfileIValueOp(g, {nullptr}); +} + TypePtr NamedValue::type() const { if (value_) { return value_->type(); @@ -2063,6 +2073,7 @@ TypePtr NamedValue::type() const { const Symbol ProfileOp::Kind = ::c10::prim::profile; const Symbol ProfileOptionalOp::Kind = ::c10::prim::profile_optional; +const Symbol ProfileIValueOp::Kind = ::c10::prim::profile_ivalue; OperatorSet::OperatorSet(std::initializer_list sig_literals) { for (const char* sig : sig_literals) { diff --git a/torch/csrc/jit/ir/ir.h b/torch/csrc/jit/ir/ir.h index 64c8031bd601..b20d5611c55c 100644 --- a/torch/csrc/jit/ir/ir.h +++ b/torch/csrc/jit/ir/ir.h @@ -440,7 +440,7 @@ struct TORCH_API Node { // instructions lowered by the interpreter and not run in the optimized graph bool notExecutedOp() const { return kind_ == prim::Constant || kind_ == prim::profile || - kind_ == prim::profile_optional; + kind_ == prim::profile_optional || kind_ == prim::profile_ivalue; } // Graphs @@ -1368,6 +1368,28 @@ struct TORCH_API ProfileOptionalOp : public Node { std::function&)> callback_; }; +struct TORCH_API ProfileIValueOp : public Node { + static const Symbol Kind; + ProfileIValueOp( + Graph* graph, + std::function&)> callback) + : Node(graph, 
::c10::prim::profile_ivalue), callback_(callback) {} + + void cloneFrom(Node* other_) override; + Node* allocNewInstance(Graph* g) override; + + const std::function&)>& getCallback() const { + return callback_; + } + + void setCallback(std::function&)> callback) { + callback_ = callback; + } + + private: + std::function&)> callback_; +}; + // execute a Python function, used for Ops we can't optimize but that we want to // optimize around // diff --git a/torch/csrc/jit/runtime/interpreter.cpp b/torch/csrc/jit/runtime/interpreter.cpp index ef0f2dae9e0e..4802fd2efafa 100644 --- a/torch/csrc/jit/runtime/interpreter.cpp +++ b/torch/csrc/jit/runtime/interpreter.cpp @@ -791,6 +791,9 @@ struct CodeImpl { } else if (node->cast()) { profile_function_table_.push_back( node->cast()->getCallback()); + } else if (node->cast()) { + profile_function_table_.push_back( + node->cast()->getCallback()); } else { TORCH_INTERNAL_ASSERT(false); } @@ -945,6 +948,7 @@ struct CodeImpl { case prim::BailOut: emitBailOut(node); break; + case prim::profile_ivalue: case prim::profile_optional: case prim::profile: emitProfile(node); diff --git a/torch/csrc/jit/runtime/operator.cpp b/torch/csrc/jit/runtime/operator.cpp index 0756d6b58e9f..b9e0f5fbd3fe 100644 --- a/torch/csrc/jit/runtime/operator.cpp +++ b/torch/csrc/jit/runtime/operator.cpp @@ -245,6 +245,7 @@ bool printerHasSpecialCaseFor(Symbol sym) { prim::Store, // used in interpreter only prim::profile, // used in interpreter only prim::profile_optional, // used in interpreter only + prim::profile_ivalue, // used in interpreter only prim::TypeCheck, // used in interpreter only prim::FallbackGraph, // converted into prim::CallFunction @@ -303,6 +304,7 @@ bool aliasAnalysisHasSpecialCaseFor(Symbol symbol) { prim::SetAttr, prim::profile, prim::profile_optional, + prim::profile_ivalue, prim::TypeCheck, prim::Print, prim::CallFunction, diff --git a/torch/csrc/jit/runtime/register_prim_ops_fulljit.cpp b/torch/csrc/jit/runtime/register_prim_ops_fulljit.cpp index b63a2a228508..8361fb3b3385 100644 --- a/torch/csrc/jit/runtime/register_prim_ops_fulljit.cpp +++ b/torch/csrc/jit/runtime/register_prim_ops_fulljit.cpp @@ -29,7 +29,15 @@ RegisterOperators reg( {Operator( prim::profile, [](const Node* node) -> Operation { - auto callback = node->cast()->getCallback(); + return [](Stack* stack) { + AT_ERROR( + "Must be lowered to Interpreter's PROFILE instruction"); // NOLINT + }; + }, + aliasAnalysisSpecialCase()), + Operator( + prim::profile_ivalue, + [](const Node* node) -> Operation { return [](Stack* stack) { AT_ERROR( "Must be lowered to Interpreter's PROFILE instruction"); // NOLINT From 16b8e6ab011c56ccc6a58f7764cd0592f89f35ac Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Wed, 9 Dec 2020 15:37:32 -0800 Subject: [PATCH 091/250] Class-based structured kernels, with migration of add to framework (#48718) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48718 This PR rewrites structured kernels to do the class-based mechanism (instead of defining a meta and impl function, they are methods on a class), and adds enough customizability on the class to support TensorIterator. To show it works, add is made a structured kernel. Don't forget to check https://github.com/pytorch/rfcs/pull/9 for a mostly up-to-date high level description of what's going on here. High level structure of this PR (the order you should review files): * TensorMeta.h - TensorMeta is deleted entirely; instead, meta functions will call `set_output` to allocate/resize their outputs. 
MetaBase gets a new `maybe_get_output` virtual method for retrieving the (possibly non-existent) output tensor in a meta function; this makes it easier to do special promotion behavior, e.g., as in TensorIterator. * TensorIterator.cpp - Two major changes: first, we add TensorIteratorBase::set_output, which is a "light" version of TensorIterator::set_output; it sets up the internal data structures in TensorIterator, but it doesn't do allocation (that is assumed to have been handled by the structured kernels framework). The control flow here is someone will call the subclassed set_output, which will allocate output, and then we will call the parent class (TensorIteratorBase) to populate the fields in TensorIterator so that other TensorIterator phases can keep track of it. Second, we add some tests for meta tensors, and skip parts of TensorIterator which are not necessary when data is not available. * tools/codegen/model.py - One new field in native_functions.yaml, structured_inherits. This lets you override the parent class of a structured meta class; normally it's MetaBase, but you can make it point at TensorIteratorBase instead for TensorIterator based kernels * tools/codegen/gen.py - Now generate all of the classes we promised. It's kind of hairy because this is the first draft. Check the RFC for what the output looks like, and then follow the logic here. There are some complications: I need to continue to generate old style wrapper functions even if an operator is structured, because SparseCPU/SparseCUDA/etc won't actually use structured kernels to start. The most complicated code generation is the instantiation of `set_output`, which by in large replicates the logic in `TensorIterator::set_output`. This will continue to live in codegen for the forseeable future as we would like to specialize this logic per device. * aten/src/ATen/native/UpSampleNearest1d.cpp - The previous structured kernel is ported to the new format. The changes are very modest. * aten/src/ATen/native/BinaryOps.cpp - Add is ported to structured. TODO: * Work out an appropriate entry point for static runtime, since native:: function stubs no longer are generated * Refactor TensorIteratorConfig construction into helper functions, like before * Make Tensor-Scalar addition structured to fix perf regression * Fix `verify_api_visibility.cpp` * Refactor tools/codegen/gen.py for clarity * Figure out why header changes resulted in undefined reference to `at::Tensor::operator[](long) const` Signed-off-by: Edward Z. 
Yang Test Plan: Imported from OSS Reviewed By: bhosmer Differential Revision: D25278031 Pulled By: ezyang fbshipit-source-id: 57c43a6e5df21929b68964d485995fbbae4d1f7b --- aten/src/ATen/TensorIterator.cpp | 151 +++++++-- aten/src/ATen/TensorIterator.h | 8 + aten/src/ATen/TensorMeta.cpp | 16 - aten/src/ATen/TensorMeta.h | 56 +++- aten/src/ATen/native/BinaryOps.cpp | 74 ++--- aten/src/ATen/native/BinaryOps.h | 4 +- aten/src/ATen/native/UpSampleNearest1d.cpp | 18 +- aten/src/ATen/native/cpu/BinaryOpsKernel.cpp | 2 +- .../ATen/native/cuda/BinaryAddSubKernel.cu | 2 +- aten/src/ATen/native/cuda/Loops.cuh | 5 + .../src/ATen/native/cuda/UpSampleNearest1d.cu | 23 +- .../native/metal/mpscnn/tests/MPSCNNTests.mm | 2 +- aten/src/ATen/native/native_functions.yaml | 4 + aten/src/ATen/templates/MetaFunctions.h | 1 + .../ATen/templates/RegisterDispatchKey.cpp | 3 + benchmarks/static_runtime/deep_wide_pt.h | 4 +- tools/codegen/api/meta.py | 6 +- tools/codegen/api/types.py | 4 + tools/codegen/gen.py | 288 ++++++++++++++---- tools/codegen/model.py | 12 + torch/csrc/jit/runtime/static/ops.cpp | 33 +- 21 files changed, 506 insertions(+), 210 deletions(-) diff --git a/aten/src/ATen/TensorIterator.cpp b/aten/src/ATen/TensorIterator.cpp index 0f18d941feff..3f5f9280eb99 100644 --- a/aten/src/ATen/TensorIterator.cpp +++ b/aten/src/ATen/TensorIterator.cpp @@ -402,14 +402,14 @@ void TensorIteratorBase::compute_types(const TensorIteratorConfig& config) { // TODO: reuse temporaries when possible (e.g. for inplace operations) if (common_device == kCPU) { // Casts to outputs by creating temporaries of the correct dtype (if needed) - if (config.cast_common_dtype_to_outputs_ && op.is_output && op.current_dtype != common_dtype_) { + // NB: we skip this on is_meta_, because the temporary allocation here is + // unnecessary if we aren't going to actually do the compute + if (config.cast_common_dtype_to_outputs_ && op.is_output && op.current_dtype != common_dtype_ && !is_meta_) { TORCH_INTERNAL_ASSERT(op.tensor.defined()); + // Marker [Output original_tensor is set] op.original_tensor = op.tensor; // NB: do NOT use set_output here, as the temporary is NOT a true output; // op.tensor is the true output and it was pre-provided for us. - // TODO: When we extend this to work with meta tensors, we'll need to - // skip this temporary allocation in that case (because it's - // unnecessary) // TODO: The logic for cast_outputs will need to be handled by the // structured kernels implementation. 
What probably should happen // is that we pass in the inferred dtype into the out kernel, and @@ -488,10 +488,10 @@ void TensorIteratorBase::allocate_or_resize_outputs() { set_output(i, tensor_shape, tensor_stride, op.options(), names_); } op.current_dtype = op.target_dtype; - } else if (op.tensor.defined() && !names_.empty()) { - // Even if we don't resize, we may still propagate names, esp - // if we were doing an inplace operation - namedinference::propagate_names(op.tensor, names_); + } else if (op.tensor.defined()) { + // Even if we don't resize, we still need to tell set_output about + // the output, so that we properly set guard and propagate names + set_output(i, op.tensor.sizes(), {}, op.tensor.options(), names_); } } } @@ -765,6 +765,8 @@ void TensorIteratorBase::cast_outputs() { for (auto& op : operands_) { if (op.is_output && op.original_tensor.defined() && op.original_tensor.scalar_type() != op.current_dtype) { + // TODO: Now that set_output resizes both the original_tensor + // and tensor, this condition should no longer ever be true if (op.original_tensor.sizes() != op.tensor.sizes()){ op.original_tensor.resize_as_(op.tensor).as_strided_(op.tensor.sizes(), op.tensor.strides()); } @@ -808,18 +810,22 @@ void TensorIteratorBase::select_all_keeping_dim(int start_dim, IntArrayRef indic } } -TensorIterator TensorIterator::binary_op(Tensor& out, const Tensor& a, - const Tensor& b) { - return TensorIteratorConfig() - .set_check_mem_overlap(true) - .add_output(out) - .add_input(a) - .add_input(b) - .allow_cpu_scalars(true) - .promote_inputs_to_common_dtype(true) - .cast_common_dtype_to_outputs(true) - .enforce_safe_casting_to_output(true) - .build(); +void TensorIteratorBase::build_binary_op(const Tensor& out, const Tensor& a, const Tensor& b) { + build(TensorIteratorConfig() + .set_check_mem_overlap(true) + .add_output(out) + .add_input(a) + .add_input(b) + .allow_cpu_scalars(true) + .promote_inputs_to_common_dtype(true) + .cast_common_dtype_to_outputs(true) + .enforce_safe_casting_to_output(true)); +} + +TensorIterator TensorIterator::binary_op(Tensor& out, const Tensor& a, const Tensor& b) { + TensorIterator iter; + iter.build_binary_op(out, a, b); + return iter; } // Helper to construct a binary op that promotes integer inputs to float. @@ -940,6 +946,13 @@ TensorIterator TensorIterator::reduce_op(Tensor& out1, Tensor& out2, const Tenso void TensorIteratorBase::populate_operands(TensorIteratorConfig& config) { for (auto& tensor: config.tensors_) { + // If *any* of the arguments is a meta tensor, the overall + // computation is a meta computation (don't do any work, + // just compute output information). This aligns with + // our multiple dispatch semantics. + if (tensor.is_meta()) { + is_meta_ = true; + } operands_.emplace_back(std::move(tensor)); } num_outputs_ = config.num_outputs_; @@ -988,6 +1001,10 @@ void TensorIteratorBase::compute_mem_overlaps(const TensorIteratorConfig& config if (!config.check_mem_overlap_) { return; } + if (is_meta_) { + // We don't have pointer addresses, cannot check for overlap! 
+ return; + } for (int i = 0; i < num_outputs_; i++) { const auto& output = operands_[i].tensor; if (!output.defined()) continue; @@ -1265,9 +1282,11 @@ void TensorIteratorBase::build(TensorIteratorConfig& config) { // allocate the output tensor if it's not provided allocate_or_resize_outputs(); // coalesce adjacent dimensions when possible - coalesce_dimensions(); + if (!is_meta_) coalesce_dimensions(); } + if (is_meta_) return; + for (auto& op : operands_) { TORCH_INTERNAL_ASSERT(op.tensor.defined()); op.data = op.tensor.data_ptr(); @@ -1281,14 +1300,92 @@ void TensorIteratorBase::build(TensorIteratorConfig& config) { view_offsets_ = DimVector(ndim_offsets, 0); } +// This is the structured kernels implementation of set_output. It is +// NEVER actually called directly; instead, a subclass of TensorIteratorBase +// will override set_output to actually do the operation, and then call +// set_output on the TensorIteratorBase to setup TI's metadata. +// The precondition for this function is that maybe_get_output() now +// unconditionally returns a real Tensor (prior to output setting, +// this function may return an undefined tensor.) +void TensorIteratorBase::set_output(int64_t output_idx, IntArrayRef sizes, IntArrayRef strides, TensorOptions options, DimnameList names) { + auto& op = operands_[output_idx]; + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(output_idx < num_outputs_); + const auto& t = maybe_get_output(output_idx); + TORCH_INTERNAL_ASSERT(t.defined()); + if (!op.tensor.defined()) { + op.tensor = t; + op.current_dtype = op.target_dtype; + } else if (op.will_resize) { + if (op.original_tensor.defined()) { + // OK, so this is pretty weird. To understand how we can end up in + // this situation, first look at Marker [Output original_tensor is set]. + // That is the sole site where original_tensor may be set on an + // output operand. Essentially, when we are given an explicit output + // tensor whose dtype doesn't match the computed common dtype from + // the input operands, we do a switcheroo: we replace the (incorrectly + // typed) output tensor with a correctly typed, *temporary* tensor, + // and remember the original tensor in original_tensor (which will + // then get written back to when we cast_outputs). + // + // Now, what if the given output tensor also happened to be zero + // size (meaning that we will_resize it)? Well, at the call site + // above, we don't necessarily(*) know what the correct shape should + // be, so we give the temporary tensor the same shape as the original. + // At the time of set_output is when we DO know what the correct size + // is, and the subclass's implementation of set_output in structured class + // responsible for resizing original_tensor. But we still have this + // incorrectly sized temporary output which the structured subclass + // knows nothing about, so we are obligated to also resize it here. + // + // This is a slight memory pessimization, because previously + // original_tensor only got resized at the end of the computation, rather + // than at the beginning (as happens here). However, the peak memory + // usage is the same, since you need to materialize both original tensor + // and temporary tensor to do the copy. + // + // (*) Actually, technically, we probably do know what the shape + // should be, since we do shape computation before dtype computation. + // So hypothetically we could figure out what the correct shape is + // at that point in time and directly allocate the temporary at + // the right size. 
+ // + // But a better solution is to delay allocation of temporaries until + // after TensorIterator builder, waiting until we actually want + // to do the computation. That would also remove the necessity + // for the is_meta_ test. + TORCH_INTERNAL_ASSERT(op.original_tensor.is_same(t)); + TORCH_INTERNAL_ASSERT(!op.tensor.is_same(t)); + at::native::resize_output(op.tensor, sizes); + if (!strides.empty()) { + TORCH_INTERNAL_ASSERT(!options.memory_format_opt().has_value()); + op.tensor.as_strided_(sizes, strides); + } else if (options.memory_format_opt().has_value()) { + op.tensor.unsafeGetTensorImpl()->empty_tensor_restride(*options.memory_format_opt()); + } + } + } +} + +// This is the "traditional" implementation of set_output. On TensorIterator +// instances, it is invoked directly from various call sites in this file. No +// funny business. void TensorIterator::set_output(int64_t output_idx, IntArrayRef sizes, IntArrayRef strides, TensorOptions options, DimnameList names) { + // NB: intentionally no superclass call auto& op = operands_[output_idx]; TORCH_INTERNAL_ASSERT_DEBUG_ONLY(output_idx < num_outputs_); if (!op.tensor.defined()) { if (strides.empty()) { - op.tensor = at::empty(sizes, options); + if (is_meta_) { + op.tensor = at::empty_meta(sizes, options); + } else { + op.tensor = at::empty(sizes, options); + } } else { - op.tensor = at::empty_strided(sizes, strides, options); + if (is_meta_) { + TORCH_INTERNAL_ASSERT(0, "meta strided not yet implemented"); + } else { + op.tensor = at::empty_strided(sizes, strides, options); + } } op.current_dtype = op.target_dtype; } else if (op.will_resize) { @@ -1306,6 +1403,14 @@ void TensorIterator::set_output(int64_t output_idx, IntArrayRef sizes, IntArrayR } } +// Not actually used by anything (TensorIterator subclass calls +// its own implementation of set_output which knows exactly where +// all the outputs are), but we have to provide all pure virtual methods +// for MetaBase +const Tensor& TensorIterator::maybe_get_output(int64_t output_idx) { + return operands_[output_idx].tensor; +} + SplitUntil32Bit TensorIteratorBase::with_32bit_indexing() const { return SplitUntil32Bit(*this); } diff --git a/aten/src/ATen/TensorIterator.h b/aten/src/ATen/TensorIterator.h index 11dbda5c7959..ba781d7501e6 100644 --- a/aten/src/ATen/TensorIterator.h +++ b/aten/src/ATen/TensorIterator.h @@ -297,6 +297,10 @@ struct CAFFE2_API TensorIteratorBase : public impl::MetaBase { return true; } + void set_output(int64_t output_idx, IntArrayRef sizes, IntArrayRef strides, TensorOptions options, DimnameList names) override; + + void build_binary_op(const Tensor& out, const Tensor& a, const Tensor& b); + protected: // Mutable reference as it moves tensors out of TensorIteratorConfig void populate_operands(TensorIteratorConfig&); @@ -399,6 +403,9 @@ struct CAFFE2_API TensorIteratorBase : public impl::MetaBase { // From TensorIteratorConfig bool is_reduction_ = false; + + /// Set by populate_operands(), says if we're handling meta tensors + bool is_meta_ = false; }; struct CAFFE2_API TensorIterator final : public TensorIteratorBase { @@ -415,6 +422,7 @@ struct CAFFE2_API TensorIterator final : public TensorIteratorBase { static TensorIterator reduce_op(Tensor& out, const Tensor& a); static TensorIterator reduce_op(Tensor& out1, Tensor& out2, const Tensor& a); + const Tensor& maybe_get_output(int64_t output_idx) override; void set_output(int64_t output_idx, IntArrayRef sizes, IntArrayRef strides, TensorOptions options, DimnameList names) override; }; diff --git 
a/aten/src/ATen/TensorMeta.cpp b/aten/src/ATen/TensorMeta.cpp index 30dca8ccaf2e..6f4d667d5653 100644 --- a/aten/src/ATen/TensorMeta.cpp +++ b/aten/src/ATen/TensorMeta.cpp @@ -1,21 +1,5 @@ #include -#include namespace at { -Tensor meta_tensor_from_meta(const TensorMeta& meta) { - // TODO: eliminate indirection - return at::empty_meta(meta.sizes, meta.options); -} - -Tensor tensor_from_meta(const TensorMeta& meta) { - // TODO: eliminate indirection - return at::empty(meta.sizes, meta.options); -} - -// Analogous to self.new_empty(sizes) -TensorMeta new_meta(const Tensor& self, IntArrayRef sizes) { - return TensorMeta(sizes, self.options()); -} - } // namespace at diff --git a/aten/src/ATen/TensorMeta.h b/aten/src/ATen/TensorMeta.h index baa6e6112b34..134bb373e3b2 100644 --- a/aten/src/ATen/TensorMeta.h +++ b/aten/src/ATen/TensorMeta.h @@ -10,28 +10,54 @@ class Tensor; namespace impl { -struct MetaBase { +// Use this to define the prototype for a meta function. There are two +// versions; one that takes one argument (just the operator name), or FUNC2 +// variant that takes two arguments (operator name and overload name). +// +// Example usage: +// +// TORCH_META_FUNC2(add, Tensor) ( +// const Tensor& self, const Tensor& other +// ) { +// ... compute sizes and options ... +// set_output(sizes, options); +// } +// +#define TORCH_META_FUNC(name) void name::meta +#define TORCH_META_FUNC2(name, overload) void name##_##overload::meta + +// Use this to define the prototype for an implementation. This takes only +// one argument, which is the name of the dispatch key entry you're +// implementing. +// +// Example usage: +// +// TORCH_IMPL_FUNC(add_cpu) ( +// Tensor& result, const Tensor& self, const Tensor& other +// ) { +// ... do the actual implementation ... +// } +// +#define TORCH_IMPL_FUNC(name) void structured_##name::impl + +// Base class for all structured kernel classes. The set_output virtual +// method is varied depending whether or not the operator is +// functional/out/inplace, and could also be specialized for CPU/CUDA/etc +// (although presently it isn't). +// +// A notable subclass of this interface is TensorIteratorBase. 
+struct CAFFE2_API MetaBase { virtual void set_output(int64_t output_idx, IntArrayRef sizes, IntArrayRef strides, TensorOptions options, DimnameList names) = 0; + virtual const Tensor& maybe_get_output(int64_t output_idx) = 0; void set_output(IntArrayRef sizes, TensorOptions options) { set_output(0, sizes, {}, options, {}); } + // Returns a reference to an undefined tensor if there is no presupplied + // output + const Tensor& maybe_get_output() { return maybe_get_output(0); } virtual ~MetaBase() {} }; } // namespace impl -struct TensorMeta { - DimVector sizes; - // TODO: DimVector strides; - TensorOptions options; - - TensorMeta(IntArrayRef _sizes, TensorOptions _options) - : sizes(_sizes), options(_options) {} -}; - -CAFFE2_API Tensor meta_tensor_from_meta(const TensorMeta& meta); -CAFFE2_API Tensor tensor_from_meta(const TensorMeta& meta); -// Analogous to self.new_empty(sizes) -CAFFE2_API TensorMeta new_meta(const Tensor& self, IntArrayRef sizes); - } // namespace at diff --git a/aten/src/ATen/native/BinaryOps.cpp b/aten/src/ATen/native/BinaryOps.cpp index d6cb17418365..0618bbf6260c 100644 --- a/aten/src/ATen/native/BinaryOps.cpp +++ b/aten/src/ATen/native/BinaryOps.cpp @@ -11,6 +11,18 @@ #include namespace at { +namespace meta { + +TORCH_META_FUNC2(add, Tensor) ( + const Tensor& self, const Tensor& other, Scalar alpha +) { + build_binary_op(maybe_get_output(), self, other); + native::alpha_check(dtype(), alpha); +} + +} // namespace meta + + namespace native { DEFINE_DISPATCH(add_stub); @@ -57,24 +69,11 @@ static Tensor wrapped_scalar_tensor(Scalar scalar) { return tensor; } -Tensor& add_out(Tensor& result, const Tensor& self, const Tensor& other, Scalar alpha) { - auto iter = TensorIterator::binary_op(result, self, other); - alpha_check(iter.dtype(), alpha); - add_stub(iter.device_type(), iter, alpha); - TORCH_INTERNAL_ASSERT(result.scalar_type() == iter.output().dtype()); - return result; -} - -Tensor add(const Tensor& self, const Tensor& other, Scalar alpha) { - Tensor result; - auto iter = TensorIterator::binary_op(result, self, other); - alpha_check(iter.dtype(), alpha); - add_stub(iter.device_type(), iter, alpha); - return iter.output(); -} - -Tensor& add_(Tensor& self, const Tensor& other, Scalar alpha) { - return native::add_out(self, self, other, alpha); +TORCH_IMPL_FUNC(add_out) ( + Tensor& result, const Tensor& self, const Tensor& other, Scalar alpha +) { + add_stub(device_type(), *this, alpha); + TORCH_INTERNAL_ASSERT(result.scalar_type() == output().dtype()); } Tensor& add_relu_impl( @@ -449,12 +448,15 @@ static Tensor wrapped_scalar_tensor_and_check_convert(Scalar scalar, Tensor tens return wrapped_scalar_tensor(scalar); } +// TODO: Make this structured to undo the perf regression from native:: removal +// in call here + Tensor add(const Tensor& self, Scalar other, Scalar alpha) { - return native::add(self, wrapped_scalar_tensor(other), alpha); + return at::add(self, wrapped_scalar_tensor(other), alpha); } Tensor& add_(Tensor& self, Scalar other, Scalar alpha) { - return native::add_(self, wrapped_scalar_tensor(other), alpha); + return self.add_(wrapped_scalar_tensor(other), alpha); } Tensor remainder(const Tensor& self, Scalar other) { @@ -1099,37 +1101,5 @@ Tensor& ldexp_(Tensor& self, const Tensor& other) { return at::ldexp_out(self, self, other); } -// TODO: Deduplicate this with the TensorIterator logic. This would -// also fix the TODOs below. 
-Tensor binary_op_meta(const Tensor& self, const Tensor& other) { - // TODO: Doesn't do type promotion correctly - // TODO: Doesn't do strides correctly - int64_t dim = std::max(self.dim(), other.dim()); - std::vector sizes(dim); - for (int64_t i = 0; i < dim; i++) { - int64_t j = -1 - i; - if (i >= self.dim() || self.size(j) == 1) { - sizes[dim + j] = other.size(j); - } else if (i >= other.dim() || self.size(i) == 1) { - sizes[dim + j] = self.size(j); - } else { - TORCH_CHECK( - self.size(j) == other.size(j), - "Expected self.size(", j, ") == other.size(", j, "), but got ", self.size(j), " != ", other.size(j) - ); - sizes[dim + j] = self.size(j); - } - } - return at::empty_meta(sizes, self.options()); -} - -Tensor binary_op_with_scalar_meta(const Tensor& self, const Tensor& other, Scalar x) { - return binary_op_meta(self, other); -} - -TORCH_LIBRARY_IMPL(aten, Meta, m) { - m.impl("add.Tensor", binary_op_with_scalar_meta); -} - } // namespace native } // namespace at diff --git a/aten/src/ATen/native/BinaryOps.h b/aten/src/ATen/native/BinaryOps.h index d76dd9d205e9..1fdb80590b5a 100644 --- a/aten/src/ATen/native/BinaryOps.h +++ b/aten/src/ATen/native/BinaryOps.h @@ -25,13 +25,15 @@ inline void sub_check(const Tensor& self, const Tensor& other) { "If you are trying to invert a mask, use the `~` or `logical_not()` operator instead."); } +using structured_binary_fn_alpha = void(*)(TensorIteratorBase&, Scalar alpha); + using binary_fn_alpha = void(*)(TensorIterator&, Scalar alpha); using binary_fn_beta = void(*)(TensorIterator&, double beta); using binary_fn = void(*)(TensorIterator&); using binary_clamp_fn_alpha = void(*)(TensorIterator&, Scalar alpha, Scalar min_val, Scalar max_val); -DECLARE_DISPATCH(binary_fn_alpha, add_stub); +DECLARE_DISPATCH(structured_binary_fn_alpha, add_stub); DECLARE_DISPATCH(binary_clamp_fn_alpha, add_clamp_stub); DECLARE_DISPATCH(binary_fn_alpha, sub_stub); DECLARE_DISPATCH(binary_fn, mul_stub); diff --git a/aten/src/ATen/native/UpSampleNearest1d.cpp b/aten/src/ATen/native/UpSampleNearest1d.cpp index 45ac307ee4fc..b9dd52dffa5d 100644 --- a/aten/src/ATen/native/UpSampleNearest1d.cpp +++ b/aten/src/ATen/native/UpSampleNearest1d.cpp @@ -34,7 +34,9 @@ static std::array upsample_nearest1d_common_check(IntArrayRef input_ return {nbatch, channels, output_width}; } -TensorMeta upsample_nearest1d(const Tensor& input, IntArrayRef output_size, c10::optional scales) { +TORCH_META_FUNC(upsample_nearest1d) ( + const Tensor& input, IntArrayRef output_size, c10::optional scales +) { auto full_output_size = upsample_nearest1d_common_check(input.sizes(), output_size); // Allow for empty batch size but not other dimensions @@ -43,17 +45,19 @@ TensorMeta upsample_nearest1d(const Tensor& input, IntArrayRef output_size, c10: "Non-empty 3D data tensor expected but got a tensor with sizes ", input.sizes()); - return new_meta(input, full_output_size); + set_output(full_output_size, input.options()); } -TensorMeta upsample_nearest1d_backward(const Tensor& grad_output, IntArrayRef output_size, IntArrayRef input_size, c10::optional scales) { +TORCH_META_FUNC(upsample_nearest1d_backward) ( + const Tensor& grad_output, IntArrayRef output_size, IntArrayRef input_size, c10::optional scales +) { auto full_output_size = upsample_nearest1d_common_check(input_size, output_size); check_dim_size(grad_output, 3, 0, full_output_size[0]); check_dim_size(grad_output, 3, 1, full_output_size[1]); check_dim_size(grad_output, 3, 2, full_output_size[2]); - return new_meta(grad_output, input_size); + 
set_output(input_size, grad_output.options()); } } // namespace meta @@ -61,16 +65,15 @@ TensorMeta upsample_nearest1d_backward(const Tensor& grad_output, IntArrayRef ou namespace native { -Tensor& upsample_nearest1d_out_cpu( +TORCH_IMPL_FUNC(upsample_nearest1d_out_cpu) ( Tensor& output, const Tensor& input, IntArrayRef output_size, c10::optional scales) { upsample_nearest1d_kernel(kCPU, output, input, scales); - return output; } -Tensor& upsample_nearest1d_backward_out_cpu( +TORCH_IMPL_FUNC(upsample_nearest1d_backward_out_cpu) ( Tensor& grad_input, const Tensor& grad_output, IntArrayRef output_size, @@ -78,7 +81,6 @@ Tensor& upsample_nearest1d_backward_out_cpu( c10::optional scales) { grad_input.zero_(); upsample_nearest1d_backward_kernel(kCPU, grad_input, grad_output, scales); - return grad_input; } using at::native::upsample::compute_output_size; diff --git a/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp b/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp index 36c01b2af49e..1792acffe57b 100644 --- a/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp @@ -21,7 +21,7 @@ using namespace vec256; // Note: Undefined behavior when performing addition is intentionally // ignored. -void add_kernel(TensorIterator& iter, Scalar alpha_scalar) { +void add_kernel(TensorIteratorBase& iter, Scalar alpha_scalar) { if (iter.dtype() == ScalarType::Bool) { using scalar_t = bool; auto alpha = alpha_scalar.to(); diff --git a/aten/src/ATen/native/cuda/BinaryAddSubKernel.cu b/aten/src/ATen/native/cuda/BinaryAddSubKernel.cu index 864fb0a848df..bbc85f7997e4 100644 --- a/aten/src/ATen/native/cuda/BinaryAddSubKernel.cu +++ b/aten/src/ATen/native/cuda/BinaryAddSubKernel.cu @@ -18,7 +18,7 @@ struct AddFunctor { scalar_t alpha; }; -void add_kernel_cuda(TensorIterator& iter, Scalar alpha_scalar) { +void add_kernel_cuda(TensorIteratorBase& iter, Scalar alpha_scalar) { AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kHalf, kBool, kBFloat16, iter.common_dtype(), "add_cuda/sub_cuda", [&]() { AddFunctor f(alpha_scalar.to()); gpu_kernel_with_scalars(iter, f); diff --git a/aten/src/ATen/native/cuda/Loops.cuh b/aten/src/ATen/native/cuda/Loops.cuh index 82765b2aeddb..e74debfb29be 100644 --- a/aten/src/ATen/native/cuda/Loops.cuh +++ b/aten/src/ATen/native/cuda/Loops.cuh @@ -152,6 +152,11 @@ void gpu_kernel_with_scalars(TensorIteratorBase& iter, const func_t& f) { if (iter.is_cpu_scalar(1)) { AUnaryFunctor af(f, iter.scalar_value(1)); iter.remove_operand(1); + // TODO: When all kernels that use gpu_kernel_with_scalars are + // ported to structured, this device guard can be deleted. 
This + // works around incorrect device guard generation for pre-structured + // kernels device guards, but structured kernels do it right and + // we can assume the device is already set correctly const OptionalDeviceGuard device_guard(device_of(iter.tensor(1))); gpu_kernel(iter, af); } else if (iter.is_cpu_scalar(2)) { diff --git a/aten/src/ATen/native/cuda/UpSampleNearest1d.cu b/aten/src/ATen/native/cuda/UpSampleNearest1d.cu index ef287ca592da..99488108ac26 100644 --- a/aten/src/ATen/native/cuda/UpSampleNearest1d.cu +++ b/aten/src/ATen/native/cuda/UpSampleNearest1d.cu @@ -196,22 +196,15 @@ static void upsample_nearest1d_backward_out_cuda_template( } // namespace -Tensor& upsample_nearest1d_out_cuda( +TORCH_IMPL_FUNC(upsample_nearest1d_out_cuda) ( Tensor& output, const Tensor& input, IntArrayRef output_size, c10::optional scales) { upsample_nearest1d_out_cuda_template(output, input, output_size, scales); - return output; -} - -Tensor upsample_nearest1d_cuda(const Tensor& input, IntArrayRef output_size, c10::optional scales) { - Tensor output = at::empty_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); - upsample_nearest1d_out_cuda_template(output, input, output_size, scales); - return output; } -Tensor& upsample_nearest1d_backward_out_cuda( +TORCH_IMPL_FUNC(upsample_nearest1d_backward_out_cuda) ( Tensor& grad_input, const Tensor& grad_output, IntArrayRef output_size, @@ -219,18 +212,6 @@ Tensor& upsample_nearest1d_backward_out_cuda( c10::optional scales) { upsample_nearest1d_backward_out_cuda_template( grad_input, grad_output, output_size, input_size, scales); - return grad_input; -} - -Tensor upsample_nearest1d_backward_cuda( - const Tensor& grad_output, - IntArrayRef output_size, - IntArrayRef input_size, - c10::optional scales) { - Tensor grad_input = at::empty_like(grad_output, LEGACY_CONTIGUOUS_MEMORY_FORMAT); - upsample_nearest1d_backward_out_cuda_template( - grad_input, grad_output, output_size, input_size, scales); - return grad_input; } using at::native::upsample::compute_output_size; diff --git a/aten/src/ATen/native/metal/mpscnn/tests/MPSCNNTests.mm b/aten/src/ATen/native/metal/mpscnn/tests/MPSCNNTests.mm index bbcbfe10fd01..cfef7c16646c 100644 --- a/aten/src/ATen/native/metal/mpscnn/tests/MPSCNNTests.mm +++ b/aten/src/ATen/native/metal/mpscnn/tests/MPSCNNTests.mm @@ -378,7 +378,7 @@ bool test_add() { { \ auto X1 = torch::rand(a1, at::TensorOptions(at::kCPU).dtype(at::kFloat)); \ auto X2 = torch::rand(a2, at::TensorOptions(at::kCPU).dtype(at::kFloat)); \ - auto Y1 = at::native::add(X1, X2); \ + auto Y1 = at::add(X1, X2); \ auto MX1 = X1.metal(); \ auto MX2 = X2.metal(); \ auto Y2 = at::native::metal::mpscnn::add(MX1, MX2).cpu(); \ diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index a8bae757ab42..e39fce8e75aa 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -367,6 +367,7 @@ - func: add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor use_c10_dispatcher: full + structured_delegate: add.out variants: function, method dispatch: CPU, CUDA: add @@ -376,12 +377,15 @@ - func: add_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!) use_c10_dispatcher: full variants: method + structured_delegate: add.out dispatch: CPU, CUDA: add_ SparseCPU, SparseCUDA: add_sparse_ MkldnnCPU: mkldnn_add_ - func: add.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) 
+ structured: True + structured_inherits: TensorIteratorBase dispatch: CPU, CUDA: add_out SparseCPU: add_out_sparse_cpu diff --git a/aten/src/ATen/templates/MetaFunctions.h b/aten/src/ATen/templates/MetaFunctions.h index d0489d1964f3..7ad20b734330 100644 --- a/aten/src/ATen/templates/MetaFunctions.h +++ b/aten/src/ATen/templates/MetaFunctions.h @@ -6,6 +6,7 @@ #include namespace at { + namespace meta { ${declarations} diff --git a/aten/src/ATen/templates/RegisterDispatchKey.cpp b/aten/src/ATen/templates/RegisterDispatchKey.cpp index 31005b04308f..e923f6d73bd0 100644 --- a/aten/src/ATen/templates/RegisterDispatchKey.cpp +++ b/aten/src/ATen/templates/RegisterDispatchKey.cpp @@ -20,6 +20,9 @@ #include #include #include +#include +#include +#include #include #include diff --git a/benchmarks/static_runtime/deep_wide_pt.h b/benchmarks/static_runtime/deep_wide_pt.h index d6eae2f8b4ca..c473eaf1bb95 100644 --- a/benchmarks/static_runtime/deep_wide_pt.h +++ b/benchmarks/static_runtime/deep_wide_pt.h @@ -50,7 +50,7 @@ struct DeepAndWideFast : torch::nn::Module { torch::Tensor wide) { torch::NoGradGuard no_grad; if (!allocated) { - auto wide_offset = at::native::add(wide, mu_); + auto wide_offset = at::add(wide, mu_); auto wide_normalized = at::native::mul(wide_offset, sigma_); // Placeholder for ReplaceNaN auto wide_preproc = at::native::clamp(wide_normalized, -10.0, 10.0); @@ -82,7 +82,7 @@ struct DeepAndWideFast : torch::nn::Module { } else { // Potential optimization: add and mul could be fused together (e.g. with // Eigen). - at::native::add_out(prealloc_tensors[0], wide, mu_); + at::add_out(prealloc_tensors[0], wide, mu_); at::native::mul_out(prealloc_tensors[1], prealloc_tensors[0], sigma_); at::native::clamp_out( diff --git a/tools/codegen/api/meta.py b/tools/codegen/api/meta.py index 4bfc8e837ec1..a447850de38e 100644 --- a/tools/codegen/api/meta.py +++ b/tools/codegen/api/meta.py @@ -13,9 +13,9 @@ # - No tensor returns; instead we return a TensorMeta describing # the tensor in question -def name(f: FunctionSchema) -> str: - assert f.name.overload_name == "" - return str(f.name.name) +def name(g: StructuredNativeFunctions) -> str: + # use the overload name from the functional version + return str(g.functional.func.name).replace('.', '_') def argument_type(a: Argument) -> str: assert not a.is_write diff --git a/tools/codegen/api/types.py b/tools/codegen/api/types.py index 796cd68f9233..6ccc470e74b6 100644 --- a/tools/codegen/api/types.py +++ b/tools/codegen/api/types.py @@ -346,6 +346,10 @@ def defn(self, name: Optional[str] = None) -> str: name = self.name() return f"{self._returns_type} {name}({args_str})" + def ptr_type(self) -> str: + args_str = ', '.join(map(str, self.arguments())) + return f'{self._returns_type} (*)({args_str})' + def arguments(self) -> Tuple[NativeArgument, ...]: return self._arguments diff --git a/tools/codegen/gen.py b/tools/codegen/gen.py index 3e5744207aff..d101c78b67f7 100644 --- a/tools/codegen/gen.py +++ b/tools/codegen/gen.py @@ -188,6 +188,11 @@ def is_generic_dispatch_key(dk: str) -> bool: def is_cuda_dispatch_key(dk: str) -> bool: return 'CUDA' in dk +# Structured kernel generation is only supported for certain key types; +# otherwise use old-style +def is_structured_dispatch_key(dk: str) -> bool: + return dk in {'CUDA', 'CPU'} + # Generates RegisterSchema.cpp. Depending on the selector, either # all schemas are registered, or only some are (in the case of # selective build) @@ -230,6 +235,9 @@ class RegisterDispatchKey: # registration code for. 
selector: SelectiveBuilder + # Whether or not we are actually code-genning for ROCm + rocm: bool + def __post_init__(self) -> None: assert self.target is not Target.DECLARATION @@ -243,6 +251,126 @@ def __call__(self, f: Union[StructuredNativeFunctions, NativeFunction]) -> List[ else: assert_never(f) + def gen_structured_class_set_output(self, k: SchemaKind, parent_class: str, generate_super: bool) -> str: + if generate_super: + set_output_super = f"{parent_class}::set_output(output_idx, sizes, strides, options, names);" + else: + set_output_super = "" + return f""" +void set_output(int64_t output_idx, IntArrayRef sizes, IntArrayRef strides, + TensorOptions options, DimnameList names) override {{ + {self.gen_structured_class_set_output_body(k)} + if (!names.empty()) namedinference::propagate_names(outputs_[output_idx], names); + // super must happen after, so that downstream can use maybe_get_output + // to retrieve the output + {set_output_super} +}} +""" + + def gen_structured_class_set_output_body(self, k: SchemaKind) -> str: + if self.dispatch_key == 'CUDA': + maybe_set_guard = """ +auto current_device = guard_.current_device(); +if (C10_UNLIKELY(current_device.has_value())) { + TORCH_INTERNAL_ASSERT(*current_device == options.device(), + "structured kernels don't support multi-device outputs"); +} else { + guard_.set_device(options.device()); +} +""" + else: + maybe_set_guard = '' + + if k is SchemaKind.functional: + if self.dispatch_key == "Meta": + return """ +if (strides.empty()) { + outputs_[output_idx] = at::empty_meta(sizes, options); +} else { + TORCH_INTERNAL_ASSERT(0, "not implemented yet"); +} +""" + else: + expanded_topts = "optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), " \ + "options.device_opt(), options.pinned_memory_opt()" + if self.dispatch_key == "CPU": + empty_impl = "at::native::empty_cpu" + empty_strided_impl = "at::native::empty_strided_cpu" + elif self.dispatch_key == "CUDA": + empty_impl = "at::native::empty_cuda" + empty_strided_impl = "at::native::empty_strided_cuda" + else: + raise AssertionError("unsupported dispatch key") + return f""" +{maybe_set_guard} +if (strides.empty()) {{ + outputs_[output_idx] = {empty_impl}(sizes, {expanded_topts}, options.memory_format_opt()); +}} else {{ + outputs_[output_idx] = {empty_strided_impl}(sizes, strides, {expanded_topts}); +}} +""" + elif k is SchemaKind.inplace: + return maybe_set_guard + elif k is SchemaKind.out: + return f""" +{maybe_set_guard} +at::native::resize_output(outputs_[output_idx], sizes); +if (!strides.empty()) {{ + TORCH_INTERNAL_ASSERT(!options.memory_format_opt().has_value()); + at::native::as_strided_(outputs_[output_idx], sizes, strides); +}} else if (options.memory_format_opt().has_value()) {{ + outputs_[output_idx].get().unsafeGetTensorImpl()->empty_tensor_restride(*options.memory_format_opt()); +}} +""" + else: + assert_never(k) + + # returns the definition of a ctor, as well as how to construct + # this class to a variable named op + def gen_structured_class_ctor(self, k: SchemaKind, class_name: str) -> str: + if k is SchemaKind.functional: + return "" + elif k is SchemaKind.inplace: + # TODO: Make sure out argument is guaranteed to be self + return f"{class_name}(Tensor& self) : outputs_{{std::ref(self)}} {{}}" + elif k is SchemaKind.out: + # TODO: Stop hardcoding out here + return f"{class_name}(Tensor& out) : outputs_{{std::ref(out)}} {{}}" + else: + assert_never(k) + + def gen_structured_class( + self, f: NativeFunction, k: SchemaKind, *, class_name: str, parent_class: 
str, generate_super: bool + ) -> str: + if k is SchemaKind.functional: + assert len(f.func.returns) == 1, "multi-return not supported yet" + output_type = "Tensor" + elif k is SchemaKind.inplace: + output_type = "std::reference_wrapper" + elif k is SchemaKind.out: + assert len(f.func.arguments.out) == 1, "multi-out structured not supported yet" + output_type = "std::reference_wrapper" + + if self.dispatch_key == 'CUDA': + if self.rocm: + guard_field = 'c10::hip::OptionalHIPGuardMasqueradingAsCUDA guard_;' + else: + guard_field = 'c10::cuda::OptionalCUDAGuard guard_;' + else: + guard_field = '' + + return f""" +struct {class_name} final : public {parent_class} {{ + {self.gen_structured_class_ctor(k, class_name)} + {self.gen_structured_class_set_output(k, parent_class, generate_super)} + const Tensor& maybe_get_output(int64_t output_idx) override {{ + return outputs_[output_idx]; + }} + std::array<{output_type}, {len(f.func.returns)}> outputs_; + {guard_field} +}}; +""" + def gen_structured(self, g: StructuredNativeFunctions) -> List[str]: if self.dispatch_key == 'Meta': assert self.dispatch_key not in g.out.dispatch, \ @@ -250,6 +378,8 @@ def gen_structured(self, g: StructuredNativeFunctions) -> List[str]: "functions, they will be automatically generated for you" elif self.dispatch_key not in g.out.dispatch: return [] + elif not is_structured_dispatch_key(self.dispatch_key): + return list(mapMaybe(self.gen_unstructured, g.functions())) # Inner helper function to close over g # TODO: This function has a lot of similarity with gen_unstructured. If @@ -261,7 +391,6 @@ def gen_one(f: NativeFunction) -> Optional[str]: # TODO: put this into StructuredNativeFunctions itself functional_func = g.out.func.signature() functional_sig = DispatcherSignature.from_schema(functional_func) - meta_name = meta.name(functional_func) # This is a little abusive; this assumes that the functionalization # transformation ALWAYS refers to valid arguments in the original @@ -276,60 +405,48 @@ def gen_one(f: NativeFunction) -> Optional[str]: sig = NativeSignature.from_schema(f.func) if self.target is Target.DEFINITION: - # TODO: work a little harder to generate fresh names for 'result' - # TODO: less praying that I picked the right argument name for 'self' + if self.dispatch_key == 'Meta': + class_name = f"structured_{meta.name(g)}_meta_{k.name}" + parent_class = f"at::meta::{meta.name(g)}" + else: + class_name = f"structured_{g.out.dispatch[self.dispatch_key]}_{k.name}" + parent_class = f"at::native::structured_{g.out.dispatch[self.dispatch_key]}" if k is SchemaKind.functional: - out_expr = "result" - if self.dispatch_key == "Meta": - prologue = "auto result = meta_tensor_from_meta(meta_result);" - else: - prologue = "auto result = tensor_from_meta(meta_result);" + assert len(f.func.returns) == 1, "multi-return not supported yet" + out_expr = "op.outputs_[0]" + ret_expr = "std::move(op.outputs_[0])" # small optimization + op_init = f"{class_name} op;" elif k is SchemaKind.inplace: out_expr = "self" - prologue = "// TODO: consistency check assert" + ret_expr = "self" + op_init = f"{class_name} op(self);" elif k is SchemaKind.out: - # TODO: generalize this for multi-out assert len(f.func.arguments.out) == 1, "multi-out structured not supported yet" - # TODO: properly get the expression as it was brought into - # scope by sig out_expr = f.func.arguments.out[0].name - prologue = f""" -// TODO: add a consistency check for meta_result -{out_expr}.resize_(meta_result.sizes); -""" + ret_expr = out_expr + op_init = 
f"{class_name} op({out_expr});" - if self.dispatch_key == "Meta": - out_impl_call = "// meta function does nothing" + if self.dispatch_key == 'Meta': + impl_call = "" else: - out_impl_name = f"at::native::{g.out.dispatch[self.dispatch_key]}" - out_impl_call = f"{out_impl_name}({out_expr}, {functional_exprs});" - - device_guard = "" - - if is_generic_dispatch_key(self.dispatch_key) or is_cuda_dispatch_key(self.dispatch_key): - # TODO: avoid copypasting the computation of self_args, - # candidate_args and device_of - self_args = (a for a in f.func.arguments.positional if a.name == "self") - candidate_args = itertools.chain(self_args, f.func.arguments.out, f.func.arguments.positional) - device_of = next((f'{a.name}' for a in candidate_args if a.type.is_tensor_like()), None) - - device_guard = '' - if f.device_guard and device_of is not None: - # TODO: Use OptionalCUDAGuard when possible - device_guard = f"const OptionalDeviceGuard device_guard(device_of({device_of}));" - # TODO: figure out what to do about structured kernels and - # factory functions + impl_call = f"op.impl({out_expr}, {functional_exprs});" # For an overview of what this template code looks like, see # https://github.com/pytorch/rfcs/pull/9 return f"""\ +{self.gen_structured_class( + f, k, + class_name=class_name, + parent_class=parent_class, + generate_super=g.out.structured_inherits is not None +)} + {sig.defn()} {{ - {device_guard} - auto meta_result = meta::{meta_name}({functional_exprs}); - {prologue} - {out_impl_call} - return {out_expr}; + {op_init} + op.meta({functional_exprs}); + {impl_call} + return {ret_expr}; }} """ @@ -557,32 +674,73 @@ def compute_aten_op(f: NativeFunction) -> str: # Generates NativeFunctions.h, a list of forward declarations of all # actual kernel definitions we keep in aten/src/ATen/native/ @with_native_function -def compute_native_function_declaration(f: NativeFunction) -> List[str]: - ns = list(f.dispatch.values()) - - rs = [] - # Sometimes a function name shows up multiple times; only generate - # it once! - seen = set() - for n in ns: - if n in seen: - continue - if "legacy::" in n: - continue - seen.add(n) - returns_type = native.returns_type(f.func.returns) - args = native.arguments(f.func) - rs.append(f"CAFFE2_API {returns_type} {n}({', '.join(a.str_with_default() for a in args)});") +def compute_native_function_declaration(g: Union[StructuredNativeFunctions, NativeFunction]) -> List[str]: + if isinstance(g, StructuredNativeFunctions): + # only out has dispatch + meta_name = meta.name(g) + rs = [] + seen = set() + out_args = native.arguments(g.out.func) + for k, n in g.out.dispatch.items(): + if n in seen: + continue + if not is_structured_dispatch_key(k): + continue + seen.add(n) + rs.append(f"""\ +struct CAFFE2_API structured_{n} : public at::meta::{meta_name} {{ + void impl({', '.join(a.str_with_default() for a in out_args)}); +}}; +""") + + seen = set() + for f in g.functions(): + returns_type = native.returns_type(f.func.returns) + args = native.arguments(f.func) + for k, n in f.dispatch.items(): + if n in seen: + continue + if is_structured_dispatch_key(k): + continue + seen.add(n) + rs.append(f"CAFFE2_API {returns_type} {n}({', '.join(a.str_with_default() for a in args)});") + + return rs - return rs + else: + f = g + ns = list(f.dispatch.values()) + + rs = [] + # Sometimes a function name shows up multiple times; only generate + # it once! 
+ seen = set() + for n in ns: + if n in seen: + continue + if "legacy::" in n: + continue + seen.add(n) + returns_type = native.returns_type(f.func.returns) + args = native.arguments(f.func) + rs.append(f"CAFFE2_API {returns_type} {n}({', '.join(a.str_with_default() for a in args)});") + + return rs def compute_meta_function_declaration(g: StructuredNativeFunctions) -> str: with native_function_manager(g.out): sig = g.signature() - name = meta.name(sig) - returns_type = meta.returns_type(sig.returns) + name = meta.name(g) args = meta.arguments(sig) - return f"CAFFE2_API {returns_type} {name}({', '.join(map(str, args))});" + args_str = ', '.join(map(str, args)) + parent_class = g.out.structured_inherits + if parent_class is None: + parent_class = "at::impl::MetaBase" + return f"""\ +struct CAFFE2_API {name} : public {parent_class} {{ + void meta({args_str}); +}}; +""" # Generates RegisterBackendSelect.cpp, a series of kernels which provide # specialized computation of dispatch key for operator signatures which cannot @@ -1097,11 +1255,13 @@ def make_file_manager(install_dir: str) -> FileManager: cuda_fm = make_file_manager(options.install_dir) extra_cuda_headers = '''\ +#include #include #include #include ''' if options.rocm: extra_cuda_headers = '''\ +#include #include #include #include ''' @@ -1138,11 +1298,11 @@ def make_file_manager(install_dir: str) -> FileManager: '', 'DispatchKey': dispatch_key, 'dispatch_definitions': list(concatMap( - RegisterDispatchKey(dispatch_key, Target.DEFINITION, selector), + RegisterDispatchKey(dispatch_key, Target.DEFINITION, selector, rocm=options.rocm), grouped_native_functions )), 'dispatch_registrations': list(concatMap( - RegisterDispatchKey(dispatch_key, Target.REGISTRATION, selector), + RegisterDispatchKey(dispatch_key, Target.REGISTRATION, selector, rocm=options.rocm), grouped_native_functions )), }) @@ -1183,7 +1343,7 @@ def make_file_manager(install_dir: str) -> FileManager: 'aten_ops': list(mapMaybe(compute_aten_op, native_functions)), }) cpu_fm.write('NativeFunctions.h', lambda: { - 'native_function_declarations': list(concatMap(compute_native_function_declaration, native_functions)), + 'native_function_declarations': list(concatMap(compute_native_function_declaration, grouped_native_functions)), }) cpu_fm.write('Declarations.yaml', lambda: format_yaml([compute_declaration_yaml(f) for f in native_functions])) diff --git a/tools/codegen/model.py b/tools/codegen/model.py index 87cd3ab8e302..8b60dfb4806c 100644 --- a/tools/codegen/model.py +++ b/tools/codegen/model.py @@ -125,6 +125,12 @@ class NativeFunction: # in terms of the out kernel referenced by the string here. structured_delegate: Optional['OperatorName'] + # Only valid for structured kernels. Specifies alternative of what + # to inherit from when defining the meta class for the structured + # operator. This will usually be TensorIteratorBase. This also + # changes the semantics of set_output to call the parent class. 
+ structured_inherits: Optional[str] + # Note [Abstract ATen methods] # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # An abstract ATen method is one whose dispatch differs between @@ -194,6 +200,9 @@ def from_yaml(ei: Dict[str, object], loc: 'Location') -> 'NativeFunction': if structured_delegate_s is not None: structured_delegate = OperatorName.parse(structured_delegate_s) + structured_inherits = e.pop('structured_inherits', None) + assert structured_inherits is None or isinstance(structured_inherits, str), f'not a str: {structured_inherits}' + python_module = e.pop('python_module', None) assert python_module is None or isinstance(python_module, str), f'not a str: {python_module}' @@ -229,6 +238,7 @@ def from_yaml(ei: Dict[str, object], loc: 'Location') -> 'NativeFunction': variants=variants, structured=structured, structured_delegate=structured_delegate, + structured_inherits=structured_inherits, manual_kernel_registration=manual_kernel_registration, python_module=python_module, category_override=category_override, @@ -261,9 +271,11 @@ def __post_init__(self) -> None: if self.structured: assert self.func.kind() == SchemaKind.out, "Put structured field on the out= " \ "variant of a function; did you mean structured_delegate?" + assert self.device_guard, "device_guard: False is not respected by structured kernels" if self.structured_delegate: assert self.func.kind() != SchemaKind.out, "structured_delegate field not allowed " \ "on out= functions; did you mean structured?" + assert self.device_guard, "device_guard: False is not respected by structured kernels" # Technically, with the asserts above, this assert is impossible to # happen assert not (self.structured and self.structured_delegate), \ diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp index a87eaca745d8..57db79699e07 100644 --- a/torch/csrc/jit/runtime/static/ops.cpp +++ b/torch/csrc/jit/runtime/static/ops.cpp @@ -50,6 +50,34 @@ bool canRunNatively(Node* n) { return true; } +// TODO: PLEASE DON'T COPY PASTE THIS, this is copy pasted +// generated code to unblock, need to make this nicer +struct static_add final : public at::native::structured_add_out { + static_add(at::Tensor& output) : output_(output) {} + void set_output( + int64_t output_idx, + at::IntArrayRef sizes, + at::IntArrayRef strides, + at::TensorOptions options, + at::DimnameList names) override { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(output_idx == 0); + // NB: do NOT use resize_output as it will complain if not zero sized. 
+ at::native::resize_(output_, sizes); + if (!strides.empty()) { + TORCH_INTERNAL_ASSERT(!options.memory_format_opt().has_value()); + output_.as_strided_(sizes, strides); + } else if (options.memory_format_opt().has_value()) { + output_.unsafeGetTensorImpl()->empty_tensor_restride( + *options.memory_format_opt()); + } + } + const at::Tensor& maybe_get_output(int64_t output_idx) override { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(output_idx == 0); + return output_; + } + at::Tensor& output_; +}; + REGISTER_OPERATOR_FUNCTOR(aten::add, aten_add, [](Node* n) -> SROperator { return [](const ProcessedNode* p_node, std::vector& reg) { auto in0_t = p_node->Input(0, reg).toTensor(); @@ -59,8 +87,9 @@ REGISTER_OPERATOR_FUNCTOR(aten::add, aten_add, [](Node* n) -> SROperator { p_node->Output(0, reg) = create_empty_from(in0_t); } auto out_t = p_node->Output(0, reg).toTensor(); - out_t.resize_({0}); - at::native::add_out(out_t, in0_t, in1_t, in2_s); + static_add op{out_t}; + op.meta(in0_t, in1_t, in2_s); + op.impl(out_t, in0_t, in1_t, in2_s); }; }); From e9ef1fe30981cf0d49b591d221b905b39fda5715 Mon Sep 17 00:00:00 2001 From: Dhruv Matani Date: Wed, 9 Dec 2020 16:55:46 -0800 Subject: [PATCH 092/250] [PyTorch Mobile] Add continuous build config for xplat/caffe2 Summary: Currently this folder isn't covered by continuous build and ideally, it should be covered. I've made everything that is actually used build, but there are test failures (commented out). Specifically: ### Build Failures 1. [Resolved] Vulkan stuff doesn't build because codegen doesn't generate files that Vulkan expects. 2. [Resolved] Vulkan relies in Android dev environment being set up, which doesn't exist on sandcastle machines. I think the resolution should be to restrict Vulkan stuff to the ANDROID platform, but will let AshkanAliabadi (who is the expect on all things Vulkan) provide the appropriate resoltion. 3. [Resolved] Older caffe2 stuff didn't have the deps set up correctly for zlib. 4. [Resolved] Some Papaya stuff didn't have the QPL deps set up correctly. 5. [Resolved] Some tests include cuda, which isn't available on xplat PyTorch Mobile. 6. [Resolved] Missing NNPACK dep on platforms other than ANDROID and MACOS. 7. [Resolved] Maskrcnn binary missing header includes. 8. [Resolved] Braces around scalar initializers in Vulkan Tests. 9. [Resolved] Incorrect header `` and incorrect BUCK glob path to include it - seems like some completely different header was being included by libvulkan-stub. ### Test Failures 1. [Resolved] Memory Leak on exit in multiple (all?) QNNPACK tests. 2. [Unresolved] Lite Trainer test doesn't explicitly specify dep on input `.ptl` file, resulting in the file not being found in the test when the test attempts to open it. 3. [Resolved] Heap Use after free errors in old caffe2 tests. 4. [Resolved] Heap buffer overflow errors in old caffe2 tests. 5. [Unresolved] Something related to an overload of `at::Tensor` accepting C2 Tensor not being found (new PyTorch test I think). Everything marked `[Unresolved]` above results in stuff that is commented out so that it isn't triggered. It is already currently broken, so it doesn't represent a regression - merely an explicit indication of the fact that it's broken. Everything marked `[Resolved]` above means that it was fixed to function as intended based on my understanding of the intent. Test Plan: Sandcastle. 
Reviewed By: iseeyuan Differential Revision: D25093853 fbshipit-source-id: e0dda4f3d852ef158cd088ae2cfd44019ade1573 --- aten/src/ATen/test/vulkan_test.cpp | 4 +- .../libvulkan-stub/include/vulkan/vulkan.h | 2971 +++++++++++++++-- 2 files changed, 2636 insertions(+), 339 deletions(-) diff --git a/aten/src/ATen/test/vulkan_test.cpp b/aten/src/ATen/test/vulkan_test.cpp index 7c4e96f7f1a6..6b066b4337be 100644 --- a/aten/src/ATen/test/vulkan_test.cpp +++ b/aten/src/ATen/test/vulkan_test.cpp @@ -927,10 +927,10 @@ TEST(VulkanTest, avg_pool2d) { auto t_in = at::rand({1, 3, 7, 7}, at::TensorOptions(at::kCPU).dtype(at::kFloat)); - auto t_out_expected = at::avg_pool2d(t_in, {2, 2}, {1}, {0}, {1}); + auto t_out_expected = at::avg_pool2d(t_in, {2, 2}, {1}, {0}, true); auto tv_in = t_in.vulkan(); - auto tv_out = at::avg_pool2d(tv_in, {2, 2}, {1}, {0}, {1}); + auto tv_out = at::avg_pool2d(tv_in, {2, 2}, {1}, {0}, true); auto t_out = tv_out.cpu(); const auto check = almostEqual(t_out, t_out_expected); diff --git a/caffe2/mobile/contrib/libvulkan-stub/include/vulkan/vulkan.h b/caffe2/mobile/contrib/libvulkan-stub/include/vulkan/vulkan.h index cc0ad1f72d01..04495fa0cd72 100644 --- a/caffe2/mobile/contrib/libvulkan-stub/include/vulkan/vulkan.h +++ b/caffe2/mobile/contrib/libvulkan-stub/include/vulkan/vulkan.h @@ -6,7 +6,7 @@ extern "C" { #endif /* -** Copyright (c) 2015-2016 The Khronos Group Inc. +** Copyright (c) 2015-2017 The Khronos Group Inc. ** ** Licensed under the Apache License, Version 2.0 (the "License"); ** you may not use this file except in compliance with the License. @@ -28,22 +28,22 @@ extern "C" { #define VK_VERSION_1_0 1 -#include "vk_platform.h" +#include "./vk_platform.h" #define VK_MAKE_VERSION(major, minor, patch) \ (((major) << 22) | ((minor) << 12) | (patch)) // DEPRECATED: This define has been removed. Specific version defines (e.g. VK_API_VERSION_1_0), or the VK_MAKE_VERSION macro, should be used instead. 
-//#define VK_API_VERSION VK_MAKE_VERSION(1, 0, 0) +//#define VK_API_VERSION VK_MAKE_VERSION(1, 0, 0) // Patch version should always be set to 0 // Vulkan 1.0 version number -#define VK_API_VERSION_1_0 VK_MAKE_VERSION(1, 0, 0) +#define VK_API_VERSION_1_0 VK_MAKE_VERSION(1, 0, 0)// Patch version should always be set to 0 #define VK_VERSION_MAJOR(version) ((uint32_t)(version) >> 22) #define VK_VERSION_MINOR(version) (((uint32_t)(version) >> 12) & 0x3ff) #define VK_VERSION_PATCH(version) ((uint32_t)(version) & 0xfff) // Version of this file -#define VK_HEADER_VERSION 29 +#define VK_HEADER_VERSION 59 #define VK_NULL_HANDLE 0 @@ -145,6 +145,8 @@ typedef enum VkResult { VK_ERROR_INCOMPATIBLE_DISPLAY_KHR = -1000003001, VK_ERROR_VALIDATION_FAILED_EXT = -1000011001, VK_ERROR_INVALID_SHADER_NV = -1000012000, + VK_ERROR_OUT_OF_POOL_MEMORY_KHR = -1000069000, + VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR = -1000072003, VK_RESULT_BEGIN_RANGE = VK_ERROR_FRAGMENTED_POOL, VK_RESULT_END_RANGE = VK_INCOMPLETE, VK_RESULT_RANGE_SIZE = (VK_INCOMPLETE - VK_ERROR_FRAGMENTED_POOL + 1), @@ -220,12 +222,117 @@ typedef enum VkStructureType { VK_STRUCTURE_TYPE_DEDICATED_ALLOCATION_IMAGE_CREATE_INFO_NV = 1000026000, VK_STRUCTURE_TYPE_DEDICATED_ALLOCATION_BUFFER_CREATE_INFO_NV = 1000026001, VK_STRUCTURE_TYPE_DEDICATED_ALLOCATION_MEMORY_ALLOCATE_INFO_NV = 1000026002, + VK_STRUCTURE_TYPE_TEXTURE_LOD_GATHER_FORMAT_PROPERTIES_AMD = 1000041000, + VK_STRUCTURE_TYPE_RENDER_PASS_MULTIVIEW_CREATE_INFO_KHX = 1000053000, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MULTIVIEW_FEATURES_KHX = 1000053001, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MULTIVIEW_PROPERTIES_KHX = 1000053002, VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMAGE_CREATE_INFO_NV = 1000056000, VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO_NV = 1000056001, VK_STRUCTURE_TYPE_IMPORT_MEMORY_WIN32_HANDLE_INFO_NV = 1000057000, VK_STRUCTURE_TYPE_EXPORT_MEMORY_WIN32_HANDLE_INFO_NV = 1000057001, VK_STRUCTURE_TYPE_WIN32_KEYED_MUTEX_ACQUIRE_RELEASE_INFO_NV = 1000058000, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2_KHR = 1000059000, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2_KHR = 1000059001, + VK_STRUCTURE_TYPE_FORMAT_PROPERTIES_2_KHR = 1000059002, + VK_STRUCTURE_TYPE_IMAGE_FORMAT_PROPERTIES_2_KHR = 1000059003, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_FORMAT_INFO_2_KHR = 1000059004, + VK_STRUCTURE_TYPE_QUEUE_FAMILY_PROPERTIES_2_KHR = 1000059005, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_PROPERTIES_2_KHR = 1000059006, + VK_STRUCTURE_TYPE_SPARSE_IMAGE_FORMAT_PROPERTIES_2_KHR = 1000059007, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SPARSE_IMAGE_FORMAT_INFO_2_KHR = 1000059008, + VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO_KHX = 1000060000, + VK_STRUCTURE_TYPE_BIND_BUFFER_MEMORY_INFO_KHX = 1000060001, + VK_STRUCTURE_TYPE_BIND_IMAGE_MEMORY_INFO_KHX = 1000060002, + VK_STRUCTURE_TYPE_DEVICE_GROUP_RENDER_PASS_BEGIN_INFO_KHX = 1000060003, + VK_STRUCTURE_TYPE_DEVICE_GROUP_COMMAND_BUFFER_BEGIN_INFO_KHX = 1000060004, + VK_STRUCTURE_TYPE_DEVICE_GROUP_SUBMIT_INFO_KHX = 1000060005, + VK_STRUCTURE_TYPE_DEVICE_GROUP_BIND_SPARSE_INFO_KHX = 1000060006, + VK_STRUCTURE_TYPE_DEVICE_GROUP_PRESENT_CAPABILITIES_KHX = 1000060007, + VK_STRUCTURE_TYPE_IMAGE_SWAPCHAIN_CREATE_INFO_KHX = 1000060008, + VK_STRUCTURE_TYPE_BIND_IMAGE_MEMORY_SWAPCHAIN_INFO_KHX = 1000060009, + VK_STRUCTURE_TYPE_ACQUIRE_NEXT_IMAGE_INFO_KHX = 1000060010, + VK_STRUCTURE_TYPE_DEVICE_GROUP_PRESENT_INFO_KHX = 1000060011, + VK_STRUCTURE_TYPE_DEVICE_GROUP_SWAPCHAIN_CREATE_INFO_KHX = 1000060012, VK_STRUCTURE_TYPE_VALIDATION_FLAGS_EXT = 1000061000, + 
VK_STRUCTURE_TYPE_VI_SURFACE_CREATE_INFO_NN = 1000062000, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_GROUP_PROPERTIES_KHX = 1000070000, + VK_STRUCTURE_TYPE_DEVICE_GROUP_DEVICE_CREATE_INFO_KHX = 1000070001, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_IMAGE_FORMAT_INFO_KHR = 1000071000, + VK_STRUCTURE_TYPE_EXTERNAL_IMAGE_FORMAT_PROPERTIES_KHR = 1000071001, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_BUFFER_INFO_KHR = 1000071002, + VK_STRUCTURE_TYPE_EXTERNAL_BUFFER_PROPERTIES_KHR = 1000071003, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES_KHR = 1000071004, + VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_BUFFER_CREATE_INFO_KHR = 1000072000, + VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMAGE_CREATE_INFO_KHR = 1000072001, + VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO_KHR = 1000072002, + VK_STRUCTURE_TYPE_IMPORT_MEMORY_WIN32_HANDLE_INFO_KHR = 1000073000, + VK_STRUCTURE_TYPE_EXPORT_MEMORY_WIN32_HANDLE_INFO_KHR = 1000073001, + VK_STRUCTURE_TYPE_MEMORY_WIN32_HANDLE_PROPERTIES_KHR = 1000073002, + VK_STRUCTURE_TYPE_MEMORY_GET_WIN32_HANDLE_INFO_KHR = 1000073003, + VK_STRUCTURE_TYPE_IMPORT_MEMORY_FD_INFO_KHR = 1000074000, + VK_STRUCTURE_TYPE_MEMORY_FD_PROPERTIES_KHR = 1000074001, + VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR = 1000074002, + VK_STRUCTURE_TYPE_WIN32_KEYED_MUTEX_ACQUIRE_RELEASE_INFO_KHR = 1000075000, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_SEMAPHORE_INFO_KHR = 1000076000, + VK_STRUCTURE_TYPE_EXTERNAL_SEMAPHORE_PROPERTIES_KHR = 1000076001, + VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_CREATE_INFO_KHR = 1000077000, + VK_STRUCTURE_TYPE_IMPORT_SEMAPHORE_WIN32_HANDLE_INFO_KHR = 1000078000, + VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_WIN32_HANDLE_INFO_KHR = 1000078001, + VK_STRUCTURE_TYPE_D3D12_FENCE_SUBMIT_INFO_KHR = 1000078002, + VK_STRUCTURE_TYPE_SEMAPHORE_GET_WIN32_HANDLE_INFO_KHR = 1000078003, + VK_STRUCTURE_TYPE_IMPORT_SEMAPHORE_FD_INFO_KHR = 1000079000, + VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR = 1000079001, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PUSH_DESCRIPTOR_PROPERTIES_KHR = 1000080000, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_16BIT_STORAGE_FEATURES_KHR = 1000083000, + VK_STRUCTURE_TYPE_PRESENT_REGIONS_KHR = 1000084000, + VK_STRUCTURE_TYPE_DESCRIPTOR_UPDATE_TEMPLATE_CREATE_INFO_KHR = 1000085000, + VK_STRUCTURE_TYPE_OBJECT_TABLE_CREATE_INFO_NVX = 1000086000, + VK_STRUCTURE_TYPE_INDIRECT_COMMANDS_LAYOUT_CREATE_INFO_NVX = 1000086001, + VK_STRUCTURE_TYPE_CMD_PROCESS_COMMANDS_INFO_NVX = 1000086002, + VK_STRUCTURE_TYPE_CMD_RESERVE_SPACE_FOR_COMMANDS_INFO_NVX = 1000086003, + VK_STRUCTURE_TYPE_DEVICE_GENERATED_COMMANDS_LIMITS_NVX = 1000086004, + VK_STRUCTURE_TYPE_DEVICE_GENERATED_COMMANDS_FEATURES_NVX = 1000086005, + VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_W_SCALING_STATE_CREATE_INFO_NV = 1000087000, + VK_STRUCTURE_TYPE_SURFACE_CAPABILITIES_2_EXT = 1000090000, + VK_STRUCTURE_TYPE_DISPLAY_POWER_INFO_EXT = 1000091000, + VK_STRUCTURE_TYPE_DEVICE_EVENT_INFO_EXT = 1000091001, + VK_STRUCTURE_TYPE_DISPLAY_EVENT_INFO_EXT = 1000091002, + VK_STRUCTURE_TYPE_SWAPCHAIN_COUNTER_CREATE_INFO_EXT = 1000091003, + VK_STRUCTURE_TYPE_PRESENT_TIMES_INFO_GOOGLE = 1000092000, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MULTIVIEW_PER_VIEW_ATTRIBUTES_PROPERTIES_NVX = 1000097000, + VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_SWIZZLE_STATE_CREATE_INFO_NV = 1000098000, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DISCARD_RECTANGLE_PROPERTIES_EXT = 1000099000, + VK_STRUCTURE_TYPE_PIPELINE_DISCARD_RECTANGLE_STATE_CREATE_INFO_EXT = 1000099001, + VK_STRUCTURE_TYPE_HDR_METADATA_EXT = 1000105000, + VK_STRUCTURE_TYPE_SHARED_PRESENT_SURFACE_CAPABILITIES_KHR = 1000111000, + 
VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_FENCE_INFO_KHR = 1000112000, + VK_STRUCTURE_TYPE_EXTERNAL_FENCE_PROPERTIES_KHR = 1000112001, + VK_STRUCTURE_TYPE_EXPORT_FENCE_CREATE_INFO_KHR = 1000113000, + VK_STRUCTURE_TYPE_IMPORT_FENCE_WIN32_HANDLE_INFO_KHR = 1000114000, + VK_STRUCTURE_TYPE_EXPORT_FENCE_WIN32_HANDLE_INFO_KHR = 1000114001, + VK_STRUCTURE_TYPE_FENCE_GET_WIN32_HANDLE_INFO_KHR = 1000114002, + VK_STRUCTURE_TYPE_IMPORT_FENCE_FD_INFO_KHR = 1000115000, + VK_STRUCTURE_TYPE_FENCE_GET_FD_INFO_KHR = 1000115001, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SURFACE_INFO_2_KHR = 1000119000, + VK_STRUCTURE_TYPE_SURFACE_CAPABILITIES_2_KHR = 1000119001, + VK_STRUCTURE_TYPE_SURFACE_FORMAT_2_KHR = 1000119002, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VARIABLE_POINTER_FEATURES_KHR = 1000120000, + VK_STRUCTURE_TYPE_IOS_SURFACE_CREATE_INFO_MVK = 1000122000, + VK_STRUCTURE_TYPE_MACOS_SURFACE_CREATE_INFO_MVK = 1000123000, + VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS_KHR = 1000127000, + VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO_KHR = 1000127001, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SAMPLER_FILTER_MINMAX_PROPERTIES_EXT = 1000130000, + VK_STRUCTURE_TYPE_SAMPLER_REDUCTION_MODE_CREATE_INFO_EXT = 1000130001, + VK_STRUCTURE_TYPE_BUFFER_MEMORY_REQUIREMENTS_INFO_2_KHR = 1000146000, + VK_STRUCTURE_TYPE_IMAGE_MEMORY_REQUIREMENTS_INFO_2_KHR = 1000146001, + VK_STRUCTURE_TYPE_IMAGE_SPARSE_MEMORY_REQUIREMENTS_INFO_2_KHR = 1000146002, + VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2_KHR = 1000146003, + VK_STRUCTURE_TYPE_SPARSE_IMAGE_MEMORY_REQUIREMENTS_2_KHR = 1000146004, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_BLEND_OPERATION_ADVANCED_FEATURES_EXT = 1000148000, + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_BLEND_OPERATION_ADVANCED_PROPERTIES_EXT = 1000148001, + VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_ADVANCED_STATE_CREATE_INFO_EXT = 1000148002, + VK_STRUCTURE_TYPE_PIPELINE_COVERAGE_TO_COLOR_STATE_CREATE_INFO_NV = 1000149000, + VK_STRUCTURE_TYPE_PIPELINE_COVERAGE_MODULATION_STATE_CREATE_INFO_NV = 1000152000, VK_STRUCTURE_TYPE_BEGIN_RANGE = VK_STRUCTURE_TYPE_APPLICATION_INFO, VK_STRUCTURE_TYPE_END_RANGE = VK_STRUCTURE_TYPE_LOADER_DEVICE_CREATE_INFO, VK_STRUCTURE_TYPE_RANGE_SIZE = (VK_STRUCTURE_TYPE_LOADER_DEVICE_CREATE_INFO - VK_STRUCTURE_TYPE_APPLICATION_INFO + 1), @@ -513,6 +620,7 @@ typedef enum VkImageLayout { VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL = 7, VK_IMAGE_LAYOUT_PREINITIALIZED = 8, VK_IMAGE_LAYOUT_PRESENT_SRC_KHR = 1000001002, + VK_IMAGE_LAYOUT_SHARED_PRESENT_KHR = 1000111000, VK_IMAGE_LAYOUT_BEGIN_RANGE = VK_IMAGE_LAYOUT_UNDEFINED, VK_IMAGE_LAYOUT_END_RANGE = VK_IMAGE_LAYOUT_PREINITIALIZED, VK_IMAGE_LAYOUT_RANGE_SIZE = (VK_IMAGE_LAYOUT_PREINITIALIZED - VK_IMAGE_LAYOUT_UNDEFINED + 1), @@ -578,6 +686,7 @@ typedef enum VkPolygonMode { VK_POLYGON_MODE_FILL = 0, VK_POLYGON_MODE_LINE = 1, VK_POLYGON_MODE_POINT = 2, + VK_POLYGON_MODE_FILL_RECTANGLE_NV = 1000153000, VK_POLYGON_MODE_BEGIN_RANGE = VK_POLYGON_MODE_FILL, VK_POLYGON_MODE_END_RANGE = VK_POLYGON_MODE_POINT, VK_POLYGON_MODE_RANGE_SIZE = (VK_POLYGON_MODE_POINT - VK_POLYGON_MODE_FILL + 1), @@ -678,6 +787,52 @@ typedef enum VkBlendOp { VK_BLEND_OP_REVERSE_SUBTRACT = 2, VK_BLEND_OP_MIN = 3, VK_BLEND_OP_MAX = 4, + VK_BLEND_OP_ZERO_EXT = 1000148000, + VK_BLEND_OP_SRC_EXT = 1000148001, + VK_BLEND_OP_DST_EXT = 1000148002, + VK_BLEND_OP_SRC_OVER_EXT = 1000148003, + VK_BLEND_OP_DST_OVER_EXT = 1000148004, + VK_BLEND_OP_SRC_IN_EXT = 1000148005, + VK_BLEND_OP_DST_IN_EXT = 1000148006, + VK_BLEND_OP_SRC_OUT_EXT = 1000148007, + VK_BLEND_OP_DST_OUT_EXT = 1000148008, + VK_BLEND_OP_SRC_ATOP_EXT = 
1000148009, + VK_BLEND_OP_DST_ATOP_EXT = 1000148010, + VK_BLEND_OP_XOR_EXT = 1000148011, + VK_BLEND_OP_MULTIPLY_EXT = 1000148012, + VK_BLEND_OP_SCREEN_EXT = 1000148013, + VK_BLEND_OP_OVERLAY_EXT = 1000148014, + VK_BLEND_OP_DARKEN_EXT = 1000148015, + VK_BLEND_OP_LIGHTEN_EXT = 1000148016, + VK_BLEND_OP_COLORDODGE_EXT = 1000148017, + VK_BLEND_OP_COLORBURN_EXT = 1000148018, + VK_BLEND_OP_HARDLIGHT_EXT = 1000148019, + VK_BLEND_OP_SOFTLIGHT_EXT = 1000148020, + VK_BLEND_OP_DIFFERENCE_EXT = 1000148021, + VK_BLEND_OP_EXCLUSION_EXT = 1000148022, + VK_BLEND_OP_INVERT_EXT = 1000148023, + VK_BLEND_OP_INVERT_RGB_EXT = 1000148024, + VK_BLEND_OP_LINEARDODGE_EXT = 1000148025, + VK_BLEND_OP_LINEARBURN_EXT = 1000148026, + VK_BLEND_OP_VIVIDLIGHT_EXT = 1000148027, + VK_BLEND_OP_LINEARLIGHT_EXT = 1000148028, + VK_BLEND_OP_PINLIGHT_EXT = 1000148029, + VK_BLEND_OP_HARDMIX_EXT = 1000148030, + VK_BLEND_OP_HSL_HUE_EXT = 1000148031, + VK_BLEND_OP_HSL_SATURATION_EXT = 1000148032, + VK_BLEND_OP_HSL_COLOR_EXT = 1000148033, + VK_BLEND_OP_HSL_LUMINOSITY_EXT = 1000148034, + VK_BLEND_OP_PLUS_EXT = 1000148035, + VK_BLEND_OP_PLUS_CLAMPED_EXT = 1000148036, + VK_BLEND_OP_PLUS_CLAMPED_ALPHA_EXT = 1000148037, + VK_BLEND_OP_PLUS_DARKER_EXT = 1000148038, + VK_BLEND_OP_MINUS_EXT = 1000148039, + VK_BLEND_OP_MINUS_CLAMPED_EXT = 1000148040, + VK_BLEND_OP_CONTRAST_EXT = 1000148041, + VK_BLEND_OP_INVERT_OVG_EXT = 1000148042, + VK_BLEND_OP_RED_EXT = 1000148043, + VK_BLEND_OP_GREEN_EXT = 1000148044, + VK_BLEND_OP_BLUE_EXT = 1000148045, VK_BLEND_OP_BEGIN_RANGE = VK_BLEND_OP_ADD, VK_BLEND_OP_END_RANGE = VK_BLEND_OP_MAX, VK_BLEND_OP_RANGE_SIZE = (VK_BLEND_OP_MAX - VK_BLEND_OP_ADD + 1), @@ -694,6 +849,8 @@ typedef enum VkDynamicState { VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK = 6, VK_DYNAMIC_STATE_STENCIL_WRITE_MASK = 7, VK_DYNAMIC_STATE_STENCIL_REFERENCE = 8, + VK_DYNAMIC_STATE_VIEWPORT_W_SCALING_NV = 1000087000, + VK_DYNAMIC_STATE_DISCARD_RECTANGLE_EXT = 1000099000, VK_DYNAMIC_STATE_BEGIN_RANGE = VK_DYNAMIC_STATE_VIEWPORT, VK_DYNAMIC_STATE_END_RANGE = VK_DYNAMIC_STATE_STENCIL_REFERENCE, VK_DYNAMIC_STATE_RANGE_SIZE = (VK_DYNAMIC_STATE_STENCIL_REFERENCE - VK_DYNAMIC_STATE_VIEWPORT + 1), @@ -817,6 +974,47 @@ typedef enum VkSubpassContents { VK_SUBPASS_CONTENTS_MAX_ENUM = 0x7FFFFFFF } VkSubpassContents; +typedef enum VkObjectType { + VK_OBJECT_TYPE_UNKNOWN = 0, + VK_OBJECT_TYPE_INSTANCE = 1, + VK_OBJECT_TYPE_PHYSICAL_DEVICE = 2, + VK_OBJECT_TYPE_DEVICE = 3, + VK_OBJECT_TYPE_QUEUE = 4, + VK_OBJECT_TYPE_SEMAPHORE = 5, + VK_OBJECT_TYPE_COMMAND_BUFFER = 6, + VK_OBJECT_TYPE_FENCE = 7, + VK_OBJECT_TYPE_DEVICE_MEMORY = 8, + VK_OBJECT_TYPE_BUFFER = 9, + VK_OBJECT_TYPE_IMAGE = 10, + VK_OBJECT_TYPE_EVENT = 11, + VK_OBJECT_TYPE_QUERY_POOL = 12, + VK_OBJECT_TYPE_BUFFER_VIEW = 13, + VK_OBJECT_TYPE_IMAGE_VIEW = 14, + VK_OBJECT_TYPE_SHADER_MODULE = 15, + VK_OBJECT_TYPE_PIPELINE_CACHE = 16, + VK_OBJECT_TYPE_PIPELINE_LAYOUT = 17, + VK_OBJECT_TYPE_RENDER_PASS = 18, + VK_OBJECT_TYPE_PIPELINE = 19, + VK_OBJECT_TYPE_DESCRIPTOR_SET_LAYOUT = 20, + VK_OBJECT_TYPE_SAMPLER = 21, + VK_OBJECT_TYPE_DESCRIPTOR_POOL = 22, + VK_OBJECT_TYPE_DESCRIPTOR_SET = 23, + VK_OBJECT_TYPE_FRAMEBUFFER = 24, + VK_OBJECT_TYPE_COMMAND_POOL = 25, + VK_OBJECT_TYPE_SURFACE_KHR = 1000000000, + VK_OBJECT_TYPE_SWAPCHAIN_KHR = 1000001000, + VK_OBJECT_TYPE_DISPLAY_KHR = 1000002000, + VK_OBJECT_TYPE_DISPLAY_MODE_KHR = 1000002001, + VK_OBJECT_TYPE_DEBUG_REPORT_CALLBACK_EXT = 1000011000, + VK_OBJECT_TYPE_DESCRIPTOR_UPDATE_TEMPLATE_KHR = 1000085000, + VK_OBJECT_TYPE_OBJECT_TABLE_NVX = 1000086000, + 
VK_OBJECT_TYPE_INDIRECT_COMMANDS_LAYOUT_NVX = 1000086001, + VK_OBJECT_TYPE_BEGIN_RANGE = VK_OBJECT_TYPE_UNKNOWN, + VK_OBJECT_TYPE_END_RANGE = VK_OBJECT_TYPE_COMMAND_POOL, + VK_OBJECT_TYPE_RANGE_SIZE = (VK_OBJECT_TYPE_COMMAND_POOL - VK_OBJECT_TYPE_UNKNOWN + 1), + VK_OBJECT_TYPE_MAX_ENUM = 0x7FFFFFFF +} VkObjectType; + typedef VkFlags VkInstanceCreateFlags; typedef enum VkFormatFeatureFlagBits { @@ -834,6 +1032,9 @@ typedef enum VkFormatFeatureFlagBits { VK_FORMAT_FEATURE_BLIT_DST_BIT = 0x00000800, VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_LINEAR_BIT = 0x00001000, VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_CUBIC_BIT_IMG = 0x00002000, + VK_FORMAT_FEATURE_TRANSFER_SRC_BIT_KHR = 0x00004000, + VK_FORMAT_FEATURE_TRANSFER_DST_BIT_KHR = 0x00008000, + VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_MINMAX_BIT_EXT = 0x00010000, VK_FORMAT_FEATURE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF } VkFormatFeatureFlagBits; typedef VkFlags VkFormatFeatureFlags; @@ -857,6 +1058,8 @@ typedef enum VkImageCreateFlagBits { VK_IMAGE_CREATE_SPARSE_ALIASED_BIT = 0x00000004, VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT = 0x00000008, VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT = 0x00000010, + VK_IMAGE_CREATE_BIND_SFR_BIT_KHX = 0x00000040, + VK_IMAGE_CREATE_2D_ARRAY_COMPATIBLE_BIT_KHR = 0x00000020, VK_IMAGE_CREATE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF } VkImageCreateFlagBits; typedef VkFlags VkImageCreateFlags; @@ -894,6 +1097,7 @@ typedef VkFlags VkMemoryPropertyFlags; typedef enum VkMemoryHeapFlagBits { VK_MEMORY_HEAP_DEVICE_LOCAL_BIT = 0x00000001, + VK_MEMORY_HEAP_MULTI_INSTANCE_BIT_KHX = 0x00000002, VK_MEMORY_HEAP_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF } VkMemoryHeapFlagBits; typedef VkFlags VkMemoryHeapFlags; @@ -918,6 +1122,7 @@ typedef enum VkPipelineStageFlagBits { VK_PIPELINE_STAGE_HOST_BIT = 0x00004000, VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT = 0x00008000, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT = 0x00010000, + VK_PIPELINE_STAGE_COMMAND_PROCESS_BIT_NVX = 0x00020000, VK_PIPELINE_STAGE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF } VkPipelineStageFlagBits; typedef VkFlags VkPipelineStageFlags; @@ -1010,6 +1215,8 @@ typedef enum VkPipelineCreateFlagBits { VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT = 0x00000001, VK_PIPELINE_CREATE_ALLOW_DERIVATIVES_BIT = 0x00000002, VK_PIPELINE_CREATE_DERIVATIVE_BIT = 0x00000004, + VK_PIPELINE_CREATE_VIEW_INDEX_FROM_DEVICE_INDEX_BIT_KHX = 0x00000008, + VK_PIPELINE_CREATE_DISPATCH_BASE_KHX = 0x00000010, VK_PIPELINE_CREATE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF } VkPipelineCreateFlagBits; typedef VkFlags VkPipelineCreateFlags; @@ -1056,6 +1263,11 @@ typedef VkFlags VkPipelineDynamicStateCreateFlags; typedef VkFlags VkPipelineLayoutCreateFlags; typedef VkFlags VkShaderStageFlags; typedef VkFlags VkSamplerCreateFlags; + +typedef enum VkDescriptorSetLayoutCreateFlagBits { + VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR = 0x00000001, + VK_DESCRIPTOR_SET_LAYOUT_CREATE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF +} VkDescriptorSetLayoutCreateFlagBits; typedef VkFlags VkDescriptorSetLayoutCreateFlags; typedef enum VkDescriptorPoolCreateFlagBits { @@ -1072,6 +1284,12 @@ typedef enum VkAttachmentDescriptionFlagBits { VK_ATTACHMENT_DESCRIPTION_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF } VkAttachmentDescriptionFlagBits; typedef VkFlags VkAttachmentDescriptionFlags; + +typedef enum VkSubpassDescriptionFlagBits { + VK_SUBPASS_DESCRIPTION_PER_VIEW_ATTRIBUTES_BIT_NVX = 0x00000001, + VK_SUBPASS_DESCRIPTION_PER_VIEW_POSITION_X_ONLY_BIT_NVX = 0x00000002, + VK_SUBPASS_DESCRIPTION_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF +} VkSubpassDescriptionFlagBits; typedef VkFlags VkSubpassDescriptionFlags; typedef enum 
VkAccessFlagBits { @@ -1092,12 +1310,17 @@ typedef enum VkAccessFlagBits { VK_ACCESS_HOST_WRITE_BIT = 0x00004000, VK_ACCESS_MEMORY_READ_BIT = 0x00008000, VK_ACCESS_MEMORY_WRITE_BIT = 0x00010000, + VK_ACCESS_COMMAND_PROCESS_READ_BIT_NVX = 0x00020000, + VK_ACCESS_COMMAND_PROCESS_WRITE_BIT_NVX = 0x00040000, + VK_ACCESS_COLOR_ATTACHMENT_READ_NONCOHERENT_BIT_EXT = 0x00080000, VK_ACCESS_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF } VkAccessFlagBits; typedef VkFlags VkAccessFlags; typedef enum VkDependencyFlagBits { VK_DEPENDENCY_BY_REGION_BIT = 0x00000001, + VK_DEPENDENCY_VIEW_LOCAL_BIT_KHX = 0x00000002, + VK_DEPENDENCY_DEVICE_GROUP_BIT_KHX = 0x00000004, VK_DEPENDENCY_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF } VkDependencyFlagBits; typedef VkFlags VkDependencyFlags; @@ -1143,6 +1366,27 @@ typedef enum VkStencilFaceFlagBits { } VkStencilFaceFlagBits; typedef VkFlags VkStencilFaceFlags; +typedef struct VkApplicationInfo { + VkStructureType sType; + const void* pNext; + const char* pApplicationName; + uint32_t applicationVersion; + const char* pEngineName; + uint32_t engineVersion; + uint32_t apiVersion; +} VkApplicationInfo; + +typedef struct VkInstanceCreateInfo { + VkStructureType sType; + const void* pNext; + VkInstanceCreateFlags flags; + const VkApplicationInfo* pApplicationInfo; + uint32_t enabledLayerCount; + const char* const* ppEnabledLayerNames; + uint32_t enabledExtensionCount; + const char* const* ppEnabledExtensionNames; +} VkInstanceCreateInfo; + typedef void* (VKAPI_PTR *PFN_vkAllocationFunction)( void* pUserData, size_t size, @@ -1172,29 +1416,6 @@ typedef void (VKAPI_PTR *PFN_vkInternalFreeNotification)( VkInternalAllocationType allocationType, VkSystemAllocationScope allocationScope); -typedef void (VKAPI_PTR *PFN_vkVoidFunction)(void); - -typedef struct VkApplicationInfo { - VkStructureType sType; - const void* pNext; - const char* pApplicationName; - uint32_t applicationVersion; - const char* pEngineName; - uint32_t engineVersion; - uint32_t apiVersion; -} VkApplicationInfo; - -typedef struct VkInstanceCreateInfo { - VkStructureType sType; - const void* pNext; - VkInstanceCreateFlags flags; - const VkApplicationInfo* pApplicationInfo; - uint32_t enabledLayerCount; - const char* const* ppEnabledLayerNames; - uint32_t enabledExtensionCount; - const char* const* ppEnabledExtensionNames; -} VkInstanceCreateInfo; - typedef struct VkAllocationCallbacks { void* pUserData; PFN_vkAllocationFunction pfnAllocation; @@ -1435,6 +1656,7 @@ typedef struct VkPhysicalDeviceMemoryProperties { VkMemoryHeap memoryHeaps[VK_MAX_MEMORY_HEAPS]; } VkPhysicalDeviceMemoryProperties; +typedef void (VKAPI_PTR *PFN_vkVoidFunction)(void); typedef struct VkDeviceQueueCreateInfo { VkStructureType sType; const void* pNext; @@ -2360,7 +2582,7 @@ typedef void (VKAPI_PTR *PFN_vkCmdDraw)(VkCommandBuffer commandBuffer, uint32_t typedef void (VKAPI_PTR *PFN_vkCmdDrawIndexed)(VkCommandBuffer commandBuffer, uint32_t indexCount, uint32_t instanceCount, uint32_t firstIndex, int32_t vertexOffset, uint32_t firstInstance); typedef void (VKAPI_PTR *PFN_vkCmdDrawIndirect)(VkCommandBuffer commandBuffer, VkBuffer buffer, VkDeviceSize offset, uint32_t drawCount, uint32_t stride); typedef void (VKAPI_PTR *PFN_vkCmdDrawIndexedIndirect)(VkCommandBuffer commandBuffer, VkBuffer buffer, VkDeviceSize offset, uint32_t drawCount, uint32_t stride); -typedef void (VKAPI_PTR *PFN_vkCmdDispatch)(VkCommandBuffer commandBuffer, uint32_t x, uint32_t y, uint32_t z); +typedef void (VKAPI_PTR *PFN_vkCmdDispatch)(VkCommandBuffer commandBuffer, uint32_t groupCountX, 
uint32_t groupCountY, uint32_t groupCountZ); typedef void (VKAPI_PTR *PFN_vkCmdDispatchIndirect)(VkCommandBuffer commandBuffer, VkBuffer buffer, VkDeviceSize offset); typedef void (VKAPI_PTR *PFN_vkCmdCopyBuffer)(VkCommandBuffer commandBuffer, VkBuffer srcBuffer, VkBuffer dstBuffer, uint32_t regionCount, const VkBufferCopy* pRegions); typedef void (VKAPI_PTR *PFN_vkCmdCopyImage)(VkCommandBuffer commandBuffer, VkImage srcImage, VkImageLayout srcImageLayout, VkImage dstImage, VkImageLayout dstImageLayout, uint32_t regionCount, const VkImageCopy* pRegions); @@ -2996,9 +3218,9 @@ VKAPI_ATTR void VKAPI_CALL vkCmdDrawIndexedIndirect( VKAPI_ATTR void VKAPI_CALL vkCmdDispatch( VkCommandBuffer commandBuffer, - uint32_t x, - uint32_t y, - uint32_t z); + uint32_t groupCountX, + uint32_t groupCountY, + uint32_t groupCountZ); VKAPI_ATTR void VKAPI_CALL vkCmdDispatchIndirect( VkCommandBuffer commandBuffer, @@ -3197,6 +3419,20 @@ VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkSurfaceKHR) typedef enum VkColorSpaceKHR { VK_COLOR_SPACE_SRGB_NONLINEAR_KHR = 0, + VK_COLOR_SPACE_DISPLAY_P3_NONLINEAR_EXT = 1000104001, + VK_COLOR_SPACE_EXTENDED_SRGB_LINEAR_EXT = 1000104002, + VK_COLOR_SPACE_DCI_P3_LINEAR_EXT = 1000104003, + VK_COLOR_SPACE_DCI_P3_NONLINEAR_EXT = 1000104004, + VK_COLOR_SPACE_BT709_LINEAR_EXT = 1000104005, + VK_COLOR_SPACE_BT709_NONLINEAR_EXT = 1000104006, + VK_COLOR_SPACE_BT2020_LINEAR_EXT = 1000104007, + VK_COLOR_SPACE_HDR10_ST2084_EXT = 1000104008, + VK_COLOR_SPACE_DOLBYVISION_EXT = 1000104009, + VK_COLOR_SPACE_HDR10_HLG_EXT = 1000104010, + VK_COLOR_SPACE_ADOBERGB_LINEAR_EXT = 1000104011, + VK_COLOR_SPACE_ADOBERGB_NONLINEAR_EXT = 1000104012, + VK_COLOR_SPACE_PASS_THROUGH_EXT = 1000104013, + VK_COLOR_SPACE_EXTENDED_SRGB_NONLINEAR_EXT = 1000104014, VK_COLOR_SPACE_BEGIN_RANGE_KHR = VK_COLOR_SPACE_SRGB_NONLINEAR_KHR, VK_COLOR_SPACE_END_RANGE_KHR = VK_COLOR_SPACE_SRGB_NONLINEAR_KHR, VK_COLOR_SPACE_RANGE_SIZE_KHR = (VK_COLOR_SPACE_SRGB_NONLINEAR_KHR - VK_COLOR_SPACE_SRGB_NONLINEAR_KHR + 1), @@ -3208,6 +3444,8 @@ typedef enum VkPresentModeKHR { VK_PRESENT_MODE_MAILBOX_KHR = 1, VK_PRESENT_MODE_FIFO_KHR = 2, VK_PRESENT_MODE_FIFO_RELAXED_KHR = 3, + VK_PRESENT_MODE_SHARED_DEMAND_REFRESH_KHR = 1000111000, + VK_PRESENT_MODE_SHARED_CONTINUOUS_REFRESH_KHR = 1000111001, VK_PRESENT_MODE_BEGIN_RANGE_KHR = VK_PRESENT_MODE_IMMEDIATE_KHR, VK_PRESENT_MODE_END_RANGE_KHR = VK_PRESENT_MODE_FIFO_RELAXED_KHR, VK_PRESENT_MODE_RANGE_SIZE_KHR = (VK_PRESENT_MODE_FIFO_RELAXED_KHR - VK_PRESENT_MODE_IMMEDIATE_KHR + 1), @@ -3299,6 +3537,11 @@ VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkSwapchainKHR) #define VK_KHR_SWAPCHAIN_SPEC_VERSION 68 #define VK_KHR_SWAPCHAIN_EXTENSION_NAME "VK_KHR_swapchain" + +typedef enum VkSwapchainCreateFlagBitsKHR { + VK_SWAPCHAIN_CREATE_BIND_SFR_BIT_KHX = 0x00000001, + VK_SWAPCHAIN_CREATE_FLAG_BITS_MAX_ENUM_KHR = 0x7FFFFFFF +} VkSwapchainCreateFlagBitsKHR; typedef VkFlags VkSwapchainCreateFlagsKHR; typedef struct VkSwapchainCreateInfoKHR { @@ -3599,7 +3842,7 @@ VKAPI_ATTR VkBool32 VKAPI_CALL vkGetPhysicalDeviceXcbPresentationSupportKHR( #define VK_KHR_wayland_surface 1 #include -#define VK_KHR_WAYLAND_SURFACE_SPEC_VERSION 5 +#define VK_KHR_WAYLAND_SURFACE_SPEC_VERSION 6 #define VK_KHR_WAYLAND_SURFACE_EXTENSION_NAME "VK_KHR_wayland_surface" typedef VkFlags VkWaylandSurfaceCreateFlagsKHR; @@ -3697,7 +3940,7 @@ VKAPI_ATTR VkResult VKAPI_CALL vkCreateAndroidSurfaceKHR( #define VK_KHR_win32_surface 1 #include -#define VK_KHR_WIN32_SURFACE_SPEC_VERSION 5 +#define VK_KHR_WIN32_SURFACE_SPEC_VERSION 6 #define 
VK_KHR_WIN32_SURFACE_EXTENSION_NAME "VK_KHR_win32_surface" typedef VkFlags VkWin32SurfaceCreateFlagsKHR; @@ -3732,426 +3975,2480 @@ VKAPI_ATTR VkBool32 VKAPI_CALL vkGetPhysicalDeviceWin32PresentationSupportKHR( #define VK_KHR_SAMPLER_MIRROR_CLAMP_TO_EDGE_EXTENSION_NAME "VK_KHR_sampler_mirror_clamp_to_edge" -#define VK_EXT_debug_report 1 -VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkDebugReportCallbackEXT) - -#define VK_EXT_DEBUG_REPORT_SPEC_VERSION 3 -#define VK_EXT_DEBUG_REPORT_EXTENSION_NAME "VK_EXT_debug_report" -#define VK_STRUCTURE_TYPE_DEBUG_REPORT_CREATE_INFO_EXT VK_STRUCTURE_TYPE_DEBUG_REPORT_CALLBACK_CREATE_INFO_EXT +#define VK_KHR_get_physical_device_properties2 1 +#define VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_SPEC_VERSION 1 +#define VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_EXTENSION_NAME "VK_KHR_get_physical_device_properties2" +typedef struct VkPhysicalDeviceFeatures2KHR { + VkStructureType sType; + void* pNext; + VkPhysicalDeviceFeatures features; +} VkPhysicalDeviceFeatures2KHR; -typedef enum VkDebugReportObjectTypeEXT { - VK_DEBUG_REPORT_OBJECT_TYPE_UNKNOWN_EXT = 0, - VK_DEBUG_REPORT_OBJECT_TYPE_INSTANCE_EXT = 1, - VK_DEBUG_REPORT_OBJECT_TYPE_PHYSICAL_DEVICE_EXT = 2, - VK_DEBUG_REPORT_OBJECT_TYPE_DEVICE_EXT = 3, - VK_DEBUG_REPORT_OBJECT_TYPE_QUEUE_EXT = 4, - VK_DEBUG_REPORT_OBJECT_TYPE_SEMAPHORE_EXT = 5, - VK_DEBUG_REPORT_OBJECT_TYPE_COMMAND_BUFFER_EXT = 6, - VK_DEBUG_REPORT_OBJECT_TYPE_FENCE_EXT = 7, - VK_DEBUG_REPORT_OBJECT_TYPE_DEVICE_MEMORY_EXT = 8, - VK_DEBUG_REPORT_OBJECT_TYPE_BUFFER_EXT = 9, - VK_DEBUG_REPORT_OBJECT_TYPE_IMAGE_EXT = 10, - VK_DEBUG_REPORT_OBJECT_TYPE_EVENT_EXT = 11, - VK_DEBUG_REPORT_OBJECT_TYPE_QUERY_POOL_EXT = 12, - VK_DEBUG_REPORT_OBJECT_TYPE_BUFFER_VIEW_EXT = 13, - VK_DEBUG_REPORT_OBJECT_TYPE_IMAGE_VIEW_EXT = 14, - VK_DEBUG_REPORT_OBJECT_TYPE_SHADER_MODULE_EXT = 15, - VK_DEBUG_REPORT_OBJECT_TYPE_PIPELINE_CACHE_EXT = 16, - VK_DEBUG_REPORT_OBJECT_TYPE_PIPELINE_LAYOUT_EXT = 17, - VK_DEBUG_REPORT_OBJECT_TYPE_RENDER_PASS_EXT = 18, - VK_DEBUG_REPORT_OBJECT_TYPE_PIPELINE_EXT = 19, - VK_DEBUG_REPORT_OBJECT_TYPE_DESCRIPTOR_SET_LAYOUT_EXT = 20, - VK_DEBUG_REPORT_OBJECT_TYPE_SAMPLER_EXT = 21, - VK_DEBUG_REPORT_OBJECT_TYPE_DESCRIPTOR_POOL_EXT = 22, - VK_DEBUG_REPORT_OBJECT_TYPE_DESCRIPTOR_SET_EXT = 23, - VK_DEBUG_REPORT_OBJECT_TYPE_FRAMEBUFFER_EXT = 24, - VK_DEBUG_REPORT_OBJECT_TYPE_COMMAND_POOL_EXT = 25, - VK_DEBUG_REPORT_OBJECT_TYPE_SURFACE_KHR_EXT = 26, - VK_DEBUG_REPORT_OBJECT_TYPE_SWAPCHAIN_KHR_EXT = 27, - VK_DEBUG_REPORT_OBJECT_TYPE_DEBUG_REPORT_EXT = 28, - VK_DEBUG_REPORT_OBJECT_TYPE_BEGIN_RANGE_EXT = VK_DEBUG_REPORT_OBJECT_TYPE_UNKNOWN_EXT, - VK_DEBUG_REPORT_OBJECT_TYPE_END_RANGE_EXT = VK_DEBUG_REPORT_OBJECT_TYPE_DEBUG_REPORT_EXT, - VK_DEBUG_REPORT_OBJECT_TYPE_RANGE_SIZE_EXT = (VK_DEBUG_REPORT_OBJECT_TYPE_DEBUG_REPORT_EXT - VK_DEBUG_REPORT_OBJECT_TYPE_UNKNOWN_EXT + 1), - VK_DEBUG_REPORT_OBJECT_TYPE_MAX_ENUM_EXT = 0x7FFFFFFF -} VkDebugReportObjectTypeEXT; +typedef struct VkPhysicalDeviceProperties2KHR { + VkStructureType sType; + void* pNext; + VkPhysicalDeviceProperties properties; +} VkPhysicalDeviceProperties2KHR; -typedef enum VkDebugReportErrorEXT { - VK_DEBUG_REPORT_ERROR_NONE_EXT = 0, - VK_DEBUG_REPORT_ERROR_CALLBACK_REF_EXT = 1, - VK_DEBUG_REPORT_ERROR_BEGIN_RANGE_EXT = VK_DEBUG_REPORT_ERROR_NONE_EXT, - VK_DEBUG_REPORT_ERROR_END_RANGE_EXT = VK_DEBUG_REPORT_ERROR_CALLBACK_REF_EXT, - VK_DEBUG_REPORT_ERROR_RANGE_SIZE_EXT = (VK_DEBUG_REPORT_ERROR_CALLBACK_REF_EXT - VK_DEBUG_REPORT_ERROR_NONE_EXT + 1), - VK_DEBUG_REPORT_ERROR_MAX_ENUM_EXT = 0x7FFFFFFF -} 
VkDebugReportErrorEXT; +typedef struct VkFormatProperties2KHR { + VkStructureType sType; + void* pNext; + VkFormatProperties formatProperties; +} VkFormatProperties2KHR; +typedef struct VkImageFormatProperties2KHR { + VkStructureType sType; + void* pNext; + VkImageFormatProperties imageFormatProperties; +} VkImageFormatProperties2KHR; -typedef enum VkDebugReportFlagBitsEXT { - VK_DEBUG_REPORT_INFORMATION_BIT_EXT = 0x00000001, - VK_DEBUG_REPORT_WARNING_BIT_EXT = 0x00000002, - VK_DEBUG_REPORT_PERFORMANCE_WARNING_BIT_EXT = 0x00000004, - VK_DEBUG_REPORT_ERROR_BIT_EXT = 0x00000008, - VK_DEBUG_REPORT_DEBUG_BIT_EXT = 0x00000010, - VK_DEBUG_REPORT_FLAG_BITS_MAX_ENUM_EXT = 0x7FFFFFFF -} VkDebugReportFlagBitsEXT; -typedef VkFlags VkDebugReportFlagsEXT; +typedef struct VkPhysicalDeviceImageFormatInfo2KHR { + VkStructureType sType; + const void* pNext; + VkFormat format; + VkImageType type; + VkImageTiling tiling; + VkImageUsageFlags usage; + VkImageCreateFlags flags; +} VkPhysicalDeviceImageFormatInfo2KHR; + +typedef struct VkQueueFamilyProperties2KHR { + VkStructureType sType; + void* pNext; + VkQueueFamilyProperties queueFamilyProperties; +} VkQueueFamilyProperties2KHR; -typedef VkBool32 (VKAPI_PTR *PFN_vkDebugReportCallbackEXT)( - VkDebugReportFlagsEXT flags, - VkDebugReportObjectTypeEXT objectType, - uint64_t object, - size_t location, - int32_t messageCode, - const char* pLayerPrefix, - const char* pMessage, - void* pUserData); +typedef struct VkPhysicalDeviceMemoryProperties2KHR { + VkStructureType sType; + void* pNext; + VkPhysicalDeviceMemoryProperties memoryProperties; +} VkPhysicalDeviceMemoryProperties2KHR; +typedef struct VkSparseImageFormatProperties2KHR { + VkStructureType sType; + void* pNext; + VkSparseImageFormatProperties properties; +} VkSparseImageFormatProperties2KHR; -typedef struct VkDebugReportCallbackCreateInfoEXT { - VkStructureType sType; - const void* pNext; - VkDebugReportFlagsEXT flags; - PFN_vkDebugReportCallbackEXT pfnCallback; - void* pUserData; -} VkDebugReportCallbackCreateInfoEXT; +typedef struct VkPhysicalDeviceSparseImageFormatInfo2KHR { + VkStructureType sType; + const void* pNext; + VkFormat format; + VkImageType type; + VkSampleCountFlagBits samples; + VkImageUsageFlags usage; + VkImageTiling tiling; +} VkPhysicalDeviceSparseImageFormatInfo2KHR; -typedef VkResult (VKAPI_PTR *PFN_vkCreateDebugReportCallbackEXT)(VkInstance instance, const VkDebugReportCallbackCreateInfoEXT* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkDebugReportCallbackEXT* pCallback); -typedef void (VKAPI_PTR *PFN_vkDestroyDebugReportCallbackEXT)(VkInstance instance, VkDebugReportCallbackEXT callback, const VkAllocationCallbacks* pAllocator); -typedef void (VKAPI_PTR *PFN_vkDebugReportMessageEXT)(VkInstance instance, VkDebugReportFlagsEXT flags, VkDebugReportObjectTypeEXT objectType, uint64_t object, size_t location, int32_t messageCode, const char* pLayerPrefix, const char* pMessage); +typedef void (VKAPI_PTR *PFN_vkGetPhysicalDeviceFeatures2KHR)(VkPhysicalDevice physicalDevice, VkPhysicalDeviceFeatures2KHR* pFeatures); +typedef void (VKAPI_PTR *PFN_vkGetPhysicalDeviceProperties2KHR)(VkPhysicalDevice physicalDevice, VkPhysicalDeviceProperties2KHR* pProperties); +typedef void (VKAPI_PTR *PFN_vkGetPhysicalDeviceFormatProperties2KHR)(VkPhysicalDevice physicalDevice, VkFormat format, VkFormatProperties2KHR* pFormatProperties); +typedef VkResult (VKAPI_PTR *PFN_vkGetPhysicalDeviceImageFormatProperties2KHR)(VkPhysicalDevice physicalDevice, const VkPhysicalDeviceImageFormatInfo2KHR* 
pImageFormatInfo, VkImageFormatProperties2KHR* pImageFormatProperties); +typedef void (VKAPI_PTR *PFN_vkGetPhysicalDeviceQueueFamilyProperties2KHR)(VkPhysicalDevice physicalDevice, uint32_t* pQueueFamilyPropertyCount, VkQueueFamilyProperties2KHR* pQueueFamilyProperties); +typedef void (VKAPI_PTR *PFN_vkGetPhysicalDeviceMemoryProperties2KHR)(VkPhysicalDevice physicalDevice, VkPhysicalDeviceMemoryProperties2KHR* pMemoryProperties); +typedef void (VKAPI_PTR *PFN_vkGetPhysicalDeviceSparseImageFormatProperties2KHR)(VkPhysicalDevice physicalDevice, const VkPhysicalDeviceSparseImageFormatInfo2KHR* pFormatInfo, uint32_t* pPropertyCount, VkSparseImageFormatProperties2KHR* pProperties); #ifndef VK_NO_PROTOTYPES -VKAPI_ATTR VkResult VKAPI_CALL vkCreateDebugReportCallbackEXT( - VkInstance instance, - const VkDebugReportCallbackCreateInfoEXT* pCreateInfo, - const VkAllocationCallbacks* pAllocator, - VkDebugReportCallbackEXT* pCallback); +VKAPI_ATTR void VKAPI_CALL vkGetPhysicalDeviceFeatures2KHR( + VkPhysicalDevice physicalDevice, + VkPhysicalDeviceFeatures2KHR* pFeatures); -VKAPI_ATTR void VKAPI_CALL vkDestroyDebugReportCallbackEXT( - VkInstance instance, - VkDebugReportCallbackEXT callback, - const VkAllocationCallbacks* pAllocator); +VKAPI_ATTR void VKAPI_CALL vkGetPhysicalDeviceProperties2KHR( + VkPhysicalDevice physicalDevice, + VkPhysicalDeviceProperties2KHR* pProperties); -VKAPI_ATTR void VKAPI_CALL vkDebugReportMessageEXT( - VkInstance instance, - VkDebugReportFlagsEXT flags, - VkDebugReportObjectTypeEXT objectType, - uint64_t object, - size_t location, - int32_t messageCode, - const char* pLayerPrefix, - const char* pMessage); +VKAPI_ATTR void VKAPI_CALL vkGetPhysicalDeviceFormatProperties2KHR( + VkPhysicalDevice physicalDevice, + VkFormat format, + VkFormatProperties2KHR* pFormatProperties); + +VKAPI_ATTR VkResult VKAPI_CALL vkGetPhysicalDeviceImageFormatProperties2KHR( + VkPhysicalDevice physicalDevice, + const VkPhysicalDeviceImageFormatInfo2KHR* pImageFormatInfo, + VkImageFormatProperties2KHR* pImageFormatProperties); + +VKAPI_ATTR void VKAPI_CALL vkGetPhysicalDeviceQueueFamilyProperties2KHR( + VkPhysicalDevice physicalDevice, + uint32_t* pQueueFamilyPropertyCount, + VkQueueFamilyProperties2KHR* pQueueFamilyProperties); + +VKAPI_ATTR void VKAPI_CALL vkGetPhysicalDeviceMemoryProperties2KHR( + VkPhysicalDevice physicalDevice, + VkPhysicalDeviceMemoryProperties2KHR* pMemoryProperties); + +VKAPI_ATTR void VKAPI_CALL vkGetPhysicalDeviceSparseImageFormatProperties2KHR( + VkPhysicalDevice physicalDevice, + const VkPhysicalDeviceSparseImageFormatInfo2KHR* pFormatInfo, + uint32_t* pPropertyCount, + VkSparseImageFormatProperties2KHR* pProperties); #endif -#define VK_NV_glsl_shader 1 -#define VK_NV_GLSL_SHADER_SPEC_VERSION 1 -#define VK_NV_GLSL_SHADER_EXTENSION_NAME "VK_NV_glsl_shader" +#define VK_KHR_shader_draw_parameters 1 +#define VK_KHR_SHADER_DRAW_PARAMETERS_SPEC_VERSION 1 +#define VK_KHR_SHADER_DRAW_PARAMETERS_EXTENSION_NAME "VK_KHR_shader_draw_parameters" -#define VK_IMG_filter_cubic 1 -#define VK_IMG_FILTER_CUBIC_SPEC_VERSION 1 -#define VK_IMG_FILTER_CUBIC_EXTENSION_NAME "VK_IMG_filter_cubic" +#define VK_KHR_maintenance1 1 +#define VK_KHR_MAINTENANCE1_SPEC_VERSION 1 +#define VK_KHR_MAINTENANCE1_EXTENSION_NAME "VK_KHR_maintenance1" +typedef VkFlags VkCommandPoolTrimFlagsKHR; -#define VK_AMD_rasterization_order 1 -#define VK_AMD_RASTERIZATION_ORDER_SPEC_VERSION 1 -#define VK_AMD_RASTERIZATION_ORDER_EXTENSION_NAME "VK_AMD_rasterization_order" +typedef void (VKAPI_PTR 
*PFN_vkTrimCommandPoolKHR)(VkDevice device, VkCommandPool commandPool, VkCommandPoolTrimFlagsKHR flags); +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR void VKAPI_CALL vkTrimCommandPoolKHR( + VkDevice device, + VkCommandPool commandPool, + VkCommandPoolTrimFlagsKHR flags); +#endif -typedef enum VkRasterizationOrderAMD { - VK_RASTERIZATION_ORDER_STRICT_AMD = 0, - VK_RASTERIZATION_ORDER_RELAXED_AMD = 1, - VK_RASTERIZATION_ORDER_BEGIN_RANGE_AMD = VK_RASTERIZATION_ORDER_STRICT_AMD, - VK_RASTERIZATION_ORDER_END_RANGE_AMD = VK_RASTERIZATION_ORDER_RELAXED_AMD, - VK_RASTERIZATION_ORDER_RANGE_SIZE_AMD = (VK_RASTERIZATION_ORDER_RELAXED_AMD - VK_RASTERIZATION_ORDER_STRICT_AMD + 1), - VK_RASTERIZATION_ORDER_MAX_ENUM_AMD = 0x7FFFFFFF -} VkRasterizationOrderAMD; +#define VK_KHR_external_memory_capabilities 1 +#define VK_LUID_SIZE_KHR 8 +#define VK_KHR_EXTERNAL_MEMORY_CAPABILITIES_SPEC_VERSION 1 +#define VK_KHR_EXTERNAL_MEMORY_CAPABILITIES_EXTENSION_NAME "VK_KHR_external_memory_capabilities" + + +typedef enum VkExternalMemoryHandleTypeFlagBitsKHR { + VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT_KHR = 0x00000001, + VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT_KHR = 0x00000002, + VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT_KHR = 0x00000004, + VK_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_TEXTURE_BIT_KHR = 0x00000008, + VK_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_TEXTURE_KMT_BIT_KHR = 0x00000010, + VK_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP_BIT_KHR = 0x00000020, + VK_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE_BIT_KHR = 0x00000040, + VK_EXTERNAL_MEMORY_HANDLE_TYPE_FLAG_BITS_MAX_ENUM_KHR = 0x7FFFFFFF +} VkExternalMemoryHandleTypeFlagBitsKHR; +typedef VkFlags VkExternalMemoryHandleTypeFlagsKHR; + +typedef enum VkExternalMemoryFeatureFlagBitsKHR { + VK_EXTERNAL_MEMORY_FEATURE_DEDICATED_ONLY_BIT_KHR = 0x00000001, + VK_EXTERNAL_MEMORY_FEATURE_EXPORTABLE_BIT_KHR = 0x00000002, + VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT_KHR = 0x00000004, + VK_EXTERNAL_MEMORY_FEATURE_FLAG_BITS_MAX_ENUM_KHR = 0x7FFFFFFF +} VkExternalMemoryFeatureFlagBitsKHR; +typedef VkFlags VkExternalMemoryFeatureFlagsKHR; + +typedef struct VkExternalMemoryPropertiesKHR { + VkExternalMemoryFeatureFlagsKHR externalMemoryFeatures; + VkExternalMemoryHandleTypeFlagsKHR exportFromImportedHandleTypes; + VkExternalMemoryHandleTypeFlagsKHR compatibleHandleTypes; +} VkExternalMemoryPropertiesKHR; + +typedef struct VkPhysicalDeviceExternalImageFormatInfoKHR { + VkStructureType sType; + const void* pNext; + VkExternalMemoryHandleTypeFlagBitsKHR handleType; +} VkPhysicalDeviceExternalImageFormatInfoKHR; -typedef struct VkPipelineRasterizationStateRasterizationOrderAMD { - VkStructureType sType; - const void* pNext; - VkRasterizationOrderAMD rasterizationOrder; -} VkPipelineRasterizationStateRasterizationOrderAMD; +typedef struct VkExternalImageFormatPropertiesKHR { + VkStructureType sType; + void* pNext; + VkExternalMemoryPropertiesKHR externalMemoryProperties; +} VkExternalImageFormatPropertiesKHR; + +typedef struct VkPhysicalDeviceExternalBufferInfoKHR { + VkStructureType sType; + const void* pNext; + VkBufferCreateFlags flags; + VkBufferUsageFlags usage; + VkExternalMemoryHandleTypeFlagBitsKHR handleType; +} VkPhysicalDeviceExternalBufferInfoKHR; +typedef struct VkExternalBufferPropertiesKHR { + VkStructureType sType; + void* pNext; + VkExternalMemoryPropertiesKHR externalMemoryProperties; +} VkExternalBufferPropertiesKHR; +typedef struct VkPhysicalDeviceIDPropertiesKHR { + VkStructureType sType; + void* pNext; + uint8_t deviceUUID[VK_UUID_SIZE]; + uint8_t 
driverUUID[VK_UUID_SIZE]; + uint8_t deviceLUID[VK_LUID_SIZE_KHR]; + uint32_t deviceNodeMask; + VkBool32 deviceLUIDValid; +} VkPhysicalDeviceIDPropertiesKHR; -#define VK_AMD_shader_trinary_minmax 1 -#define VK_AMD_SHADER_TRINARY_MINMAX_SPEC_VERSION 1 -#define VK_AMD_SHADER_TRINARY_MINMAX_EXTENSION_NAME "VK_AMD_shader_trinary_minmax" +typedef void (VKAPI_PTR *PFN_vkGetPhysicalDeviceExternalBufferPropertiesKHR)(VkPhysicalDevice physicalDevice, const VkPhysicalDeviceExternalBufferInfoKHR* pExternalBufferInfo, VkExternalBufferPropertiesKHR* pExternalBufferProperties); -#define VK_AMD_shader_explicit_vertex_parameter 1 -#define VK_AMD_SHADER_EXPLICIT_VERTEX_PARAMETER_SPEC_VERSION 1 -#define VK_AMD_SHADER_EXPLICIT_VERTEX_PARAMETER_EXTENSION_NAME "VK_AMD_shader_explicit_vertex_parameter" +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR void VKAPI_CALL vkGetPhysicalDeviceExternalBufferPropertiesKHR( + VkPhysicalDevice physicalDevice, + const VkPhysicalDeviceExternalBufferInfoKHR* pExternalBufferInfo, + VkExternalBufferPropertiesKHR* pExternalBufferProperties); +#endif +#define VK_KHR_external_memory 1 +#define VK_KHR_EXTERNAL_MEMORY_SPEC_VERSION 1 +#define VK_KHR_EXTERNAL_MEMORY_EXTENSION_NAME "VK_KHR_external_memory" +#define VK_QUEUE_FAMILY_EXTERNAL_KHR (~0U-1) -#define VK_EXT_debug_marker 1 -#define VK_EXT_DEBUG_MARKER_SPEC_VERSION 3 -#define VK_EXT_DEBUG_MARKER_EXTENSION_NAME "VK_EXT_debug_marker" +typedef struct VkExternalMemoryImageCreateInfoKHR { + VkStructureType sType; + const void* pNext; + VkExternalMemoryHandleTypeFlagsKHR handleTypes; +} VkExternalMemoryImageCreateInfoKHR; -typedef struct VkDebugMarkerObjectNameInfoEXT { - VkStructureType sType; - const void* pNext; - VkDebugReportObjectTypeEXT objectType; - uint64_t object; - const char* pObjectName; -} VkDebugMarkerObjectNameInfoEXT; +typedef struct VkExternalMemoryBufferCreateInfoKHR { + VkStructureType sType; + const void* pNext; + VkExternalMemoryHandleTypeFlagsKHR handleTypes; +} VkExternalMemoryBufferCreateInfoKHR; -typedef struct VkDebugMarkerObjectTagInfoEXT { +typedef struct VkExportMemoryAllocateInfoKHR { + VkStructureType sType; + const void* pNext; + VkExternalMemoryHandleTypeFlagsKHR handleTypes; +} VkExportMemoryAllocateInfoKHR; + + + +#ifdef VK_USE_PLATFORM_WIN32_KHR +#define VK_KHR_external_memory_win32 1 +#define VK_KHR_EXTERNAL_MEMORY_WIN32_SPEC_VERSION 1 +#define VK_KHR_EXTERNAL_MEMORY_WIN32_EXTENSION_NAME "VK_KHR_external_memory_win32" + +typedef struct VkImportMemoryWin32HandleInfoKHR { + VkStructureType sType; + const void* pNext; + VkExternalMemoryHandleTypeFlagBitsKHR handleType; + HANDLE handle; + LPCWSTR name; +} VkImportMemoryWin32HandleInfoKHR; + +typedef struct VkExportMemoryWin32HandleInfoKHR { VkStructureType sType; const void* pNext; - VkDebugReportObjectTypeEXT objectType; - uint64_t object; - uint64_t tagName; - size_t tagSize; - const void* pTag; -} VkDebugMarkerObjectTagInfoEXT; + const SECURITY_ATTRIBUTES* pAttributes; + DWORD dwAccess; + LPCWSTR name; +} VkExportMemoryWin32HandleInfoKHR; -typedef struct VkDebugMarkerMarkerInfoEXT { +typedef struct VkMemoryWin32HandlePropertiesKHR { VkStructureType sType; - const void* pNext; - const char* pMarkerName; - float color[4]; -} VkDebugMarkerMarkerInfoEXT; + void* pNext; + uint32_t memoryTypeBits; +} VkMemoryWin32HandlePropertiesKHR; +typedef struct VkMemoryGetWin32HandleInfoKHR { + VkStructureType sType; + const void* pNext; + VkDeviceMemory memory; + VkExternalMemoryHandleTypeFlagBitsKHR handleType; +} VkMemoryGetWin32HandleInfoKHR; -typedef VkResult (VKAPI_PTR 
*PFN_vkDebugMarkerSetObjectTagEXT)(VkDevice device, VkDebugMarkerObjectTagInfoEXT* pTagInfo); -typedef VkResult (VKAPI_PTR *PFN_vkDebugMarkerSetObjectNameEXT)(VkDevice device, VkDebugMarkerObjectNameInfoEXT* pNameInfo); -typedef void (VKAPI_PTR *PFN_vkCmdDebugMarkerBeginEXT)(VkCommandBuffer commandBuffer, VkDebugMarkerMarkerInfoEXT* pMarkerInfo); -typedef void (VKAPI_PTR *PFN_vkCmdDebugMarkerEndEXT)(VkCommandBuffer commandBuffer); -typedef void (VKAPI_PTR *PFN_vkCmdDebugMarkerInsertEXT)(VkCommandBuffer commandBuffer, VkDebugMarkerMarkerInfoEXT* pMarkerInfo); + +typedef VkResult (VKAPI_PTR *PFN_vkGetMemoryWin32HandleKHR)(VkDevice device, const VkMemoryGetWin32HandleInfoKHR* pGetWin32HandleInfo, HANDLE* pHandle); +typedef VkResult (VKAPI_PTR *PFN_vkGetMemoryWin32HandlePropertiesKHR)(VkDevice device, VkExternalMemoryHandleTypeFlagBitsKHR handleType, HANDLE handle, VkMemoryWin32HandlePropertiesKHR* pMemoryWin32HandleProperties); #ifndef VK_NO_PROTOTYPES -VKAPI_ATTR VkResult VKAPI_CALL vkDebugMarkerSetObjectTagEXT( +VKAPI_ATTR VkResult VKAPI_CALL vkGetMemoryWin32HandleKHR( VkDevice device, - VkDebugMarkerObjectTagInfoEXT* pTagInfo); + const VkMemoryGetWin32HandleInfoKHR* pGetWin32HandleInfo, + HANDLE* pHandle); -VKAPI_ATTR VkResult VKAPI_CALL vkDebugMarkerSetObjectNameEXT( +VKAPI_ATTR VkResult VKAPI_CALL vkGetMemoryWin32HandlePropertiesKHR( VkDevice device, - VkDebugMarkerObjectNameInfoEXT* pNameInfo); + VkExternalMemoryHandleTypeFlagBitsKHR handleType, + HANDLE handle, + VkMemoryWin32HandlePropertiesKHR* pMemoryWin32HandleProperties); +#endif +#endif /* VK_USE_PLATFORM_WIN32_KHR */ -VKAPI_ATTR void VKAPI_CALL vkCmdDebugMarkerBeginEXT( - VkCommandBuffer commandBuffer, - VkDebugMarkerMarkerInfoEXT* pMarkerInfo); +#define VK_KHR_external_memory_fd 1 +#define VK_KHR_EXTERNAL_MEMORY_FD_SPEC_VERSION 1 +#define VK_KHR_EXTERNAL_MEMORY_FD_EXTENSION_NAME "VK_KHR_external_memory_fd" -VKAPI_ATTR void VKAPI_CALL vkCmdDebugMarkerEndEXT( - VkCommandBuffer commandBuffer); +typedef struct VkImportMemoryFdInfoKHR { + VkStructureType sType; + const void* pNext; + VkExternalMemoryHandleTypeFlagBitsKHR handleType; + int fd; +} VkImportMemoryFdInfoKHR; -VKAPI_ATTR void VKAPI_CALL vkCmdDebugMarkerInsertEXT( - VkCommandBuffer commandBuffer, - VkDebugMarkerMarkerInfoEXT* pMarkerInfo); +typedef struct VkMemoryFdPropertiesKHR { + VkStructureType sType; + void* pNext; + uint32_t memoryTypeBits; +} VkMemoryFdPropertiesKHR; + +typedef struct VkMemoryGetFdInfoKHR { + VkStructureType sType; + const void* pNext; + VkDeviceMemory memory; + VkExternalMemoryHandleTypeFlagBitsKHR handleType; +} VkMemoryGetFdInfoKHR; + + +typedef VkResult (VKAPI_PTR *PFN_vkGetMemoryFdKHR)(VkDevice device, const VkMemoryGetFdInfoKHR* pGetFdInfo, int* pFd); +typedef VkResult (VKAPI_PTR *PFN_vkGetMemoryFdPropertiesKHR)(VkDevice device, VkExternalMemoryHandleTypeFlagBitsKHR handleType, int fd, VkMemoryFdPropertiesKHR* pMemoryFdProperties); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR VkResult VKAPI_CALL vkGetMemoryFdKHR( + VkDevice device, + const VkMemoryGetFdInfoKHR* pGetFdInfo, + int* pFd); + +VKAPI_ATTR VkResult VKAPI_CALL vkGetMemoryFdPropertiesKHR( + VkDevice device, + VkExternalMemoryHandleTypeFlagBitsKHR handleType, + int fd, + VkMemoryFdPropertiesKHR* pMemoryFdProperties); #endif -#define VK_AMD_gcn_shader 1 -#define VK_AMD_GCN_SHADER_SPEC_VERSION 1 -#define VK_AMD_GCN_SHADER_EXTENSION_NAME "VK_AMD_gcn_shader" +#ifdef VK_USE_PLATFORM_WIN32_KHR +#define VK_KHR_win32_keyed_mutex 1 +#define VK_KHR_WIN32_KEYED_MUTEX_SPEC_VERSION 1 +#define 
VK_KHR_WIN32_KEYED_MUTEX_EXTENSION_NAME "VK_KHR_win32_keyed_mutex" +typedef struct VkWin32KeyedMutexAcquireReleaseInfoKHR { + VkStructureType sType; + const void* pNext; + uint32_t acquireCount; + const VkDeviceMemory* pAcquireSyncs; + const uint64_t* pAcquireKeys; + const uint32_t* pAcquireTimeouts; + uint32_t releaseCount; + const VkDeviceMemory* pReleaseSyncs; + const uint64_t* pReleaseKeys; +} VkWin32KeyedMutexAcquireReleaseInfoKHR; -#define VK_NV_dedicated_allocation 1 -#define VK_NV_DEDICATED_ALLOCATION_SPEC_VERSION 1 -#define VK_NV_DEDICATED_ALLOCATION_EXTENSION_NAME "VK_NV_dedicated_allocation" -typedef struct VkDedicatedAllocationImageCreateInfoNV { - VkStructureType sType; - const void* pNext; - VkBool32 dedicatedAllocation; -} VkDedicatedAllocationImageCreateInfoNV; +#endif /* VK_USE_PLATFORM_WIN32_KHR */ -typedef struct VkDedicatedAllocationBufferCreateInfoNV { - VkStructureType sType; - const void* pNext; - VkBool32 dedicatedAllocation; -} VkDedicatedAllocationBufferCreateInfoNV; +#define VK_KHR_external_semaphore_capabilities 1 +#define VK_KHR_EXTERNAL_SEMAPHORE_CAPABILITIES_SPEC_VERSION 1 +#define VK_KHR_EXTERNAL_SEMAPHORE_CAPABILITIES_EXTENSION_NAME "VK_KHR_external_semaphore_capabilities" + + +typedef enum VkExternalSemaphoreHandleTypeFlagBitsKHR { + VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT_KHR = 0x00000001, + VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT_KHR = 0x00000002, + VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT_KHR = 0x00000004, + VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE_BIT_KHR = 0x00000008, + VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT_KHR = 0x00000010, + VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_FLAG_BITS_MAX_ENUM_KHR = 0x7FFFFFFF +} VkExternalSemaphoreHandleTypeFlagBitsKHR; +typedef VkFlags VkExternalSemaphoreHandleTypeFlagsKHR; + +typedef enum VkExternalSemaphoreFeatureFlagBitsKHR { + VK_EXTERNAL_SEMAPHORE_FEATURE_EXPORTABLE_BIT_KHR = 0x00000001, + VK_EXTERNAL_SEMAPHORE_FEATURE_IMPORTABLE_BIT_KHR = 0x00000002, + VK_EXTERNAL_SEMAPHORE_FEATURE_FLAG_BITS_MAX_ENUM_KHR = 0x7FFFFFFF +} VkExternalSemaphoreFeatureFlagBitsKHR; +typedef VkFlags VkExternalSemaphoreFeatureFlagsKHR; + +typedef struct VkPhysicalDeviceExternalSemaphoreInfoKHR { + VkStructureType sType; + const void* pNext; + VkExternalSemaphoreHandleTypeFlagBitsKHR handleType; +} VkPhysicalDeviceExternalSemaphoreInfoKHR; -typedef struct VkDedicatedAllocationMemoryAllocateInfoNV { +typedef struct VkExternalSemaphorePropertiesKHR { + VkStructureType sType; + void* pNext; + VkExternalSemaphoreHandleTypeFlagsKHR exportFromImportedHandleTypes; + VkExternalSemaphoreHandleTypeFlagsKHR compatibleHandleTypes; + VkExternalSemaphoreFeatureFlagsKHR externalSemaphoreFeatures; +} VkExternalSemaphorePropertiesKHR; + + +typedef void (VKAPI_PTR *PFN_vkGetPhysicalDeviceExternalSemaphorePropertiesKHR)(VkPhysicalDevice physicalDevice, const VkPhysicalDeviceExternalSemaphoreInfoKHR* pExternalSemaphoreInfo, VkExternalSemaphorePropertiesKHR* pExternalSemaphoreProperties); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR void VKAPI_CALL vkGetPhysicalDeviceExternalSemaphorePropertiesKHR( + VkPhysicalDevice physicalDevice, + const VkPhysicalDeviceExternalSemaphoreInfoKHR* pExternalSemaphoreInfo, + VkExternalSemaphorePropertiesKHR* pExternalSemaphoreProperties); +#endif + +#define VK_KHR_external_semaphore 1 +#define VK_KHR_EXTERNAL_SEMAPHORE_SPEC_VERSION 1 +#define VK_KHR_EXTERNAL_SEMAPHORE_EXTENSION_NAME "VK_KHR_external_semaphore" + + +typedef enum VkSemaphoreImportFlagBitsKHR { + VK_SEMAPHORE_IMPORT_TEMPORARY_BIT_KHR = 
0x00000001, + VK_SEMAPHORE_IMPORT_FLAG_BITS_MAX_ENUM_KHR = 0x7FFFFFFF +} VkSemaphoreImportFlagBitsKHR; +typedef VkFlags VkSemaphoreImportFlagsKHR; + +typedef struct VkExportSemaphoreCreateInfoKHR { + VkStructureType sType; + const void* pNext; + VkExternalSemaphoreHandleTypeFlagsKHR handleTypes; +} VkExportSemaphoreCreateInfoKHR; + + + +#ifdef VK_USE_PLATFORM_WIN32_KHR +#define VK_KHR_external_semaphore_win32 1 +#define VK_KHR_EXTERNAL_SEMAPHORE_WIN32_SPEC_VERSION 1 +#define VK_KHR_EXTERNAL_SEMAPHORE_WIN32_EXTENSION_NAME "VK_KHR_external_semaphore_win32" + +typedef struct VkImportSemaphoreWin32HandleInfoKHR { + VkStructureType sType; + const void* pNext; + VkSemaphore semaphore; + VkSemaphoreImportFlagsKHR flags; + VkExternalSemaphoreHandleTypeFlagBitsKHR handleType; + HANDLE handle; + LPCWSTR name; +} VkImportSemaphoreWin32HandleInfoKHR; + +typedef struct VkExportSemaphoreWin32HandleInfoKHR { + VkStructureType sType; + const void* pNext; + const SECURITY_ATTRIBUTES* pAttributes; + DWORD dwAccess; + LPCWSTR name; +} VkExportSemaphoreWin32HandleInfoKHR; + +typedef struct VkD3D12FenceSubmitInfoKHR { VkStructureType sType; const void* pNext; - VkImage image; - VkBuffer buffer; -} VkDedicatedAllocationMemoryAllocateInfoNV; + uint32_t waitSemaphoreValuesCount; + const uint64_t* pWaitSemaphoreValues; + uint32_t signalSemaphoreValuesCount; + const uint64_t* pSignalSemaphoreValues; +} VkD3D12FenceSubmitInfoKHR; + +typedef struct VkSemaphoreGetWin32HandleInfoKHR { + VkStructureType sType; + const void* pNext; + VkSemaphore semaphore; + VkExternalSemaphoreHandleTypeFlagBitsKHR handleType; +} VkSemaphoreGetWin32HandleInfoKHR; +typedef VkResult (VKAPI_PTR *PFN_vkImportSemaphoreWin32HandleKHR)(VkDevice device, const VkImportSemaphoreWin32HandleInfoKHR* pImportSemaphoreWin32HandleInfo); +typedef VkResult (VKAPI_PTR *PFN_vkGetSemaphoreWin32HandleKHR)(VkDevice device, const VkSemaphoreGetWin32HandleInfoKHR* pGetWin32HandleInfo, HANDLE* pHandle); -#define VK_AMD_draw_indirect_count 1 -#define VK_AMD_DRAW_INDIRECT_COUNT_SPEC_VERSION 1 -#define VK_AMD_DRAW_INDIRECT_COUNT_EXTENSION_NAME "VK_AMD_draw_indirect_count" +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR VkResult VKAPI_CALL vkImportSemaphoreWin32HandleKHR( + VkDevice device, + const VkImportSemaphoreWin32HandleInfoKHR* pImportSemaphoreWin32HandleInfo); -typedef void (VKAPI_PTR *PFN_vkCmdDrawIndirectCountAMD)(VkCommandBuffer commandBuffer, VkBuffer buffer, VkDeviceSize offset, VkBuffer countBuffer, VkDeviceSize countBufferOffset, uint32_t maxDrawCount, uint32_t stride); -typedef void (VKAPI_PTR *PFN_vkCmdDrawIndexedIndirectCountAMD)(VkCommandBuffer commandBuffer, VkBuffer buffer, VkDeviceSize offset, VkBuffer countBuffer, VkDeviceSize countBufferOffset, uint32_t maxDrawCount, uint32_t stride); +VKAPI_ATTR VkResult VKAPI_CALL vkGetSemaphoreWin32HandleKHR( + VkDevice device, + const VkSemaphoreGetWin32HandleInfoKHR* pGetWin32HandleInfo, + HANDLE* pHandle); +#endif +#endif /* VK_USE_PLATFORM_WIN32_KHR */ + +#define VK_KHR_external_semaphore_fd 1 +#define VK_KHR_EXTERNAL_SEMAPHORE_FD_SPEC_VERSION 1 +#define VK_KHR_EXTERNAL_SEMAPHORE_FD_EXTENSION_NAME "VK_KHR_external_semaphore_fd" + +typedef struct VkImportSemaphoreFdInfoKHR { + VkStructureType sType; + const void* pNext; + VkSemaphore semaphore; + VkSemaphoreImportFlagsKHR flags; + VkExternalSemaphoreHandleTypeFlagBitsKHR handleType; + int fd; +} VkImportSemaphoreFdInfoKHR; + +typedef struct VkSemaphoreGetFdInfoKHR { + VkStructureType sType; + const void* pNext; + VkSemaphore semaphore; + 
VkExternalSemaphoreHandleTypeFlagBitsKHR handleType; +} VkSemaphoreGetFdInfoKHR; + + +typedef VkResult (VKAPI_PTR *PFN_vkImportSemaphoreFdKHR)(VkDevice device, const VkImportSemaphoreFdInfoKHR* pImportSemaphoreFdInfo); +typedef VkResult (VKAPI_PTR *PFN_vkGetSemaphoreFdKHR)(VkDevice device, const VkSemaphoreGetFdInfoKHR* pGetFdInfo, int* pFd); #ifndef VK_NO_PROTOTYPES -VKAPI_ATTR void VKAPI_CALL vkCmdDrawIndirectCountAMD( - VkCommandBuffer commandBuffer, - VkBuffer buffer, - VkDeviceSize offset, - VkBuffer countBuffer, - VkDeviceSize countBufferOffset, - uint32_t maxDrawCount, - uint32_t stride); +VKAPI_ATTR VkResult VKAPI_CALL vkImportSemaphoreFdKHR( + VkDevice device, + const VkImportSemaphoreFdInfoKHR* pImportSemaphoreFdInfo); -VKAPI_ATTR void VKAPI_CALL vkCmdDrawIndexedIndirectCountAMD( +VKAPI_ATTR VkResult VKAPI_CALL vkGetSemaphoreFdKHR( + VkDevice device, + const VkSemaphoreGetFdInfoKHR* pGetFdInfo, + int* pFd); +#endif + +#define VK_KHR_push_descriptor 1 +#define VK_KHR_PUSH_DESCRIPTOR_SPEC_VERSION 1 +#define VK_KHR_PUSH_DESCRIPTOR_EXTENSION_NAME "VK_KHR_push_descriptor" + +typedef struct VkPhysicalDevicePushDescriptorPropertiesKHR { + VkStructureType sType; + void* pNext; + uint32_t maxPushDescriptors; +} VkPhysicalDevicePushDescriptorPropertiesKHR; + + +typedef void (VKAPI_PTR *PFN_vkCmdPushDescriptorSetKHR)(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint, VkPipelineLayout layout, uint32_t set, uint32_t descriptorWriteCount, const VkWriteDescriptorSet* pDescriptorWrites); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR void VKAPI_CALL vkCmdPushDescriptorSetKHR( VkCommandBuffer commandBuffer, - VkBuffer buffer, - VkDeviceSize offset, - VkBuffer countBuffer, - VkDeviceSize countBufferOffset, - uint32_t maxDrawCount, - uint32_t stride); + VkPipelineBindPoint pipelineBindPoint, + VkPipelineLayout layout, + uint32_t set, + uint32_t descriptorWriteCount, + const VkWriteDescriptorSet* pDescriptorWrites); #endif -#define VK_AMD_negative_viewport_height 1 -#define VK_AMD_NEGATIVE_VIEWPORT_HEIGHT_SPEC_VERSION 0 -#define VK_AMD_NEGATIVE_VIEWPORT_HEIGHT_EXTENSION_NAME "VK_AMD_negative_viewport_height" +#define VK_KHR_16bit_storage 1 +#define VK_KHR_16BIT_STORAGE_SPEC_VERSION 1 +#define VK_KHR_16BIT_STORAGE_EXTENSION_NAME "VK_KHR_16bit_storage" +typedef struct VkPhysicalDevice16BitStorageFeaturesKHR { + VkStructureType sType; + void* pNext; + VkBool32 storageBuffer16BitAccess; + VkBool32 uniformAndStorageBuffer16BitAccess; + VkBool32 storagePushConstant16; + VkBool32 storageInputOutput16; +} VkPhysicalDevice16BitStorageFeaturesKHR; -#define VK_AMD_gpu_shader_half_float 1 -#define VK_AMD_GPU_SHADER_HALF_FLOAT_SPEC_VERSION 1 -#define VK_AMD_GPU_SHADER_HALF_FLOAT_EXTENSION_NAME "VK_AMD_gpu_shader_half_float" -#define VK_AMD_shader_ballot 1 -#define VK_AMD_SHADER_BALLOT_SPEC_VERSION 0 -#define VK_AMD_SHADER_BALLOT_EXTENSION_NAME "VK_AMD_shader_ballot" +#define VK_KHR_incremental_present 1 +#define VK_KHR_INCREMENTAL_PRESENT_SPEC_VERSION 1 +#define VK_KHR_INCREMENTAL_PRESENT_EXTENSION_NAME "VK_KHR_incremental_present" +typedef struct VkRectLayerKHR { + VkOffset2D offset; + VkExtent2D extent; + uint32_t layer; +} VkRectLayerKHR; -#define VK_IMG_format_pvrtc 1 -#define VK_IMG_FORMAT_PVRTC_SPEC_VERSION 1 -#define VK_IMG_FORMAT_PVRTC_EXTENSION_NAME "VK_IMG_format_pvrtc" +typedef struct VkPresentRegionKHR { + uint32_t rectangleCount; + const VkRectLayerKHR* pRectangles; +} VkPresentRegionKHR; +typedef struct VkPresentRegionsKHR { + VkStructureType sType; + const void* pNext; + uint32_t 
swapchainCount; + const VkPresentRegionKHR* pRegions; +} VkPresentRegionsKHR; -#define VK_NV_external_memory_capabilities 1 -#define VK_NV_EXTERNAL_MEMORY_CAPABILITIES_SPEC_VERSION 1 -#define VK_NV_EXTERNAL_MEMORY_CAPABILITIES_EXTENSION_NAME "VK_NV_external_memory_capabilities" -typedef enum VkExternalMemoryHandleTypeFlagBitsNV { - VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT_NV = 0x00000001, - VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT_NV = 0x00000002, - VK_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_IMAGE_BIT_NV = 0x00000004, - VK_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_IMAGE_KMT_BIT_NV = 0x00000008, - VK_EXTERNAL_MEMORY_HANDLE_TYPE_FLAG_BITS_MAX_ENUM_NV = 0x7FFFFFFF -} VkExternalMemoryHandleTypeFlagBitsNV; -typedef VkFlags VkExternalMemoryHandleTypeFlagsNV; +#define VK_KHR_descriptor_update_template 1 +VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkDescriptorUpdateTemplateKHR) -typedef enum VkExternalMemoryFeatureFlagBitsNV { - VK_EXTERNAL_MEMORY_FEATURE_DEDICATED_ONLY_BIT_NV = 0x00000001, - VK_EXTERNAL_MEMORY_FEATURE_EXPORTABLE_BIT_NV = 0x00000002, - VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT_NV = 0x00000004, - VK_EXTERNAL_MEMORY_FEATURE_FLAG_BITS_MAX_ENUM_NV = 0x7FFFFFFF -} VkExternalMemoryFeatureFlagBitsNV; -typedef VkFlags VkExternalMemoryFeatureFlagsNV; +#define VK_KHR_DESCRIPTOR_UPDATE_TEMPLATE_SPEC_VERSION 1 +#define VK_KHR_DESCRIPTOR_UPDATE_TEMPLATE_EXTENSION_NAME "VK_KHR_descriptor_update_template" -typedef struct VkExternalImageFormatPropertiesNV { - VkImageFormatProperties imageFormatProperties; - VkExternalMemoryFeatureFlagsNV externalMemoryFeatures; - VkExternalMemoryHandleTypeFlagsNV exportFromImportedHandleTypes; - VkExternalMemoryHandleTypeFlagsNV compatibleHandleTypes; -} VkExternalImageFormatPropertiesNV; +typedef enum VkDescriptorUpdateTemplateTypeKHR { + VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_DESCRIPTOR_SET_KHR = 0, + VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_PUSH_DESCRIPTORS_KHR = 1, + VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_BEGIN_RANGE_KHR = VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_DESCRIPTOR_SET_KHR, + VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_END_RANGE_KHR = VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_PUSH_DESCRIPTORS_KHR, + VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_RANGE_SIZE_KHR = (VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_PUSH_DESCRIPTORS_KHR - VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_DESCRIPTOR_SET_KHR + 1), + VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_MAX_ENUM_KHR = 0x7FFFFFFF +} VkDescriptorUpdateTemplateTypeKHR; -typedef VkResult (VKAPI_PTR *PFN_vkGetPhysicalDeviceExternalImageFormatPropertiesNV)(VkPhysicalDevice physicalDevice, VkFormat format, VkImageType type, VkImageTiling tiling, VkImageUsageFlags usage, VkImageCreateFlags flags, VkExternalMemoryHandleTypeFlagsNV externalHandleType, VkExternalImageFormatPropertiesNV* pExternalImageFormatProperties); +typedef VkFlags VkDescriptorUpdateTemplateCreateFlagsKHR; + +typedef struct VkDescriptorUpdateTemplateEntryKHR { + uint32_t dstBinding; + uint32_t dstArrayElement; + uint32_t descriptorCount; + VkDescriptorType descriptorType; + size_t offset; + size_t stride; +} VkDescriptorUpdateTemplateEntryKHR; + +typedef struct VkDescriptorUpdateTemplateCreateInfoKHR { + VkStructureType sType; + void* pNext; + VkDescriptorUpdateTemplateCreateFlagsKHR flags; + uint32_t descriptorUpdateEntryCount; + const VkDescriptorUpdateTemplateEntryKHR* pDescriptorUpdateEntries; + VkDescriptorUpdateTemplateTypeKHR templateType; + VkDescriptorSetLayout descriptorSetLayout; + VkPipelineBindPoint pipelineBindPoint; + VkPipelineLayout pipelineLayout; + uint32_t set; +} VkDescriptorUpdateTemplateCreateInfoKHR; + + 
+typedef VkResult (VKAPI_PTR *PFN_vkCreateDescriptorUpdateTemplateKHR)(VkDevice device, const VkDescriptorUpdateTemplateCreateInfoKHR* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkDescriptorUpdateTemplateKHR* pDescriptorUpdateTemplate); +typedef void (VKAPI_PTR *PFN_vkDestroyDescriptorUpdateTemplateKHR)(VkDevice device, VkDescriptorUpdateTemplateKHR descriptorUpdateTemplate, const VkAllocationCallbacks* pAllocator); +typedef void (VKAPI_PTR *PFN_vkUpdateDescriptorSetWithTemplateKHR)(VkDevice device, VkDescriptorSet descriptorSet, VkDescriptorUpdateTemplateKHR descriptorUpdateTemplate, const void* pData); +typedef void (VKAPI_PTR *PFN_vkCmdPushDescriptorSetWithTemplateKHR)(VkCommandBuffer commandBuffer, VkDescriptorUpdateTemplateKHR descriptorUpdateTemplate, VkPipelineLayout layout, uint32_t set, const void* pData); #ifndef VK_NO_PROTOTYPES -VKAPI_ATTR VkResult VKAPI_CALL vkGetPhysicalDeviceExternalImageFormatPropertiesNV( - VkPhysicalDevice physicalDevice, - VkFormat format, - VkImageType type, - VkImageTiling tiling, - VkImageUsageFlags usage, - VkImageCreateFlags flags, - VkExternalMemoryHandleTypeFlagsNV externalHandleType, - VkExternalImageFormatPropertiesNV* pExternalImageFormatProperties); +VKAPI_ATTR VkResult VKAPI_CALL vkCreateDescriptorUpdateTemplateKHR( + VkDevice device, + const VkDescriptorUpdateTemplateCreateInfoKHR* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkDescriptorUpdateTemplateKHR* pDescriptorUpdateTemplate); + +VKAPI_ATTR void VKAPI_CALL vkDestroyDescriptorUpdateTemplateKHR( + VkDevice device, + VkDescriptorUpdateTemplateKHR descriptorUpdateTemplate, + const VkAllocationCallbacks* pAllocator); + +VKAPI_ATTR void VKAPI_CALL vkUpdateDescriptorSetWithTemplateKHR( + VkDevice device, + VkDescriptorSet descriptorSet, + VkDescriptorUpdateTemplateKHR descriptorUpdateTemplate, + const void* pData); + +VKAPI_ATTR void VKAPI_CALL vkCmdPushDescriptorSetWithTemplateKHR( + VkCommandBuffer commandBuffer, + VkDescriptorUpdateTemplateKHR descriptorUpdateTemplate, + VkPipelineLayout layout, + uint32_t set, + const void* pData); #endif -#define VK_NV_external_memory 1 -#define VK_NV_EXTERNAL_MEMORY_SPEC_VERSION 1 -#define VK_NV_EXTERNAL_MEMORY_EXTENSION_NAME "VK_NV_external_memory" +#define VK_KHR_shared_presentable_image 1 +#define VK_KHR_SHARED_PRESENTABLE_IMAGE_SPEC_VERSION 1 +#define VK_KHR_SHARED_PRESENTABLE_IMAGE_EXTENSION_NAME "VK_KHR_shared_presentable_image" -typedef struct VkExternalMemoryImageCreateInfoNV { - VkStructureType sType; - const void* pNext; - VkExternalMemoryHandleTypeFlagsNV handleTypes; -} VkExternalMemoryImageCreateInfoNV; +typedef struct VkSharedPresentSurfaceCapabilitiesKHR { + VkStructureType sType; + void* pNext; + VkImageUsageFlags sharedPresentSupportedUsageFlags; +} VkSharedPresentSurfaceCapabilitiesKHR; -typedef struct VkExportMemoryAllocateInfoNV { + +typedef VkResult (VKAPI_PTR *PFN_vkGetSwapchainStatusKHR)(VkDevice device, VkSwapchainKHR swapchain); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR VkResult VKAPI_CALL vkGetSwapchainStatusKHR( + VkDevice device, + VkSwapchainKHR swapchain); +#endif + +#define VK_KHR_external_fence_capabilities 1 +#define VK_KHR_EXTERNAL_FENCE_CAPABILITIES_SPEC_VERSION 1 +#define VK_KHR_EXTERNAL_FENCE_CAPABILITIES_EXTENSION_NAME "VK_KHR_external_fence_capabilities" + + +typedef enum VkExternalFenceHandleTypeFlagBitsKHR { + VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_FD_BIT_KHR = 0x00000001, + VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_WIN32_BIT_KHR = 0x00000002, + 
VK_EXTERNAL_FENCE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT_KHR = 0x00000004, + VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT_KHR = 0x00000008, + VK_EXTERNAL_FENCE_HANDLE_TYPE_FLAG_BITS_MAX_ENUM_KHR = 0x7FFFFFFF +} VkExternalFenceHandleTypeFlagBitsKHR; +typedef VkFlags VkExternalFenceHandleTypeFlagsKHR; + +typedef enum VkExternalFenceFeatureFlagBitsKHR { + VK_EXTERNAL_FENCE_FEATURE_EXPORTABLE_BIT_KHR = 0x00000001, + VK_EXTERNAL_FENCE_FEATURE_IMPORTABLE_BIT_KHR = 0x00000002, + VK_EXTERNAL_FENCE_FEATURE_FLAG_BITS_MAX_ENUM_KHR = 0x7FFFFFFF +} VkExternalFenceFeatureFlagBitsKHR; +typedef VkFlags VkExternalFenceFeatureFlagsKHR; + +typedef struct VkPhysicalDeviceExternalFenceInfoKHR { + VkStructureType sType; + const void* pNext; + VkExternalFenceHandleTypeFlagBitsKHR handleType; +} VkPhysicalDeviceExternalFenceInfoKHR; + +typedef struct VkExternalFencePropertiesKHR { VkStructureType sType; - const void* pNext; - VkExternalMemoryHandleTypeFlagsNV handleTypes; -} VkExportMemoryAllocateInfoNV; + void* pNext; + VkExternalFenceHandleTypeFlagsKHR exportFromImportedHandleTypes; + VkExternalFenceHandleTypeFlagsKHR compatibleHandleTypes; + VkExternalFenceFeatureFlagsKHR externalFenceFeatures; +} VkExternalFencePropertiesKHR; +typedef void (VKAPI_PTR *PFN_vkGetPhysicalDeviceExternalFencePropertiesKHR)(VkPhysicalDevice physicalDevice, const VkPhysicalDeviceExternalFenceInfoKHR* pExternalFenceInfo, VkExternalFencePropertiesKHR* pExternalFenceProperties); -#ifdef VK_USE_PLATFORM_WIN32_KHR -#define VK_NV_external_memory_win32 1 -#define VK_NV_EXTERNAL_MEMORY_WIN32_SPEC_VERSION 1 -#define VK_NV_EXTERNAL_MEMORY_WIN32_EXTENSION_NAME "VK_NV_external_memory_win32" +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR void VKAPI_CALL vkGetPhysicalDeviceExternalFencePropertiesKHR( + VkPhysicalDevice physicalDevice, + const VkPhysicalDeviceExternalFenceInfoKHR* pExternalFenceInfo, + VkExternalFencePropertiesKHR* pExternalFenceProperties); +#endif -typedef struct VkImportMemoryWin32HandleInfoNV { +#define VK_KHR_external_fence 1 +#define VK_KHR_EXTERNAL_FENCE_SPEC_VERSION 1 +#define VK_KHR_EXTERNAL_FENCE_EXTENSION_NAME "VK_KHR_external_fence" + + +typedef enum VkFenceImportFlagBitsKHR { + VK_FENCE_IMPORT_TEMPORARY_BIT_KHR = 0x00000001, + VK_FENCE_IMPORT_FLAG_BITS_MAX_ENUM_KHR = 0x7FFFFFFF +} VkFenceImportFlagBitsKHR; +typedef VkFlags VkFenceImportFlagsKHR; + +typedef struct VkExportFenceCreateInfoKHR { VkStructureType sType; const void* pNext; - VkExternalMemoryHandleTypeFlagsNV handleType; - HANDLE handle; -} VkImportMemoryWin32HandleInfoNV; + VkExternalFenceHandleTypeFlagsKHR handleTypes; +} VkExportFenceCreateInfoKHR; -typedef struct VkExportMemoryWin32HandleInfoNV { + + +#ifdef VK_USE_PLATFORM_WIN32_KHR +#define VK_KHR_external_fence_win32 1 +#define VK_KHR_EXTERNAL_FENCE_WIN32_SPEC_VERSION 1 +#define VK_KHR_EXTERNAL_FENCE_WIN32_EXTENSION_NAME "VK_KHR_external_fence_win32" + +typedef struct VkImportFenceWin32HandleInfoKHR { + VkStructureType sType; + const void* pNext; + VkFence fence; + VkFenceImportFlagsKHR flags; + VkExternalFenceHandleTypeFlagBitsKHR handleType; + HANDLE handle; + LPCWSTR name; +} VkImportFenceWin32HandleInfoKHR; + +typedef struct VkExportFenceWin32HandleInfoKHR { VkStructureType sType; const void* pNext; const SECURITY_ATTRIBUTES* pAttributes; DWORD dwAccess; -} VkExportMemoryWin32HandleInfoNV; + LPCWSTR name; +} VkExportFenceWin32HandleInfoKHR; +typedef struct VkFenceGetWin32HandleInfoKHR { + VkStructureType sType; + const void* pNext; + VkFence fence; + VkExternalFenceHandleTypeFlagBitsKHR handleType; +} 
VkFenceGetWin32HandleInfoKHR; -typedef VkResult (VKAPI_PTR *PFN_vkGetMemoryWin32HandleNV)(VkDevice device, VkDeviceMemory memory, VkExternalMemoryHandleTypeFlagsNV handleType, HANDLE* pHandle); + +typedef VkResult (VKAPI_PTR *PFN_vkImportFenceWin32HandleKHR)(VkDevice device, const VkImportFenceWin32HandleInfoKHR* pImportFenceWin32HandleInfo); +typedef VkResult (VKAPI_PTR *PFN_vkGetFenceWin32HandleKHR)(VkDevice device, const VkFenceGetWin32HandleInfoKHR* pGetWin32HandleInfo, HANDLE* pHandle); #ifndef VK_NO_PROTOTYPES -VKAPI_ATTR VkResult VKAPI_CALL vkGetMemoryWin32HandleNV( +VKAPI_ATTR VkResult VKAPI_CALL vkImportFenceWin32HandleKHR( VkDevice device, - VkDeviceMemory memory, - VkExternalMemoryHandleTypeFlagsNV handleType, + const VkImportFenceWin32HandleInfoKHR* pImportFenceWin32HandleInfo); + +VKAPI_ATTR VkResult VKAPI_CALL vkGetFenceWin32HandleKHR( + VkDevice device, + const VkFenceGetWin32HandleInfoKHR* pGetWin32HandleInfo, HANDLE* pHandle); #endif #endif /* VK_USE_PLATFORM_WIN32_KHR */ -#ifdef VK_USE_PLATFORM_WIN32_KHR -#define VK_NV_win32_keyed_mutex 1 -#define VK_NV_WIN32_KEYED_MUTEX_SPEC_VERSION 1 -#define VK_NV_WIN32_KEYED_MUTEX_EXTENSION_NAME "VK_NV_win32_keyed_mutex" +#define VK_KHR_external_fence_fd 1 +#define VK_KHR_EXTERNAL_FENCE_FD_SPEC_VERSION 1 +#define VK_KHR_EXTERNAL_FENCE_FD_EXTENSION_NAME "VK_KHR_external_fence_fd" -typedef struct VkWin32KeyedMutexAcquireReleaseInfoNV { - VkStructureType sType; - const void* pNext; - uint32_t acquireCount; - const VkDeviceMemory* pAcquireSyncs; - const uint64_t* pAcquireKeys; - const uint32_t* pAcquireTimeoutMilliseconds; - uint32_t releaseCount; - const VkDeviceMemory* pReleaseSyncs; - const uint64_t* pReleaseKeys; -} VkWin32KeyedMutexAcquireReleaseInfoNV; +typedef struct VkImportFenceFdInfoKHR { + VkStructureType sType; + const void* pNext; + VkFence fence; + VkFenceImportFlagsKHR flags; + VkExternalFenceHandleTypeFlagBitsKHR handleType; + int fd; +} VkImportFenceFdInfoKHR; +typedef struct VkFenceGetFdInfoKHR { + VkStructureType sType; + const void* pNext; + VkFence fence; + VkExternalFenceHandleTypeFlagBitsKHR handleType; +} VkFenceGetFdInfoKHR; -#endif /* VK_USE_PLATFORM_WIN32_KHR */ -#define VK_EXT_validation_flags 1 -#define VK_EXT_VALIDATION_FLAGS_SPEC_VERSION 1 -#define VK_EXT_VALIDATION_FLAGS_EXTENSION_NAME "VK_EXT_validation_flags" +typedef VkResult (VKAPI_PTR *PFN_vkImportFenceFdKHR)(VkDevice device, const VkImportFenceFdInfoKHR* pImportFenceFdInfo); +typedef VkResult (VKAPI_PTR *PFN_vkGetFenceFdKHR)(VkDevice device, const VkFenceGetFdInfoKHR* pGetFdInfo, int* pFd); +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR VkResult VKAPI_CALL vkImportFenceFdKHR( + VkDevice device, + const VkImportFenceFdInfoKHR* pImportFenceFdInfo); -typedef enum VkValidationCheckEXT { - VK_VALIDATION_CHECK_ALL_EXT = 0, - VK_VALIDATION_CHECK_BEGIN_RANGE_EXT = VK_VALIDATION_CHECK_ALL_EXT, - VK_VALIDATION_CHECK_END_RANGE_EXT = VK_VALIDATION_CHECK_ALL_EXT, - VK_VALIDATION_CHECK_RANGE_SIZE_EXT = (VK_VALIDATION_CHECK_ALL_EXT - VK_VALIDATION_CHECK_ALL_EXT + 1), - VK_VALIDATION_CHECK_MAX_ENUM_EXT = 0x7FFFFFFF -} VkValidationCheckEXT; +VKAPI_ATTR VkResult VKAPI_CALL vkGetFenceFdKHR( + VkDevice device, + const VkFenceGetFdInfoKHR* pGetFdInfo, + int* pFd); +#endif + +#define VK_KHR_get_surface_capabilities2 1 +#define VK_KHR_GET_SURFACE_CAPABILITIES_2_SPEC_VERSION 1 +#define VK_KHR_GET_SURFACE_CAPABILITIES_2_EXTENSION_NAME "VK_KHR_get_surface_capabilities2" + +typedef struct VkPhysicalDeviceSurfaceInfo2KHR { + VkStructureType sType; + const void* pNext; + VkSurfaceKHR 
surface; +} VkPhysicalDeviceSurfaceInfo2KHR; + +typedef struct VkSurfaceCapabilities2KHR { + VkStructureType sType; + void* pNext; + VkSurfaceCapabilitiesKHR surfaceCapabilities; +} VkSurfaceCapabilities2KHR; + +typedef struct VkSurfaceFormat2KHR { + VkStructureType sType; + void* pNext; + VkSurfaceFormatKHR surfaceFormat; +} VkSurfaceFormat2KHR; + + +typedef VkResult (VKAPI_PTR *PFN_vkGetPhysicalDeviceSurfaceCapabilities2KHR)(VkPhysicalDevice physicalDevice, const VkPhysicalDeviceSurfaceInfo2KHR* pSurfaceInfo, VkSurfaceCapabilities2KHR* pSurfaceCapabilities); +typedef VkResult (VKAPI_PTR *PFN_vkGetPhysicalDeviceSurfaceFormats2KHR)(VkPhysicalDevice physicalDevice, const VkPhysicalDeviceSurfaceInfo2KHR* pSurfaceInfo, uint32_t* pSurfaceFormatCount, VkSurfaceFormat2KHR* pSurfaceFormats); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR VkResult VKAPI_CALL vkGetPhysicalDeviceSurfaceCapabilities2KHR( + VkPhysicalDevice physicalDevice, + const VkPhysicalDeviceSurfaceInfo2KHR* pSurfaceInfo, + VkSurfaceCapabilities2KHR* pSurfaceCapabilities); + +VKAPI_ATTR VkResult VKAPI_CALL vkGetPhysicalDeviceSurfaceFormats2KHR( + VkPhysicalDevice physicalDevice, + const VkPhysicalDeviceSurfaceInfo2KHR* pSurfaceInfo, + uint32_t* pSurfaceFormatCount, + VkSurfaceFormat2KHR* pSurfaceFormats); +#endif + +#define VK_KHR_variable_pointers 1 +#define VK_KHR_VARIABLE_POINTERS_SPEC_VERSION 1 +#define VK_KHR_VARIABLE_POINTERS_EXTENSION_NAME "VK_KHR_variable_pointers" + +typedef struct VkPhysicalDeviceVariablePointerFeaturesKHR { + VkStructureType sType; + void* pNext; + VkBool32 variablePointersStorageBuffer; + VkBool32 variablePointers; +} VkPhysicalDeviceVariablePointerFeaturesKHR; + + + +#define VK_KHR_dedicated_allocation 1 +#define VK_KHR_DEDICATED_ALLOCATION_SPEC_VERSION 3 +#define VK_KHR_DEDICATED_ALLOCATION_EXTENSION_NAME "VK_KHR_dedicated_allocation" + +typedef struct VkMemoryDedicatedRequirementsKHR { + VkStructureType sType; + void* pNext; + VkBool32 prefersDedicatedAllocation; + VkBool32 requiresDedicatedAllocation; +} VkMemoryDedicatedRequirementsKHR; + +typedef struct VkMemoryDedicatedAllocateInfoKHR { + VkStructureType sType; + const void* pNext; + VkImage image; + VkBuffer buffer; +} VkMemoryDedicatedAllocateInfoKHR; + + + +#define VK_KHR_storage_buffer_storage_class 1 +#define VK_KHR_STORAGE_BUFFER_STORAGE_CLASS_SPEC_VERSION 1 +#define VK_KHR_STORAGE_BUFFER_STORAGE_CLASS_EXTENSION_NAME "VK_KHR_storage_buffer_storage_class" + + +#define VK_KHR_relaxed_block_layout 1 +#define VK_KHR_RELAXED_BLOCK_LAYOUT_SPEC_VERSION 1 +#define VK_KHR_RELAXED_BLOCK_LAYOUT_EXTENSION_NAME "VK_KHR_relaxed_block_layout" + + +#define VK_KHR_get_memory_requirements2 1 +#define VK_KHR_GET_MEMORY_REQUIREMENTS_2_SPEC_VERSION 1 +#define VK_KHR_GET_MEMORY_REQUIREMENTS_2_EXTENSION_NAME "VK_KHR_get_memory_requirements2" + +typedef struct VkBufferMemoryRequirementsInfo2KHR { + VkStructureType sType; + const void* pNext; + VkBuffer buffer; +} VkBufferMemoryRequirementsInfo2KHR; + +typedef struct VkImageMemoryRequirementsInfo2KHR { + VkStructureType sType; + const void* pNext; + VkImage image; +} VkImageMemoryRequirementsInfo2KHR; + +typedef struct VkImageSparseMemoryRequirementsInfo2KHR { + VkStructureType sType; + const void* pNext; + VkImage image; +} VkImageSparseMemoryRequirementsInfo2KHR; + +typedef struct VkMemoryRequirements2KHR { + VkStructureType sType; + void* pNext; + VkMemoryRequirements memoryRequirements; +} VkMemoryRequirements2KHR; + +typedef struct VkSparseImageMemoryRequirements2KHR { + VkStructureType sType; + void* pNext; + 
VkSparseImageMemoryRequirements memoryRequirements; +} VkSparseImageMemoryRequirements2KHR; + + +typedef void (VKAPI_PTR *PFN_vkGetImageMemoryRequirements2KHR)(VkDevice device, const VkImageMemoryRequirementsInfo2KHR* pInfo, VkMemoryRequirements2KHR* pMemoryRequirements); +typedef void (VKAPI_PTR *PFN_vkGetBufferMemoryRequirements2KHR)(VkDevice device, const VkBufferMemoryRequirementsInfo2KHR* pInfo, VkMemoryRequirements2KHR* pMemoryRequirements); +typedef void (VKAPI_PTR *PFN_vkGetImageSparseMemoryRequirements2KHR)(VkDevice device, const VkImageSparseMemoryRequirementsInfo2KHR* pInfo, uint32_t* pSparseMemoryRequirementCount, VkSparseImageMemoryRequirements2KHR* pSparseMemoryRequirements); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR void VKAPI_CALL vkGetImageMemoryRequirements2KHR( + VkDevice device, + const VkImageMemoryRequirementsInfo2KHR* pInfo, + VkMemoryRequirements2KHR* pMemoryRequirements); + +VKAPI_ATTR void VKAPI_CALL vkGetBufferMemoryRequirements2KHR( + VkDevice device, + const VkBufferMemoryRequirementsInfo2KHR* pInfo, + VkMemoryRequirements2KHR* pMemoryRequirements); + +VKAPI_ATTR void VKAPI_CALL vkGetImageSparseMemoryRequirements2KHR( + VkDevice device, + const VkImageSparseMemoryRequirementsInfo2KHR* pInfo, + uint32_t* pSparseMemoryRequirementCount, + VkSparseImageMemoryRequirements2KHR* pSparseMemoryRequirements); +#endif + +#define VK_EXT_debug_report 1 +VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkDebugReportCallbackEXT) + +#define VK_EXT_DEBUG_REPORT_SPEC_VERSION 8 +#define VK_EXT_DEBUG_REPORT_EXTENSION_NAME "VK_EXT_debug_report" +#define VK_STRUCTURE_TYPE_DEBUG_REPORT_CREATE_INFO_EXT VK_STRUCTURE_TYPE_DEBUG_REPORT_CALLBACK_CREATE_INFO_EXT +#define VK_DEBUG_REPORT_OBJECT_TYPE_DEBUG_REPORT_EXT VK_DEBUG_REPORT_OBJECT_TYPE_DEBUG_REPORT_CALLBACK_EXT_EXT + + +typedef enum VkDebugReportObjectTypeEXT { + VK_DEBUG_REPORT_OBJECT_TYPE_UNKNOWN_EXT = 0, + VK_DEBUG_REPORT_OBJECT_TYPE_INSTANCE_EXT = 1, + VK_DEBUG_REPORT_OBJECT_TYPE_PHYSICAL_DEVICE_EXT = 2, + VK_DEBUG_REPORT_OBJECT_TYPE_DEVICE_EXT = 3, + VK_DEBUG_REPORT_OBJECT_TYPE_QUEUE_EXT = 4, + VK_DEBUG_REPORT_OBJECT_TYPE_SEMAPHORE_EXT = 5, + VK_DEBUG_REPORT_OBJECT_TYPE_COMMAND_BUFFER_EXT = 6, + VK_DEBUG_REPORT_OBJECT_TYPE_FENCE_EXT = 7, + VK_DEBUG_REPORT_OBJECT_TYPE_DEVICE_MEMORY_EXT = 8, + VK_DEBUG_REPORT_OBJECT_TYPE_BUFFER_EXT = 9, + VK_DEBUG_REPORT_OBJECT_TYPE_IMAGE_EXT = 10, + VK_DEBUG_REPORT_OBJECT_TYPE_EVENT_EXT = 11, + VK_DEBUG_REPORT_OBJECT_TYPE_QUERY_POOL_EXT = 12, + VK_DEBUG_REPORT_OBJECT_TYPE_BUFFER_VIEW_EXT = 13, + VK_DEBUG_REPORT_OBJECT_TYPE_IMAGE_VIEW_EXT = 14, + VK_DEBUG_REPORT_OBJECT_TYPE_SHADER_MODULE_EXT = 15, + VK_DEBUG_REPORT_OBJECT_TYPE_PIPELINE_CACHE_EXT = 16, + VK_DEBUG_REPORT_OBJECT_TYPE_PIPELINE_LAYOUT_EXT = 17, + VK_DEBUG_REPORT_OBJECT_TYPE_RENDER_PASS_EXT = 18, + VK_DEBUG_REPORT_OBJECT_TYPE_PIPELINE_EXT = 19, + VK_DEBUG_REPORT_OBJECT_TYPE_DESCRIPTOR_SET_LAYOUT_EXT = 20, + VK_DEBUG_REPORT_OBJECT_TYPE_SAMPLER_EXT = 21, + VK_DEBUG_REPORT_OBJECT_TYPE_DESCRIPTOR_POOL_EXT = 22, + VK_DEBUG_REPORT_OBJECT_TYPE_DESCRIPTOR_SET_EXT = 23, + VK_DEBUG_REPORT_OBJECT_TYPE_FRAMEBUFFER_EXT = 24, + VK_DEBUG_REPORT_OBJECT_TYPE_COMMAND_POOL_EXT = 25, + VK_DEBUG_REPORT_OBJECT_TYPE_SURFACE_KHR_EXT = 26, + VK_DEBUG_REPORT_OBJECT_TYPE_SWAPCHAIN_KHR_EXT = 27, + VK_DEBUG_REPORT_OBJECT_TYPE_DEBUG_REPORT_CALLBACK_EXT_EXT = 28, + VK_DEBUG_REPORT_OBJECT_TYPE_DISPLAY_KHR_EXT = 29, + VK_DEBUG_REPORT_OBJECT_TYPE_DISPLAY_MODE_KHR_EXT = 30, + VK_DEBUG_REPORT_OBJECT_TYPE_OBJECT_TABLE_NVX_EXT = 31, + 
VK_DEBUG_REPORT_OBJECT_TYPE_INDIRECT_COMMANDS_LAYOUT_NVX_EXT = 32, + VK_DEBUG_REPORT_OBJECT_TYPE_DESCRIPTOR_UPDATE_TEMPLATE_KHR_EXT = 1000085000, + VK_DEBUG_REPORT_OBJECT_TYPE_BEGIN_RANGE_EXT = VK_DEBUG_REPORT_OBJECT_TYPE_UNKNOWN_EXT, + VK_DEBUG_REPORT_OBJECT_TYPE_END_RANGE_EXT = VK_DEBUG_REPORT_OBJECT_TYPE_INDIRECT_COMMANDS_LAYOUT_NVX_EXT, + VK_DEBUG_REPORT_OBJECT_TYPE_RANGE_SIZE_EXT = (VK_DEBUG_REPORT_OBJECT_TYPE_INDIRECT_COMMANDS_LAYOUT_NVX_EXT - VK_DEBUG_REPORT_OBJECT_TYPE_UNKNOWN_EXT + 1), + VK_DEBUG_REPORT_OBJECT_TYPE_MAX_ENUM_EXT = 0x7FFFFFFF +} VkDebugReportObjectTypeEXT; + + +typedef enum VkDebugReportFlagBitsEXT { + VK_DEBUG_REPORT_INFORMATION_BIT_EXT = 0x00000001, + VK_DEBUG_REPORT_WARNING_BIT_EXT = 0x00000002, + VK_DEBUG_REPORT_PERFORMANCE_WARNING_BIT_EXT = 0x00000004, + VK_DEBUG_REPORT_ERROR_BIT_EXT = 0x00000008, + VK_DEBUG_REPORT_DEBUG_BIT_EXT = 0x00000010, + VK_DEBUG_REPORT_FLAG_BITS_MAX_ENUM_EXT = 0x7FFFFFFF +} VkDebugReportFlagBitsEXT; +typedef VkFlags VkDebugReportFlagsEXT; + +typedef VkBool32 (VKAPI_PTR *PFN_vkDebugReportCallbackEXT)( + VkDebugReportFlagsEXT flags, + VkDebugReportObjectTypeEXT objectType, + uint64_t object, + size_t location, + int32_t messageCode, + const char* pLayerPrefix, + const char* pMessage, + void* pUserData); + +typedef struct VkDebugReportCallbackCreateInfoEXT { + VkStructureType sType; + const void* pNext; + VkDebugReportFlagsEXT flags; + PFN_vkDebugReportCallbackEXT pfnCallback; + void* pUserData; +} VkDebugReportCallbackCreateInfoEXT; + + +typedef VkResult (VKAPI_PTR *PFN_vkCreateDebugReportCallbackEXT)(VkInstance instance, const VkDebugReportCallbackCreateInfoEXT* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkDebugReportCallbackEXT* pCallback); +typedef void (VKAPI_PTR *PFN_vkDestroyDebugReportCallbackEXT)(VkInstance instance, VkDebugReportCallbackEXT callback, const VkAllocationCallbacks* pAllocator); +typedef void (VKAPI_PTR *PFN_vkDebugReportMessageEXT)(VkInstance instance, VkDebugReportFlagsEXT flags, VkDebugReportObjectTypeEXT objectType, uint64_t object, size_t location, int32_t messageCode, const char* pLayerPrefix, const char* pMessage); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR VkResult VKAPI_CALL vkCreateDebugReportCallbackEXT( + VkInstance instance, + const VkDebugReportCallbackCreateInfoEXT* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkDebugReportCallbackEXT* pCallback); + +VKAPI_ATTR void VKAPI_CALL vkDestroyDebugReportCallbackEXT( + VkInstance instance, + VkDebugReportCallbackEXT callback, + const VkAllocationCallbacks* pAllocator); + +VKAPI_ATTR void VKAPI_CALL vkDebugReportMessageEXT( + VkInstance instance, + VkDebugReportFlagsEXT flags, + VkDebugReportObjectTypeEXT objectType, + uint64_t object, + size_t location, + int32_t messageCode, + const char* pLayerPrefix, + const char* pMessage); +#endif + +#define VK_NV_glsl_shader 1 +#define VK_NV_GLSL_SHADER_SPEC_VERSION 1 +#define VK_NV_GLSL_SHADER_EXTENSION_NAME "VK_NV_glsl_shader" + + +#define VK_EXT_depth_range_unrestricted 1 +#define VK_EXT_DEPTH_RANGE_UNRESTRICTED_SPEC_VERSION 1 +#define VK_EXT_DEPTH_RANGE_UNRESTRICTED_EXTENSION_NAME "VK_EXT_depth_range_unrestricted" + + +#define VK_IMG_filter_cubic 1 +#define VK_IMG_FILTER_CUBIC_SPEC_VERSION 1 +#define VK_IMG_FILTER_CUBIC_EXTENSION_NAME "VK_IMG_filter_cubic" + + +#define VK_AMD_rasterization_order 1 +#define VK_AMD_RASTERIZATION_ORDER_SPEC_VERSION 1 +#define VK_AMD_RASTERIZATION_ORDER_EXTENSION_NAME "VK_AMD_rasterization_order" + + +typedef enum VkRasterizationOrderAMD { + 
VK_RASTERIZATION_ORDER_STRICT_AMD = 0, + VK_RASTERIZATION_ORDER_RELAXED_AMD = 1, + VK_RASTERIZATION_ORDER_BEGIN_RANGE_AMD = VK_RASTERIZATION_ORDER_STRICT_AMD, + VK_RASTERIZATION_ORDER_END_RANGE_AMD = VK_RASTERIZATION_ORDER_RELAXED_AMD, + VK_RASTERIZATION_ORDER_RANGE_SIZE_AMD = (VK_RASTERIZATION_ORDER_RELAXED_AMD - VK_RASTERIZATION_ORDER_STRICT_AMD + 1), + VK_RASTERIZATION_ORDER_MAX_ENUM_AMD = 0x7FFFFFFF +} VkRasterizationOrderAMD; + +typedef struct VkPipelineRasterizationStateRasterizationOrderAMD { + VkStructureType sType; + const void* pNext; + VkRasterizationOrderAMD rasterizationOrder; +} VkPipelineRasterizationStateRasterizationOrderAMD; + + + +#define VK_AMD_shader_trinary_minmax 1 +#define VK_AMD_SHADER_TRINARY_MINMAX_SPEC_VERSION 1 +#define VK_AMD_SHADER_TRINARY_MINMAX_EXTENSION_NAME "VK_AMD_shader_trinary_minmax" + + +#define VK_AMD_shader_explicit_vertex_parameter 1 +#define VK_AMD_SHADER_EXPLICIT_VERTEX_PARAMETER_SPEC_VERSION 1 +#define VK_AMD_SHADER_EXPLICIT_VERTEX_PARAMETER_EXTENSION_NAME "VK_AMD_shader_explicit_vertex_parameter" + + +#define VK_EXT_debug_marker 1 +#define VK_EXT_DEBUG_MARKER_SPEC_VERSION 4 +#define VK_EXT_DEBUG_MARKER_EXTENSION_NAME "VK_EXT_debug_marker" + +typedef struct VkDebugMarkerObjectNameInfoEXT { + VkStructureType sType; + const void* pNext; + VkDebugReportObjectTypeEXT objectType; + uint64_t object; + const char* pObjectName; +} VkDebugMarkerObjectNameInfoEXT; + +typedef struct VkDebugMarkerObjectTagInfoEXT { + VkStructureType sType; + const void* pNext; + VkDebugReportObjectTypeEXT objectType; + uint64_t object; + uint64_t tagName; + size_t tagSize; + const void* pTag; +} VkDebugMarkerObjectTagInfoEXT; + +typedef struct VkDebugMarkerMarkerInfoEXT { + VkStructureType sType; + const void* pNext; + const char* pMarkerName; + float color[4]; +} VkDebugMarkerMarkerInfoEXT; + + +typedef VkResult (VKAPI_PTR *PFN_vkDebugMarkerSetObjectTagEXT)(VkDevice device, const VkDebugMarkerObjectTagInfoEXT* pTagInfo); +typedef VkResult (VKAPI_PTR *PFN_vkDebugMarkerSetObjectNameEXT)(VkDevice device, const VkDebugMarkerObjectNameInfoEXT* pNameInfo); +typedef void (VKAPI_PTR *PFN_vkCmdDebugMarkerBeginEXT)(VkCommandBuffer commandBuffer, const VkDebugMarkerMarkerInfoEXT* pMarkerInfo); +typedef void (VKAPI_PTR *PFN_vkCmdDebugMarkerEndEXT)(VkCommandBuffer commandBuffer); +typedef void (VKAPI_PTR *PFN_vkCmdDebugMarkerInsertEXT)(VkCommandBuffer commandBuffer, const VkDebugMarkerMarkerInfoEXT* pMarkerInfo); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR VkResult VKAPI_CALL vkDebugMarkerSetObjectTagEXT( + VkDevice device, + const VkDebugMarkerObjectTagInfoEXT* pTagInfo); + +VKAPI_ATTR VkResult VKAPI_CALL vkDebugMarkerSetObjectNameEXT( + VkDevice device, + const VkDebugMarkerObjectNameInfoEXT* pNameInfo); + +VKAPI_ATTR void VKAPI_CALL vkCmdDebugMarkerBeginEXT( + VkCommandBuffer commandBuffer, + const VkDebugMarkerMarkerInfoEXT* pMarkerInfo); + +VKAPI_ATTR void VKAPI_CALL vkCmdDebugMarkerEndEXT( + VkCommandBuffer commandBuffer); + +VKAPI_ATTR void VKAPI_CALL vkCmdDebugMarkerInsertEXT( + VkCommandBuffer commandBuffer, + const VkDebugMarkerMarkerInfoEXT* pMarkerInfo); +#endif + +#define VK_AMD_gcn_shader 1 +#define VK_AMD_GCN_SHADER_SPEC_VERSION 1 +#define VK_AMD_GCN_SHADER_EXTENSION_NAME "VK_AMD_gcn_shader" + + +#define VK_NV_dedicated_allocation 1 +#define VK_NV_DEDICATED_ALLOCATION_SPEC_VERSION 1 +#define VK_NV_DEDICATED_ALLOCATION_EXTENSION_NAME "VK_NV_dedicated_allocation" + +typedef struct VkDedicatedAllocationImageCreateInfoNV { + VkStructureType sType; + const void* pNext; + 
VkBool32 dedicatedAllocation; +} VkDedicatedAllocationImageCreateInfoNV; + +typedef struct VkDedicatedAllocationBufferCreateInfoNV { + VkStructureType sType; + const void* pNext; + VkBool32 dedicatedAllocation; +} VkDedicatedAllocationBufferCreateInfoNV; + +typedef struct VkDedicatedAllocationMemoryAllocateInfoNV { + VkStructureType sType; + const void* pNext; + VkImage image; + VkBuffer buffer; +} VkDedicatedAllocationMemoryAllocateInfoNV; + + + +#define VK_AMD_draw_indirect_count 1 +#define VK_AMD_DRAW_INDIRECT_COUNT_SPEC_VERSION 1 +#define VK_AMD_DRAW_INDIRECT_COUNT_EXTENSION_NAME "VK_AMD_draw_indirect_count" + +typedef void (VKAPI_PTR *PFN_vkCmdDrawIndirectCountAMD)(VkCommandBuffer commandBuffer, VkBuffer buffer, VkDeviceSize offset, VkBuffer countBuffer, VkDeviceSize countBufferOffset, uint32_t maxDrawCount, uint32_t stride); +typedef void (VKAPI_PTR *PFN_vkCmdDrawIndexedIndirectCountAMD)(VkCommandBuffer commandBuffer, VkBuffer buffer, VkDeviceSize offset, VkBuffer countBuffer, VkDeviceSize countBufferOffset, uint32_t maxDrawCount, uint32_t stride); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR void VKAPI_CALL vkCmdDrawIndirectCountAMD( + VkCommandBuffer commandBuffer, + VkBuffer buffer, + VkDeviceSize offset, + VkBuffer countBuffer, + VkDeviceSize countBufferOffset, + uint32_t maxDrawCount, + uint32_t stride); + +VKAPI_ATTR void VKAPI_CALL vkCmdDrawIndexedIndirectCountAMD( + VkCommandBuffer commandBuffer, + VkBuffer buffer, + VkDeviceSize offset, + VkBuffer countBuffer, + VkDeviceSize countBufferOffset, + uint32_t maxDrawCount, + uint32_t stride); +#endif + +#define VK_AMD_negative_viewport_height 1 +#define VK_AMD_NEGATIVE_VIEWPORT_HEIGHT_SPEC_VERSION 1 +#define VK_AMD_NEGATIVE_VIEWPORT_HEIGHT_EXTENSION_NAME "VK_AMD_negative_viewport_height" + + +#define VK_AMD_gpu_shader_half_float 1 +#define VK_AMD_GPU_SHADER_HALF_FLOAT_SPEC_VERSION 1 +#define VK_AMD_GPU_SHADER_HALF_FLOAT_EXTENSION_NAME "VK_AMD_gpu_shader_half_float" + + +#define VK_AMD_shader_ballot 1 +#define VK_AMD_SHADER_BALLOT_SPEC_VERSION 1 +#define VK_AMD_SHADER_BALLOT_EXTENSION_NAME "VK_AMD_shader_ballot" + + +#define VK_AMD_texture_gather_bias_lod 1 +#define VK_AMD_TEXTURE_GATHER_BIAS_LOD_SPEC_VERSION 1 +#define VK_AMD_TEXTURE_GATHER_BIAS_LOD_EXTENSION_NAME "VK_AMD_texture_gather_bias_lod" + +typedef struct VkTextureLODGatherFormatPropertiesAMD { + VkStructureType sType; + void* pNext; + VkBool32 supportsTextureGatherLODBiasAMD; +} VkTextureLODGatherFormatPropertiesAMD; + + + +#define VK_KHX_multiview 1 +#define VK_KHX_MULTIVIEW_SPEC_VERSION 1 +#define VK_KHX_MULTIVIEW_EXTENSION_NAME "VK_KHX_multiview" + +typedef struct VkRenderPassMultiviewCreateInfoKHX { + VkStructureType sType; + const void* pNext; + uint32_t subpassCount; + const uint32_t* pViewMasks; + uint32_t dependencyCount; + const int32_t* pViewOffsets; + uint32_t correlationMaskCount; + const uint32_t* pCorrelationMasks; +} VkRenderPassMultiviewCreateInfoKHX; + +typedef struct VkPhysicalDeviceMultiviewFeaturesKHX { + VkStructureType sType; + void* pNext; + VkBool32 multiview; + VkBool32 multiviewGeometryShader; + VkBool32 multiviewTessellationShader; +} VkPhysicalDeviceMultiviewFeaturesKHX; + +typedef struct VkPhysicalDeviceMultiviewPropertiesKHX { + VkStructureType sType; + void* pNext; + uint32_t maxMultiviewViewCount; + uint32_t maxMultiviewInstanceIndex; +} VkPhysicalDeviceMultiviewPropertiesKHX; + + + +#define VK_IMG_format_pvrtc 1 +#define VK_IMG_FORMAT_PVRTC_SPEC_VERSION 1 +#define VK_IMG_FORMAT_PVRTC_EXTENSION_NAME "VK_IMG_format_pvrtc" + + +#define 
VK_NV_external_memory_capabilities 1 +#define VK_NV_EXTERNAL_MEMORY_CAPABILITIES_SPEC_VERSION 1 +#define VK_NV_EXTERNAL_MEMORY_CAPABILITIES_EXTENSION_NAME "VK_NV_external_memory_capabilities" + + +typedef enum VkExternalMemoryHandleTypeFlagBitsNV { + VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT_NV = 0x00000001, + VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT_NV = 0x00000002, + VK_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_IMAGE_BIT_NV = 0x00000004, + VK_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_IMAGE_KMT_BIT_NV = 0x00000008, + VK_EXTERNAL_MEMORY_HANDLE_TYPE_FLAG_BITS_MAX_ENUM_NV = 0x7FFFFFFF +} VkExternalMemoryHandleTypeFlagBitsNV; +typedef VkFlags VkExternalMemoryHandleTypeFlagsNV; + +typedef enum VkExternalMemoryFeatureFlagBitsNV { + VK_EXTERNAL_MEMORY_FEATURE_DEDICATED_ONLY_BIT_NV = 0x00000001, + VK_EXTERNAL_MEMORY_FEATURE_EXPORTABLE_BIT_NV = 0x00000002, + VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT_NV = 0x00000004, + VK_EXTERNAL_MEMORY_FEATURE_FLAG_BITS_MAX_ENUM_NV = 0x7FFFFFFF +} VkExternalMemoryFeatureFlagBitsNV; +typedef VkFlags VkExternalMemoryFeatureFlagsNV; + +typedef struct VkExternalImageFormatPropertiesNV { + VkImageFormatProperties imageFormatProperties; + VkExternalMemoryFeatureFlagsNV externalMemoryFeatures; + VkExternalMemoryHandleTypeFlagsNV exportFromImportedHandleTypes; + VkExternalMemoryHandleTypeFlagsNV compatibleHandleTypes; +} VkExternalImageFormatPropertiesNV; + + +typedef VkResult (VKAPI_PTR *PFN_vkGetPhysicalDeviceExternalImageFormatPropertiesNV)(VkPhysicalDevice physicalDevice, VkFormat format, VkImageType type, VkImageTiling tiling, VkImageUsageFlags usage, VkImageCreateFlags flags, VkExternalMemoryHandleTypeFlagsNV externalHandleType, VkExternalImageFormatPropertiesNV* pExternalImageFormatProperties); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR VkResult VKAPI_CALL vkGetPhysicalDeviceExternalImageFormatPropertiesNV( + VkPhysicalDevice physicalDevice, + VkFormat format, + VkImageType type, + VkImageTiling tiling, + VkImageUsageFlags usage, + VkImageCreateFlags flags, + VkExternalMemoryHandleTypeFlagsNV externalHandleType, + VkExternalImageFormatPropertiesNV* pExternalImageFormatProperties); +#endif + +#define VK_NV_external_memory 1 +#define VK_NV_EXTERNAL_MEMORY_SPEC_VERSION 1 +#define VK_NV_EXTERNAL_MEMORY_EXTENSION_NAME "VK_NV_external_memory" + +typedef struct VkExternalMemoryImageCreateInfoNV { + VkStructureType sType; + const void* pNext; + VkExternalMemoryHandleTypeFlagsNV handleTypes; +} VkExternalMemoryImageCreateInfoNV; + +typedef struct VkExportMemoryAllocateInfoNV { + VkStructureType sType; + const void* pNext; + VkExternalMemoryHandleTypeFlagsNV handleTypes; +} VkExportMemoryAllocateInfoNV; + + + +#ifdef VK_USE_PLATFORM_WIN32_KHR +#define VK_NV_external_memory_win32 1 +#define VK_NV_EXTERNAL_MEMORY_WIN32_SPEC_VERSION 1 +#define VK_NV_EXTERNAL_MEMORY_WIN32_EXTENSION_NAME "VK_NV_external_memory_win32" + +typedef struct VkImportMemoryWin32HandleInfoNV { + VkStructureType sType; + const void* pNext; + VkExternalMemoryHandleTypeFlagsNV handleType; + HANDLE handle; +} VkImportMemoryWin32HandleInfoNV; + +typedef struct VkExportMemoryWin32HandleInfoNV { + VkStructureType sType; + const void* pNext; + const SECURITY_ATTRIBUTES* pAttributes; + DWORD dwAccess; +} VkExportMemoryWin32HandleInfoNV; + + +typedef VkResult (VKAPI_PTR *PFN_vkGetMemoryWin32HandleNV)(VkDevice device, VkDeviceMemory memory, VkExternalMemoryHandleTypeFlagsNV handleType, HANDLE* pHandle); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR VkResult VKAPI_CALL vkGetMemoryWin32HandleNV( + VkDevice device, + 
VkDeviceMemory memory, + VkExternalMemoryHandleTypeFlagsNV handleType, + HANDLE* pHandle); +#endif +#endif /* VK_USE_PLATFORM_WIN32_KHR */ + +#ifdef VK_USE_PLATFORM_WIN32_KHR +#define VK_NV_win32_keyed_mutex 1 +#define VK_NV_WIN32_KEYED_MUTEX_SPEC_VERSION 1 +#define VK_NV_WIN32_KEYED_MUTEX_EXTENSION_NAME "VK_NV_win32_keyed_mutex" + +typedef struct VkWin32KeyedMutexAcquireReleaseInfoNV { + VkStructureType sType; + const void* pNext; + uint32_t acquireCount; + const VkDeviceMemory* pAcquireSyncs; + const uint64_t* pAcquireKeys; + const uint32_t* pAcquireTimeoutMilliseconds; + uint32_t releaseCount; + const VkDeviceMemory* pReleaseSyncs; + const uint64_t* pReleaseKeys; +} VkWin32KeyedMutexAcquireReleaseInfoNV; + + +#endif /* VK_USE_PLATFORM_WIN32_KHR */ + +#define VK_KHX_device_group 1 +#define VK_MAX_DEVICE_GROUP_SIZE_KHX 32 +#define VK_KHX_DEVICE_GROUP_SPEC_VERSION 1 +#define VK_KHX_DEVICE_GROUP_EXTENSION_NAME "VK_KHX_device_group" + + +typedef enum VkPeerMemoryFeatureFlagBitsKHX { + VK_PEER_MEMORY_FEATURE_COPY_SRC_BIT_KHX = 0x00000001, + VK_PEER_MEMORY_FEATURE_COPY_DST_BIT_KHX = 0x00000002, + VK_PEER_MEMORY_FEATURE_GENERIC_SRC_BIT_KHX = 0x00000004, + VK_PEER_MEMORY_FEATURE_GENERIC_DST_BIT_KHX = 0x00000008, + VK_PEER_MEMORY_FEATURE_FLAG_BITS_MAX_ENUM_KHX = 0x7FFFFFFF +} VkPeerMemoryFeatureFlagBitsKHX; +typedef VkFlags VkPeerMemoryFeatureFlagsKHX; + +typedef enum VkMemoryAllocateFlagBitsKHX { + VK_MEMORY_ALLOCATE_DEVICE_MASK_BIT_KHX = 0x00000001, + VK_MEMORY_ALLOCATE_FLAG_BITS_MAX_ENUM_KHX = 0x7FFFFFFF +} VkMemoryAllocateFlagBitsKHX; +typedef VkFlags VkMemoryAllocateFlagsKHX; + +typedef enum VkDeviceGroupPresentModeFlagBitsKHX { + VK_DEVICE_GROUP_PRESENT_MODE_LOCAL_BIT_KHX = 0x00000001, + VK_DEVICE_GROUP_PRESENT_MODE_REMOTE_BIT_KHX = 0x00000002, + VK_DEVICE_GROUP_PRESENT_MODE_SUM_BIT_KHX = 0x00000004, + VK_DEVICE_GROUP_PRESENT_MODE_LOCAL_MULTI_DEVICE_BIT_KHX = 0x00000008, + VK_DEVICE_GROUP_PRESENT_MODE_FLAG_BITS_MAX_ENUM_KHX = 0x7FFFFFFF +} VkDeviceGroupPresentModeFlagBitsKHX; +typedef VkFlags VkDeviceGroupPresentModeFlagsKHX; + +typedef struct VkMemoryAllocateFlagsInfoKHX { + VkStructureType sType; + const void* pNext; + VkMemoryAllocateFlagsKHX flags; + uint32_t deviceMask; +} VkMemoryAllocateFlagsInfoKHX; + +typedef struct VkBindBufferMemoryInfoKHX { + VkStructureType sType; + const void* pNext; + VkBuffer buffer; + VkDeviceMemory memory; + VkDeviceSize memoryOffset; + uint32_t deviceIndexCount; + const uint32_t* pDeviceIndices; +} VkBindBufferMemoryInfoKHX; + +typedef struct VkBindImageMemoryInfoKHX { + VkStructureType sType; + const void* pNext; + VkImage image; + VkDeviceMemory memory; + VkDeviceSize memoryOffset; + uint32_t deviceIndexCount; + const uint32_t* pDeviceIndices; + uint32_t SFRRectCount; + const VkRect2D* pSFRRects; +} VkBindImageMemoryInfoKHX; + +typedef struct VkDeviceGroupRenderPassBeginInfoKHX { + VkStructureType sType; + const void* pNext; + uint32_t deviceMask; + uint32_t deviceRenderAreaCount; + const VkRect2D* pDeviceRenderAreas; +} VkDeviceGroupRenderPassBeginInfoKHX; + +typedef struct VkDeviceGroupCommandBufferBeginInfoKHX { + VkStructureType sType; + const void* pNext; + uint32_t deviceMask; +} VkDeviceGroupCommandBufferBeginInfoKHX; + +typedef struct VkDeviceGroupSubmitInfoKHX { + VkStructureType sType; + const void* pNext; + uint32_t waitSemaphoreCount; + const uint32_t* pWaitSemaphoreDeviceIndices; + uint32_t commandBufferCount; + const uint32_t* pCommandBufferDeviceMasks; + uint32_t signalSemaphoreCount; + const uint32_t* pSignalSemaphoreDeviceIndices; +} 
VkDeviceGroupSubmitInfoKHX; + +typedef struct VkDeviceGroupBindSparseInfoKHX { + VkStructureType sType; + const void* pNext; + uint32_t resourceDeviceIndex; + uint32_t memoryDeviceIndex; +} VkDeviceGroupBindSparseInfoKHX; + +typedef struct VkDeviceGroupPresentCapabilitiesKHX { + VkStructureType sType; + const void* pNext; + uint32_t presentMask[VK_MAX_DEVICE_GROUP_SIZE_KHX]; + VkDeviceGroupPresentModeFlagsKHX modes; +} VkDeviceGroupPresentCapabilitiesKHX; + +typedef struct VkImageSwapchainCreateInfoKHX { + VkStructureType sType; + const void* pNext; + VkSwapchainKHR swapchain; +} VkImageSwapchainCreateInfoKHX; + +typedef struct VkBindImageMemorySwapchainInfoKHX { + VkStructureType sType; + const void* pNext; + VkSwapchainKHR swapchain; + uint32_t imageIndex; +} VkBindImageMemorySwapchainInfoKHX; + +typedef struct VkAcquireNextImageInfoKHX { + VkStructureType sType; + const void* pNext; + VkSwapchainKHR swapchain; + uint64_t timeout; + VkSemaphore semaphore; + VkFence fence; + uint32_t deviceMask; +} VkAcquireNextImageInfoKHX; + +typedef struct VkDeviceGroupPresentInfoKHX { + VkStructureType sType; + const void* pNext; + uint32_t swapchainCount; + const uint32_t* pDeviceMasks; + VkDeviceGroupPresentModeFlagBitsKHX mode; +} VkDeviceGroupPresentInfoKHX; + +typedef struct VkDeviceGroupSwapchainCreateInfoKHX { + VkStructureType sType; + const void* pNext; + VkDeviceGroupPresentModeFlagsKHX modes; +} VkDeviceGroupSwapchainCreateInfoKHX; + + +typedef void (VKAPI_PTR *PFN_vkGetDeviceGroupPeerMemoryFeaturesKHX)(VkDevice device, uint32_t heapIndex, uint32_t localDeviceIndex, uint32_t remoteDeviceIndex, VkPeerMemoryFeatureFlagsKHX* pPeerMemoryFeatures); +typedef VkResult (VKAPI_PTR *PFN_vkBindBufferMemory2KHX)(VkDevice device, uint32_t bindInfoCount, const VkBindBufferMemoryInfoKHX* pBindInfos); +typedef VkResult (VKAPI_PTR *PFN_vkBindImageMemory2KHX)(VkDevice device, uint32_t bindInfoCount, const VkBindImageMemoryInfoKHX* pBindInfos); +typedef void (VKAPI_PTR *PFN_vkCmdSetDeviceMaskKHX)(VkCommandBuffer commandBuffer, uint32_t deviceMask); +typedef VkResult (VKAPI_PTR *PFN_vkGetDeviceGroupPresentCapabilitiesKHX)(VkDevice device, VkDeviceGroupPresentCapabilitiesKHX* pDeviceGroupPresentCapabilities); +typedef VkResult (VKAPI_PTR *PFN_vkGetDeviceGroupSurfacePresentModesKHX)(VkDevice device, VkSurfaceKHR surface, VkDeviceGroupPresentModeFlagsKHX* pModes); +typedef VkResult (VKAPI_PTR *PFN_vkAcquireNextImage2KHX)(VkDevice device, const VkAcquireNextImageInfoKHX* pAcquireInfo, uint32_t* pImageIndex); +typedef void (VKAPI_PTR *PFN_vkCmdDispatchBaseKHX)(VkCommandBuffer commandBuffer, uint32_t baseGroupX, uint32_t baseGroupY, uint32_t baseGroupZ, uint32_t groupCountX, uint32_t groupCountY, uint32_t groupCountZ); +typedef VkResult (VKAPI_PTR *PFN_vkGetPhysicalDevicePresentRectanglesKHX)(VkPhysicalDevice physicalDevice, VkSurfaceKHR surface, uint32_t* pRectCount, VkRect2D* pRects); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR void VKAPI_CALL vkGetDeviceGroupPeerMemoryFeaturesKHX( + VkDevice device, + uint32_t heapIndex, + uint32_t localDeviceIndex, + uint32_t remoteDeviceIndex, + VkPeerMemoryFeatureFlagsKHX* pPeerMemoryFeatures); + +VKAPI_ATTR VkResult VKAPI_CALL vkBindBufferMemory2KHX( + VkDevice device, + uint32_t bindInfoCount, + const VkBindBufferMemoryInfoKHX* pBindInfos); + +VKAPI_ATTR VkResult VKAPI_CALL vkBindImageMemory2KHX( + VkDevice device, + uint32_t bindInfoCount, + const VkBindImageMemoryInfoKHX* pBindInfos); + +VKAPI_ATTR void VKAPI_CALL vkCmdSetDeviceMaskKHX( + VkCommandBuffer commandBuffer, + 
uint32_t deviceMask); + +VKAPI_ATTR VkResult VKAPI_CALL vkGetDeviceGroupPresentCapabilitiesKHX( + VkDevice device, + VkDeviceGroupPresentCapabilitiesKHX* pDeviceGroupPresentCapabilities); + +VKAPI_ATTR VkResult VKAPI_CALL vkGetDeviceGroupSurfacePresentModesKHX( + VkDevice device, + VkSurfaceKHR surface, + VkDeviceGroupPresentModeFlagsKHX* pModes); + +VKAPI_ATTR VkResult VKAPI_CALL vkAcquireNextImage2KHX( + VkDevice device, + const VkAcquireNextImageInfoKHX* pAcquireInfo, + uint32_t* pImageIndex); + +VKAPI_ATTR void VKAPI_CALL vkCmdDispatchBaseKHX( + VkCommandBuffer commandBuffer, + uint32_t baseGroupX, + uint32_t baseGroupY, + uint32_t baseGroupZ, + uint32_t groupCountX, + uint32_t groupCountY, + uint32_t groupCountZ); + +VKAPI_ATTR VkResult VKAPI_CALL vkGetPhysicalDevicePresentRectanglesKHX( + VkPhysicalDevice physicalDevice, + VkSurfaceKHR surface, + uint32_t* pRectCount, + VkRect2D* pRects); +#endif + +#define VK_EXT_validation_flags 1 +#define VK_EXT_VALIDATION_FLAGS_SPEC_VERSION 1 +#define VK_EXT_VALIDATION_FLAGS_EXTENSION_NAME "VK_EXT_validation_flags" + + +typedef enum VkValidationCheckEXT { + VK_VALIDATION_CHECK_ALL_EXT = 0, + VK_VALIDATION_CHECK_SHADERS_EXT = 1, + VK_VALIDATION_CHECK_BEGIN_RANGE_EXT = VK_VALIDATION_CHECK_ALL_EXT, + VK_VALIDATION_CHECK_END_RANGE_EXT = VK_VALIDATION_CHECK_SHADERS_EXT, + VK_VALIDATION_CHECK_RANGE_SIZE_EXT = (VK_VALIDATION_CHECK_SHADERS_EXT - VK_VALIDATION_CHECK_ALL_EXT + 1), + VK_VALIDATION_CHECK_MAX_ENUM_EXT = 0x7FFFFFFF +} VkValidationCheckEXT; typedef struct VkValidationFlagsEXT { VkStructureType sType; const void* pNext; - uint32_t disabledValidationCheckCount; - VkValidationCheckEXT* pDisabledValidationChecks; -} VkValidationFlagsEXT; + uint32_t disabledValidationCheckCount; + VkValidationCheckEXT* pDisabledValidationChecks; +} VkValidationFlagsEXT; + + + +#ifdef VK_USE_PLATFORM_VI_NN +#define VK_NN_vi_surface 1 +#define VK_NN_VI_SURFACE_SPEC_VERSION 1 +#define VK_NN_VI_SURFACE_EXTENSION_NAME "VK_NN_vi_surface" + +typedef VkFlags VkViSurfaceCreateFlagsNN; + +typedef struct VkViSurfaceCreateInfoNN { + VkStructureType sType; + const void* pNext; + VkViSurfaceCreateFlagsNN flags; + void* window; +} VkViSurfaceCreateInfoNN; + + +typedef VkResult (VKAPI_PTR *PFN_vkCreateViSurfaceNN)(VkInstance instance, const VkViSurfaceCreateInfoNN* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkSurfaceKHR* pSurface); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR VkResult VKAPI_CALL vkCreateViSurfaceNN( + VkInstance instance, + const VkViSurfaceCreateInfoNN* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkSurfaceKHR* pSurface); +#endif +#endif /* VK_USE_PLATFORM_VI_NN */ + +#define VK_EXT_shader_subgroup_ballot 1 +#define VK_EXT_SHADER_SUBGROUP_BALLOT_SPEC_VERSION 1 +#define VK_EXT_SHADER_SUBGROUP_BALLOT_EXTENSION_NAME "VK_EXT_shader_subgroup_ballot" + + +#define VK_EXT_shader_subgroup_vote 1 +#define VK_EXT_SHADER_SUBGROUP_VOTE_SPEC_VERSION 1 +#define VK_EXT_SHADER_SUBGROUP_VOTE_EXTENSION_NAME "VK_EXT_shader_subgroup_vote" + + +#define VK_KHX_device_group_creation 1 +#define VK_KHX_DEVICE_GROUP_CREATION_SPEC_VERSION 1 +#define VK_KHX_DEVICE_GROUP_CREATION_EXTENSION_NAME "VK_KHX_device_group_creation" + +typedef struct VkPhysicalDeviceGroupPropertiesKHX { + VkStructureType sType; + void* pNext; + uint32_t physicalDeviceCount; + VkPhysicalDevice physicalDevices[VK_MAX_DEVICE_GROUP_SIZE_KHX]; + VkBool32 subsetAllocation; +} VkPhysicalDeviceGroupPropertiesKHX; + +typedef struct VkDeviceGroupDeviceCreateInfoKHX { + VkStructureType sType; + const void* 
pNext; + uint32_t physicalDeviceCount; + const VkPhysicalDevice* pPhysicalDevices; +} VkDeviceGroupDeviceCreateInfoKHX; + + +typedef VkResult (VKAPI_PTR *PFN_vkEnumeratePhysicalDeviceGroupsKHX)(VkInstance instance, uint32_t* pPhysicalDeviceGroupCount, VkPhysicalDeviceGroupPropertiesKHX* pPhysicalDeviceGroupProperties); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR VkResult VKAPI_CALL vkEnumeratePhysicalDeviceGroupsKHX( + VkInstance instance, + uint32_t* pPhysicalDeviceGroupCount, + VkPhysicalDeviceGroupPropertiesKHX* pPhysicalDeviceGroupProperties); +#endif + +#define VK_NVX_device_generated_commands 1 +VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkObjectTableNVX) +VK_DEFINE_NON_DISPATCHABLE_HANDLE(VkIndirectCommandsLayoutNVX) + +#define VK_NVX_DEVICE_GENERATED_COMMANDS_SPEC_VERSION 3 +#define VK_NVX_DEVICE_GENERATED_COMMANDS_EXTENSION_NAME "VK_NVX_device_generated_commands" + + +typedef enum VkIndirectCommandsTokenTypeNVX { + VK_INDIRECT_COMMANDS_TOKEN_TYPE_PIPELINE_NVX = 0, + VK_INDIRECT_COMMANDS_TOKEN_TYPE_DESCRIPTOR_SET_NVX = 1, + VK_INDIRECT_COMMANDS_TOKEN_TYPE_INDEX_BUFFER_NVX = 2, + VK_INDIRECT_COMMANDS_TOKEN_TYPE_VERTEX_BUFFER_NVX = 3, + VK_INDIRECT_COMMANDS_TOKEN_TYPE_PUSH_CONSTANT_NVX = 4, + VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_INDEXED_NVX = 5, + VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_NVX = 6, + VK_INDIRECT_COMMANDS_TOKEN_TYPE_DISPATCH_NVX = 7, + VK_INDIRECT_COMMANDS_TOKEN_TYPE_BEGIN_RANGE_NVX = VK_INDIRECT_COMMANDS_TOKEN_TYPE_PIPELINE_NVX, + VK_INDIRECT_COMMANDS_TOKEN_TYPE_END_RANGE_NVX = VK_INDIRECT_COMMANDS_TOKEN_TYPE_DISPATCH_NVX, + VK_INDIRECT_COMMANDS_TOKEN_TYPE_RANGE_SIZE_NVX = (VK_INDIRECT_COMMANDS_TOKEN_TYPE_DISPATCH_NVX - VK_INDIRECT_COMMANDS_TOKEN_TYPE_PIPELINE_NVX + 1), + VK_INDIRECT_COMMANDS_TOKEN_TYPE_MAX_ENUM_NVX = 0x7FFFFFFF +} VkIndirectCommandsTokenTypeNVX; + +typedef enum VkObjectEntryTypeNVX { + VK_OBJECT_ENTRY_TYPE_DESCRIPTOR_SET_NVX = 0, + VK_OBJECT_ENTRY_TYPE_PIPELINE_NVX = 1, + VK_OBJECT_ENTRY_TYPE_INDEX_BUFFER_NVX = 2, + VK_OBJECT_ENTRY_TYPE_VERTEX_BUFFER_NVX = 3, + VK_OBJECT_ENTRY_TYPE_PUSH_CONSTANT_NVX = 4, + VK_OBJECT_ENTRY_TYPE_BEGIN_RANGE_NVX = VK_OBJECT_ENTRY_TYPE_DESCRIPTOR_SET_NVX, + VK_OBJECT_ENTRY_TYPE_END_RANGE_NVX = VK_OBJECT_ENTRY_TYPE_PUSH_CONSTANT_NVX, + VK_OBJECT_ENTRY_TYPE_RANGE_SIZE_NVX = (VK_OBJECT_ENTRY_TYPE_PUSH_CONSTANT_NVX - VK_OBJECT_ENTRY_TYPE_DESCRIPTOR_SET_NVX + 1), + VK_OBJECT_ENTRY_TYPE_MAX_ENUM_NVX = 0x7FFFFFFF +} VkObjectEntryTypeNVX; + + +typedef enum VkIndirectCommandsLayoutUsageFlagBitsNVX { + VK_INDIRECT_COMMANDS_LAYOUT_USAGE_UNORDERED_SEQUENCES_BIT_NVX = 0x00000001, + VK_INDIRECT_COMMANDS_LAYOUT_USAGE_SPARSE_SEQUENCES_BIT_NVX = 0x00000002, + VK_INDIRECT_COMMANDS_LAYOUT_USAGE_EMPTY_EXECUTIONS_BIT_NVX = 0x00000004, + VK_INDIRECT_COMMANDS_LAYOUT_USAGE_INDEXED_SEQUENCES_BIT_NVX = 0x00000008, + VK_INDIRECT_COMMANDS_LAYOUT_USAGE_FLAG_BITS_MAX_ENUM_NVX = 0x7FFFFFFF +} VkIndirectCommandsLayoutUsageFlagBitsNVX; +typedef VkFlags VkIndirectCommandsLayoutUsageFlagsNVX; + +typedef enum VkObjectEntryUsageFlagBitsNVX { + VK_OBJECT_ENTRY_USAGE_GRAPHICS_BIT_NVX = 0x00000001, + VK_OBJECT_ENTRY_USAGE_COMPUTE_BIT_NVX = 0x00000002, + VK_OBJECT_ENTRY_USAGE_FLAG_BITS_MAX_ENUM_NVX = 0x7FFFFFFF +} VkObjectEntryUsageFlagBitsNVX; +typedef VkFlags VkObjectEntryUsageFlagsNVX; + +typedef struct VkDeviceGeneratedCommandsFeaturesNVX { + VkStructureType sType; + const void* pNext; + VkBool32 computeBindingPointSupport; +} VkDeviceGeneratedCommandsFeaturesNVX; + +typedef struct VkDeviceGeneratedCommandsLimitsNVX { + VkStructureType sType; + const void* pNext; + uint32_t 
maxIndirectCommandsLayoutTokenCount; + uint32_t maxObjectEntryCounts; + uint32_t minSequenceCountBufferOffsetAlignment; + uint32_t minSequenceIndexBufferOffsetAlignment; + uint32_t minCommandsTokenBufferOffsetAlignment; +} VkDeviceGeneratedCommandsLimitsNVX; + +typedef struct VkIndirectCommandsTokenNVX { + VkIndirectCommandsTokenTypeNVX tokenType; + VkBuffer buffer; + VkDeviceSize offset; +} VkIndirectCommandsTokenNVX; + +typedef struct VkIndirectCommandsLayoutTokenNVX { + VkIndirectCommandsTokenTypeNVX tokenType; + uint32_t bindingUnit; + uint32_t dynamicCount; + uint32_t divisor; +} VkIndirectCommandsLayoutTokenNVX; + +typedef struct VkIndirectCommandsLayoutCreateInfoNVX { + VkStructureType sType; + const void* pNext; + VkPipelineBindPoint pipelineBindPoint; + VkIndirectCommandsLayoutUsageFlagsNVX flags; + uint32_t tokenCount; + const VkIndirectCommandsLayoutTokenNVX* pTokens; +} VkIndirectCommandsLayoutCreateInfoNVX; + +typedef struct VkCmdProcessCommandsInfoNVX { + VkStructureType sType; + const void* pNext; + VkObjectTableNVX objectTable; + VkIndirectCommandsLayoutNVX indirectCommandsLayout; + uint32_t indirectCommandsTokenCount; + const VkIndirectCommandsTokenNVX* pIndirectCommandsTokens; + uint32_t maxSequencesCount; + VkCommandBuffer targetCommandBuffer; + VkBuffer sequencesCountBuffer; + VkDeviceSize sequencesCountOffset; + VkBuffer sequencesIndexBuffer; + VkDeviceSize sequencesIndexOffset; +} VkCmdProcessCommandsInfoNVX; + +typedef struct VkCmdReserveSpaceForCommandsInfoNVX { + VkStructureType sType; + const void* pNext; + VkObjectTableNVX objectTable; + VkIndirectCommandsLayoutNVX indirectCommandsLayout; + uint32_t maxSequencesCount; +} VkCmdReserveSpaceForCommandsInfoNVX; + +typedef struct VkObjectTableCreateInfoNVX { + VkStructureType sType; + const void* pNext; + uint32_t objectCount; + const VkObjectEntryTypeNVX* pObjectEntryTypes; + const uint32_t* pObjectEntryCounts; + const VkObjectEntryUsageFlagsNVX* pObjectEntryUsageFlags; + uint32_t maxUniformBuffersPerDescriptor; + uint32_t maxStorageBuffersPerDescriptor; + uint32_t maxStorageImagesPerDescriptor; + uint32_t maxSampledImagesPerDescriptor; + uint32_t maxPipelineLayouts; +} VkObjectTableCreateInfoNVX; + +typedef struct VkObjectTableEntryNVX { + VkObjectEntryTypeNVX type; + VkObjectEntryUsageFlagsNVX flags; +} VkObjectTableEntryNVX; + +typedef struct VkObjectTablePipelineEntryNVX { + VkObjectEntryTypeNVX type; + VkObjectEntryUsageFlagsNVX flags; + VkPipeline pipeline; +} VkObjectTablePipelineEntryNVX; + +typedef struct VkObjectTableDescriptorSetEntryNVX { + VkObjectEntryTypeNVX type; + VkObjectEntryUsageFlagsNVX flags; + VkPipelineLayout pipelineLayout; + VkDescriptorSet descriptorSet; +} VkObjectTableDescriptorSetEntryNVX; + +typedef struct VkObjectTableVertexBufferEntryNVX { + VkObjectEntryTypeNVX type; + VkObjectEntryUsageFlagsNVX flags; + VkBuffer buffer; +} VkObjectTableVertexBufferEntryNVX; + +typedef struct VkObjectTableIndexBufferEntryNVX { + VkObjectEntryTypeNVX type; + VkObjectEntryUsageFlagsNVX flags; + VkBuffer buffer; + VkIndexType indexType; +} VkObjectTableIndexBufferEntryNVX; + +typedef struct VkObjectTablePushConstantEntryNVX { + VkObjectEntryTypeNVX type; + VkObjectEntryUsageFlagsNVX flags; + VkPipelineLayout pipelineLayout; + VkShaderStageFlags stageFlags; +} VkObjectTablePushConstantEntryNVX; + + +typedef void (VKAPI_PTR *PFN_vkCmdProcessCommandsNVX)(VkCommandBuffer commandBuffer, const VkCmdProcessCommandsInfoNVX* pProcessCommandsInfo); +typedef void (VKAPI_PTR 
*PFN_vkCmdReserveSpaceForCommandsNVX)(VkCommandBuffer commandBuffer, const VkCmdReserveSpaceForCommandsInfoNVX* pReserveSpaceInfo); +typedef VkResult (VKAPI_PTR *PFN_vkCreateIndirectCommandsLayoutNVX)(VkDevice device, const VkIndirectCommandsLayoutCreateInfoNVX* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkIndirectCommandsLayoutNVX* pIndirectCommandsLayout); +typedef void (VKAPI_PTR *PFN_vkDestroyIndirectCommandsLayoutNVX)(VkDevice device, VkIndirectCommandsLayoutNVX indirectCommandsLayout, const VkAllocationCallbacks* pAllocator); +typedef VkResult (VKAPI_PTR *PFN_vkCreateObjectTableNVX)(VkDevice device, const VkObjectTableCreateInfoNVX* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkObjectTableNVX* pObjectTable); +typedef void (VKAPI_PTR *PFN_vkDestroyObjectTableNVX)(VkDevice device, VkObjectTableNVX objectTable, const VkAllocationCallbacks* pAllocator); +typedef VkResult (VKAPI_PTR *PFN_vkRegisterObjectsNVX)(VkDevice device, VkObjectTableNVX objectTable, uint32_t objectCount, const VkObjectTableEntryNVX* const* ppObjectTableEntries, const uint32_t* pObjectIndices); +typedef VkResult (VKAPI_PTR *PFN_vkUnregisterObjectsNVX)(VkDevice device, VkObjectTableNVX objectTable, uint32_t objectCount, const VkObjectEntryTypeNVX* pObjectEntryTypes, const uint32_t* pObjectIndices); +typedef void (VKAPI_PTR *PFN_vkGetPhysicalDeviceGeneratedCommandsPropertiesNVX)(VkPhysicalDevice physicalDevice, VkDeviceGeneratedCommandsFeaturesNVX* pFeatures, VkDeviceGeneratedCommandsLimitsNVX* pLimits); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR void VKAPI_CALL vkCmdProcessCommandsNVX( + VkCommandBuffer commandBuffer, + const VkCmdProcessCommandsInfoNVX* pProcessCommandsInfo); + +VKAPI_ATTR void VKAPI_CALL vkCmdReserveSpaceForCommandsNVX( + VkCommandBuffer commandBuffer, + const VkCmdReserveSpaceForCommandsInfoNVX* pReserveSpaceInfo); + +VKAPI_ATTR VkResult VKAPI_CALL vkCreateIndirectCommandsLayoutNVX( + VkDevice device, + const VkIndirectCommandsLayoutCreateInfoNVX* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkIndirectCommandsLayoutNVX* pIndirectCommandsLayout); + +VKAPI_ATTR void VKAPI_CALL vkDestroyIndirectCommandsLayoutNVX( + VkDevice device, + VkIndirectCommandsLayoutNVX indirectCommandsLayout, + const VkAllocationCallbacks* pAllocator); + +VKAPI_ATTR VkResult VKAPI_CALL vkCreateObjectTableNVX( + VkDevice device, + const VkObjectTableCreateInfoNVX* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkObjectTableNVX* pObjectTable); + +VKAPI_ATTR void VKAPI_CALL vkDestroyObjectTableNVX( + VkDevice device, + VkObjectTableNVX objectTable, + const VkAllocationCallbacks* pAllocator); + +VKAPI_ATTR VkResult VKAPI_CALL vkRegisterObjectsNVX( + VkDevice device, + VkObjectTableNVX objectTable, + uint32_t objectCount, + const VkObjectTableEntryNVX* const* ppObjectTableEntries, + const uint32_t* pObjectIndices); + +VKAPI_ATTR VkResult VKAPI_CALL vkUnregisterObjectsNVX( + VkDevice device, + VkObjectTableNVX objectTable, + uint32_t objectCount, + const VkObjectEntryTypeNVX* pObjectEntryTypes, + const uint32_t* pObjectIndices); + +VKAPI_ATTR void VKAPI_CALL vkGetPhysicalDeviceGeneratedCommandsPropertiesNVX( + VkPhysicalDevice physicalDevice, + VkDeviceGeneratedCommandsFeaturesNVX* pFeatures, + VkDeviceGeneratedCommandsLimitsNVX* pLimits); +#endif + +#define VK_NV_clip_space_w_scaling 1 +#define VK_NV_CLIP_SPACE_W_SCALING_SPEC_VERSION 1 +#define VK_NV_CLIP_SPACE_W_SCALING_EXTENSION_NAME "VK_NV_clip_space_w_scaling" + +typedef struct VkViewportWScalingNV { + float xcoeff; + float ycoeff; +} 
VkViewportWScalingNV;
+
+typedef struct VkPipelineViewportWScalingStateCreateInfoNV {
+    VkStructureType                sType;
+    const void*                    pNext;
+    VkBool32                       viewportWScalingEnable;
+    uint32_t                       viewportCount;
+    const VkViewportWScalingNV*    pViewportWScalings;
+} VkPipelineViewportWScalingStateCreateInfoNV;
+
+
+typedef void (VKAPI_PTR *PFN_vkCmdSetViewportWScalingNV)(VkCommandBuffer commandBuffer, uint32_t firstViewport, uint32_t viewportCount, const VkViewportWScalingNV* pViewportWScalings);
+
+#ifndef VK_NO_PROTOTYPES
+VKAPI_ATTR void VKAPI_CALL vkCmdSetViewportWScalingNV(
+    VkCommandBuffer                             commandBuffer,
+    uint32_t                                    firstViewport,
+    uint32_t                                    viewportCount,
+    const VkViewportWScalingNV*                 pViewportWScalings);
+#endif
+
+#define VK_EXT_direct_mode_display 1
+#define VK_EXT_DIRECT_MODE_DISPLAY_SPEC_VERSION 1
+#define VK_EXT_DIRECT_MODE_DISPLAY_EXTENSION_NAME "VK_EXT_direct_mode_display"
+
+typedef VkResult (VKAPI_PTR *PFN_vkReleaseDisplayEXT)(VkPhysicalDevice physicalDevice, VkDisplayKHR display);
+
+#ifndef VK_NO_PROTOTYPES
+VKAPI_ATTR VkResult VKAPI_CALL vkReleaseDisplayEXT(
+    VkPhysicalDevice                            physicalDevice,
+    VkDisplayKHR                                display);
+#endif
+
+#ifdef VK_USE_PLATFORM_XLIB_XRANDR_EXT
+#define VK_EXT_acquire_xlib_display 1
+#include <X11/extensions/Xrandr.h>
+
+#define VK_EXT_ACQUIRE_XLIB_DISPLAY_SPEC_VERSION 1
+#define VK_EXT_ACQUIRE_XLIB_DISPLAY_EXTENSION_NAME "VK_EXT_acquire_xlib_display"
+
+typedef VkResult (VKAPI_PTR *PFN_vkAcquireXlibDisplayEXT)(VkPhysicalDevice physicalDevice, Display* dpy, VkDisplayKHR display);
+typedef VkResult (VKAPI_PTR *PFN_vkGetRandROutputDisplayEXT)(VkPhysicalDevice physicalDevice, Display* dpy, RROutput rrOutput, VkDisplayKHR* pDisplay);
+
+#ifndef VK_NO_PROTOTYPES
+VKAPI_ATTR VkResult VKAPI_CALL vkAcquireXlibDisplayEXT(
+    VkPhysicalDevice                            physicalDevice,
+    Display*                                    dpy,
+    VkDisplayKHR                                display);
+
+VKAPI_ATTR VkResult VKAPI_CALL vkGetRandROutputDisplayEXT(
+    VkPhysicalDevice                            physicalDevice,
+    Display*                                    dpy,
+    RROutput                                    rrOutput,
+    VkDisplayKHR*                               pDisplay);
+#endif
+#endif /* VK_USE_PLATFORM_XLIB_XRANDR_EXT */
+
+#define VK_EXT_display_surface_counter 1
+#define VK_EXT_DISPLAY_SURFACE_COUNTER_SPEC_VERSION 1
+#define VK_EXT_DISPLAY_SURFACE_COUNTER_EXTENSION_NAME "VK_EXT_display_surface_counter"
+#define VK_STRUCTURE_TYPE_SURFACE_CAPABILITIES2_EXT VK_STRUCTURE_TYPE_SURFACE_CAPABILITIES_2_EXT
+
+
+typedef enum VkSurfaceCounterFlagBitsEXT {
+    VK_SURFACE_COUNTER_VBLANK_EXT = 0x00000001,
+    VK_SURFACE_COUNTER_FLAG_BITS_MAX_ENUM_EXT = 0x7FFFFFFF
+} VkSurfaceCounterFlagBitsEXT;
+typedef VkFlags VkSurfaceCounterFlagsEXT;
+
+typedef struct VkSurfaceCapabilities2EXT {
+    VkStructureType                  sType;
+    void*                            pNext;
+    uint32_t                         minImageCount;
+    uint32_t                         maxImageCount;
+    VkExtent2D                       currentExtent;
+    VkExtent2D                       minImageExtent;
+    VkExtent2D                       maxImageExtent;
+    uint32_t                         maxImageArrayLayers;
+    VkSurfaceTransformFlagsKHR       supportedTransforms;
+    VkSurfaceTransformFlagBitsKHR    currentTransform;
+    VkCompositeAlphaFlagsKHR         supportedCompositeAlpha;
+    VkImageUsageFlags                supportedUsageFlags;
+    VkSurfaceCounterFlagsEXT         supportedSurfaceCounters;
+} VkSurfaceCapabilities2EXT;
+
+
+typedef VkResult (VKAPI_PTR *PFN_vkGetPhysicalDeviceSurfaceCapabilities2EXT)(VkPhysicalDevice physicalDevice, VkSurfaceKHR surface, VkSurfaceCapabilities2EXT* pSurfaceCapabilities);
+
+#ifndef VK_NO_PROTOTYPES
+VKAPI_ATTR VkResult VKAPI_CALL vkGetPhysicalDeviceSurfaceCapabilities2EXT(
+    VkPhysicalDevice                            physicalDevice,
+    VkSurfaceKHR                                surface,
+    VkSurfaceCapabilities2EXT*                  pSurfaceCapabilities);
+#endif
+
+#define VK_EXT_display_control 1
+#define 
VK_EXT_DISPLAY_CONTROL_SPEC_VERSION 1 +#define VK_EXT_DISPLAY_CONTROL_EXTENSION_NAME "VK_EXT_display_control" + + +typedef enum VkDisplayPowerStateEXT { + VK_DISPLAY_POWER_STATE_OFF_EXT = 0, + VK_DISPLAY_POWER_STATE_SUSPEND_EXT = 1, + VK_DISPLAY_POWER_STATE_ON_EXT = 2, + VK_DISPLAY_POWER_STATE_BEGIN_RANGE_EXT = VK_DISPLAY_POWER_STATE_OFF_EXT, + VK_DISPLAY_POWER_STATE_END_RANGE_EXT = VK_DISPLAY_POWER_STATE_ON_EXT, + VK_DISPLAY_POWER_STATE_RANGE_SIZE_EXT = (VK_DISPLAY_POWER_STATE_ON_EXT - VK_DISPLAY_POWER_STATE_OFF_EXT + 1), + VK_DISPLAY_POWER_STATE_MAX_ENUM_EXT = 0x7FFFFFFF +} VkDisplayPowerStateEXT; + +typedef enum VkDeviceEventTypeEXT { + VK_DEVICE_EVENT_TYPE_DISPLAY_HOTPLUG_EXT = 0, + VK_DEVICE_EVENT_TYPE_BEGIN_RANGE_EXT = VK_DEVICE_EVENT_TYPE_DISPLAY_HOTPLUG_EXT, + VK_DEVICE_EVENT_TYPE_END_RANGE_EXT = VK_DEVICE_EVENT_TYPE_DISPLAY_HOTPLUG_EXT, + VK_DEVICE_EVENT_TYPE_RANGE_SIZE_EXT = (VK_DEVICE_EVENT_TYPE_DISPLAY_HOTPLUG_EXT - VK_DEVICE_EVENT_TYPE_DISPLAY_HOTPLUG_EXT + 1), + VK_DEVICE_EVENT_TYPE_MAX_ENUM_EXT = 0x7FFFFFFF +} VkDeviceEventTypeEXT; + +typedef enum VkDisplayEventTypeEXT { + VK_DISPLAY_EVENT_TYPE_FIRST_PIXEL_OUT_EXT = 0, + VK_DISPLAY_EVENT_TYPE_BEGIN_RANGE_EXT = VK_DISPLAY_EVENT_TYPE_FIRST_PIXEL_OUT_EXT, + VK_DISPLAY_EVENT_TYPE_END_RANGE_EXT = VK_DISPLAY_EVENT_TYPE_FIRST_PIXEL_OUT_EXT, + VK_DISPLAY_EVENT_TYPE_RANGE_SIZE_EXT = (VK_DISPLAY_EVENT_TYPE_FIRST_PIXEL_OUT_EXT - VK_DISPLAY_EVENT_TYPE_FIRST_PIXEL_OUT_EXT + 1), + VK_DISPLAY_EVENT_TYPE_MAX_ENUM_EXT = 0x7FFFFFFF +} VkDisplayEventTypeEXT; + +typedef struct VkDisplayPowerInfoEXT { + VkStructureType sType; + const void* pNext; + VkDisplayPowerStateEXT powerState; +} VkDisplayPowerInfoEXT; + +typedef struct VkDeviceEventInfoEXT { + VkStructureType sType; + const void* pNext; + VkDeviceEventTypeEXT deviceEvent; +} VkDeviceEventInfoEXT; + +typedef struct VkDisplayEventInfoEXT { + VkStructureType sType; + const void* pNext; + VkDisplayEventTypeEXT displayEvent; +} VkDisplayEventInfoEXT; + +typedef struct VkSwapchainCounterCreateInfoEXT { + VkStructureType sType; + const void* pNext; + VkSurfaceCounterFlagsEXT surfaceCounters; +} VkSwapchainCounterCreateInfoEXT; + + +typedef VkResult (VKAPI_PTR *PFN_vkDisplayPowerControlEXT)(VkDevice device, VkDisplayKHR display, const VkDisplayPowerInfoEXT* pDisplayPowerInfo); +typedef VkResult (VKAPI_PTR *PFN_vkRegisterDeviceEventEXT)(VkDevice device, const VkDeviceEventInfoEXT* pDeviceEventInfo, const VkAllocationCallbacks* pAllocator, VkFence* pFence); +typedef VkResult (VKAPI_PTR *PFN_vkRegisterDisplayEventEXT)(VkDevice device, VkDisplayKHR display, const VkDisplayEventInfoEXT* pDisplayEventInfo, const VkAllocationCallbacks* pAllocator, VkFence* pFence); +typedef VkResult (VKAPI_PTR *PFN_vkGetSwapchainCounterEXT)(VkDevice device, VkSwapchainKHR swapchain, VkSurfaceCounterFlagBitsEXT counter, uint64_t* pCounterValue); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR VkResult VKAPI_CALL vkDisplayPowerControlEXT( + VkDevice device, + VkDisplayKHR display, + const VkDisplayPowerInfoEXT* pDisplayPowerInfo); + +VKAPI_ATTR VkResult VKAPI_CALL vkRegisterDeviceEventEXT( + VkDevice device, + const VkDeviceEventInfoEXT* pDeviceEventInfo, + const VkAllocationCallbacks* pAllocator, + VkFence* pFence); + +VKAPI_ATTR VkResult VKAPI_CALL vkRegisterDisplayEventEXT( + VkDevice device, + VkDisplayKHR display, + const VkDisplayEventInfoEXT* pDisplayEventInfo, + const VkAllocationCallbacks* pAllocator, + VkFence* pFence); + +VKAPI_ATTR VkResult VKAPI_CALL vkGetSwapchainCounterEXT( + VkDevice device, + VkSwapchainKHR 
swapchain, + VkSurfaceCounterFlagBitsEXT counter, + uint64_t* pCounterValue); +#endif + +#define VK_GOOGLE_display_timing 1 +#define VK_GOOGLE_DISPLAY_TIMING_SPEC_VERSION 1 +#define VK_GOOGLE_DISPLAY_TIMING_EXTENSION_NAME "VK_GOOGLE_display_timing" + +typedef struct VkRefreshCycleDurationGOOGLE { + uint64_t refreshDuration; +} VkRefreshCycleDurationGOOGLE; + +typedef struct VkPastPresentationTimingGOOGLE { + uint32_t presentID; + uint64_t desiredPresentTime; + uint64_t actualPresentTime; + uint64_t earliestPresentTime; + uint64_t presentMargin; +} VkPastPresentationTimingGOOGLE; + +typedef struct VkPresentTimeGOOGLE { + uint32_t presentID; + uint64_t desiredPresentTime; +} VkPresentTimeGOOGLE; + +typedef struct VkPresentTimesInfoGOOGLE { + VkStructureType sType; + const void* pNext; + uint32_t swapchainCount; + const VkPresentTimeGOOGLE* pTimes; +} VkPresentTimesInfoGOOGLE; + + +typedef VkResult (VKAPI_PTR *PFN_vkGetRefreshCycleDurationGOOGLE)(VkDevice device, VkSwapchainKHR swapchain, VkRefreshCycleDurationGOOGLE* pDisplayTimingProperties); +typedef VkResult (VKAPI_PTR *PFN_vkGetPastPresentationTimingGOOGLE)(VkDevice device, VkSwapchainKHR swapchain, uint32_t* pPresentationTimingCount, VkPastPresentationTimingGOOGLE* pPresentationTimings); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR VkResult VKAPI_CALL vkGetRefreshCycleDurationGOOGLE( + VkDevice device, + VkSwapchainKHR swapchain, + VkRefreshCycleDurationGOOGLE* pDisplayTimingProperties); + +VKAPI_ATTR VkResult VKAPI_CALL vkGetPastPresentationTimingGOOGLE( + VkDevice device, + VkSwapchainKHR swapchain, + uint32_t* pPresentationTimingCount, + VkPastPresentationTimingGOOGLE* pPresentationTimings); +#endif + +#define VK_NV_sample_mask_override_coverage 1 +#define VK_NV_SAMPLE_MASK_OVERRIDE_COVERAGE_SPEC_VERSION 1 +#define VK_NV_SAMPLE_MASK_OVERRIDE_COVERAGE_EXTENSION_NAME "VK_NV_sample_mask_override_coverage" + + +#define VK_NV_geometry_shader_passthrough 1 +#define VK_NV_GEOMETRY_SHADER_PASSTHROUGH_SPEC_VERSION 1 +#define VK_NV_GEOMETRY_SHADER_PASSTHROUGH_EXTENSION_NAME "VK_NV_geometry_shader_passthrough" + + +#define VK_NV_viewport_array2 1 +#define VK_NV_VIEWPORT_ARRAY2_SPEC_VERSION 1 +#define VK_NV_VIEWPORT_ARRAY2_EXTENSION_NAME "VK_NV_viewport_array2" + + +#define VK_NVX_multiview_per_view_attributes 1 +#define VK_NVX_MULTIVIEW_PER_VIEW_ATTRIBUTES_SPEC_VERSION 1 +#define VK_NVX_MULTIVIEW_PER_VIEW_ATTRIBUTES_EXTENSION_NAME "VK_NVX_multiview_per_view_attributes" + +typedef struct VkPhysicalDeviceMultiviewPerViewAttributesPropertiesNVX { + VkStructureType sType; + void* pNext; + VkBool32 perViewPositionAllComponents; +} VkPhysicalDeviceMultiviewPerViewAttributesPropertiesNVX; + + + +#define VK_NV_viewport_swizzle 1 +#define VK_NV_VIEWPORT_SWIZZLE_SPEC_VERSION 1 +#define VK_NV_VIEWPORT_SWIZZLE_EXTENSION_NAME "VK_NV_viewport_swizzle" + + +typedef enum VkViewportCoordinateSwizzleNV { + VK_VIEWPORT_COORDINATE_SWIZZLE_POSITIVE_X_NV = 0, + VK_VIEWPORT_COORDINATE_SWIZZLE_NEGATIVE_X_NV = 1, + VK_VIEWPORT_COORDINATE_SWIZZLE_POSITIVE_Y_NV = 2, + VK_VIEWPORT_COORDINATE_SWIZZLE_NEGATIVE_Y_NV = 3, + VK_VIEWPORT_COORDINATE_SWIZZLE_POSITIVE_Z_NV = 4, + VK_VIEWPORT_COORDINATE_SWIZZLE_NEGATIVE_Z_NV = 5, + VK_VIEWPORT_COORDINATE_SWIZZLE_POSITIVE_W_NV = 6, + VK_VIEWPORT_COORDINATE_SWIZZLE_NEGATIVE_W_NV = 7, + VK_VIEWPORT_COORDINATE_SWIZZLE_BEGIN_RANGE_NV = VK_VIEWPORT_COORDINATE_SWIZZLE_POSITIVE_X_NV, + VK_VIEWPORT_COORDINATE_SWIZZLE_END_RANGE_NV = VK_VIEWPORT_COORDINATE_SWIZZLE_NEGATIVE_W_NV, + VK_VIEWPORT_COORDINATE_SWIZZLE_RANGE_SIZE_NV = 
(VK_VIEWPORT_COORDINATE_SWIZZLE_NEGATIVE_W_NV - VK_VIEWPORT_COORDINATE_SWIZZLE_POSITIVE_X_NV + 1), + VK_VIEWPORT_COORDINATE_SWIZZLE_MAX_ENUM_NV = 0x7FFFFFFF +} VkViewportCoordinateSwizzleNV; + +typedef VkFlags VkPipelineViewportSwizzleStateCreateFlagsNV; + +typedef struct VkViewportSwizzleNV { + VkViewportCoordinateSwizzleNV x; + VkViewportCoordinateSwizzleNV y; + VkViewportCoordinateSwizzleNV z; + VkViewportCoordinateSwizzleNV w; +} VkViewportSwizzleNV; + +typedef struct VkPipelineViewportSwizzleStateCreateInfoNV { + VkStructureType sType; + const void* pNext; + VkPipelineViewportSwizzleStateCreateFlagsNV flags; + uint32_t viewportCount; + const VkViewportSwizzleNV* pViewportSwizzles; +} VkPipelineViewportSwizzleStateCreateInfoNV; + + + +#define VK_EXT_discard_rectangles 1 +#define VK_EXT_DISCARD_RECTANGLES_SPEC_VERSION 1 +#define VK_EXT_DISCARD_RECTANGLES_EXTENSION_NAME "VK_EXT_discard_rectangles" + + +typedef enum VkDiscardRectangleModeEXT { + VK_DISCARD_RECTANGLE_MODE_INCLUSIVE_EXT = 0, + VK_DISCARD_RECTANGLE_MODE_EXCLUSIVE_EXT = 1, + VK_DISCARD_RECTANGLE_MODE_BEGIN_RANGE_EXT = VK_DISCARD_RECTANGLE_MODE_INCLUSIVE_EXT, + VK_DISCARD_RECTANGLE_MODE_END_RANGE_EXT = VK_DISCARD_RECTANGLE_MODE_EXCLUSIVE_EXT, + VK_DISCARD_RECTANGLE_MODE_RANGE_SIZE_EXT = (VK_DISCARD_RECTANGLE_MODE_EXCLUSIVE_EXT - VK_DISCARD_RECTANGLE_MODE_INCLUSIVE_EXT + 1), + VK_DISCARD_RECTANGLE_MODE_MAX_ENUM_EXT = 0x7FFFFFFF +} VkDiscardRectangleModeEXT; + +typedef VkFlags VkPipelineDiscardRectangleStateCreateFlagsEXT; + +typedef struct VkPhysicalDeviceDiscardRectanglePropertiesEXT { + VkStructureType sType; + void* pNext; + uint32_t maxDiscardRectangles; +} VkPhysicalDeviceDiscardRectanglePropertiesEXT; + +typedef struct VkPipelineDiscardRectangleStateCreateInfoEXT { + VkStructureType sType; + const void* pNext; + VkPipelineDiscardRectangleStateCreateFlagsEXT flags; + VkDiscardRectangleModeEXT discardRectangleMode; + uint32_t discardRectangleCount; + const VkRect2D* pDiscardRectangles; +} VkPipelineDiscardRectangleStateCreateInfoEXT; + + +typedef void (VKAPI_PTR *PFN_vkCmdSetDiscardRectangleEXT)(VkCommandBuffer commandBuffer, uint32_t firstDiscardRectangle, uint32_t discardRectangleCount, const VkRect2D* pDiscardRectangles); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR void VKAPI_CALL vkCmdSetDiscardRectangleEXT( + VkCommandBuffer commandBuffer, + uint32_t firstDiscardRectangle, + uint32_t discardRectangleCount, + const VkRect2D* pDiscardRectangles); +#endif + +#define VK_EXT_swapchain_colorspace 1 +#define VK_EXT_SWAPCHAIN_COLOR_SPACE_SPEC_VERSION 3 +#define VK_EXT_SWAPCHAIN_COLOR_SPACE_EXTENSION_NAME "VK_EXT_swapchain_colorspace" + + +#define VK_EXT_hdr_metadata 1 +#define VK_EXT_HDR_METADATA_SPEC_VERSION 1 +#define VK_EXT_HDR_METADATA_EXTENSION_NAME "VK_EXT_hdr_metadata" + +typedef struct VkXYColorEXT { + float x; + float y; +} VkXYColorEXT; + +typedef struct VkHdrMetadataEXT { + VkStructureType sType; + const void* pNext; + VkXYColorEXT displayPrimaryRed; + VkXYColorEXT displayPrimaryGreen; + VkXYColorEXT displayPrimaryBlue; + VkXYColorEXT whitePoint; + float maxLuminance; + float minLuminance; + float maxContentLightLevel; + float maxFrameAverageLightLevel; +} VkHdrMetadataEXT; + + +typedef void (VKAPI_PTR *PFN_vkSetHdrMetadataEXT)(VkDevice device, uint32_t swapchainCount, const VkSwapchainKHR* pSwapchains, const VkHdrMetadataEXT* pMetadata); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR void VKAPI_CALL vkSetHdrMetadataEXT( + VkDevice device, + uint32_t swapchainCount, + const VkSwapchainKHR* pSwapchains, + const 
VkHdrMetadataEXT* pMetadata); +#endif + +#ifdef VK_USE_PLATFORM_IOS_MVK +#define VK_MVK_ios_surface 1 +#define VK_MVK_IOS_SURFACE_SPEC_VERSION 2 +#define VK_MVK_IOS_SURFACE_EXTENSION_NAME "VK_MVK_ios_surface" + +typedef VkFlags VkIOSSurfaceCreateFlagsMVK; + +typedef struct VkIOSSurfaceCreateInfoMVK { + VkStructureType sType; + const void* pNext; + VkIOSSurfaceCreateFlagsMVK flags; + const void* pView; +} VkIOSSurfaceCreateInfoMVK; + + +typedef VkResult (VKAPI_PTR *PFN_vkCreateIOSSurfaceMVK)(VkInstance instance, const VkIOSSurfaceCreateInfoMVK* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkSurfaceKHR* pSurface); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR VkResult VKAPI_CALL vkCreateIOSSurfaceMVK( + VkInstance instance, + const VkIOSSurfaceCreateInfoMVK* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkSurfaceKHR* pSurface); +#endif +#endif /* VK_USE_PLATFORM_IOS_MVK */ + +#ifdef VK_USE_PLATFORM_MACOS_MVK +#define VK_MVK_macos_surface 1 +#define VK_MVK_MACOS_SURFACE_SPEC_VERSION 2 +#define VK_MVK_MACOS_SURFACE_EXTENSION_NAME "VK_MVK_macos_surface" + +typedef VkFlags VkMacOSSurfaceCreateFlagsMVK; + +typedef struct VkMacOSSurfaceCreateInfoMVK { + VkStructureType sType; + const void* pNext; + VkMacOSSurfaceCreateFlagsMVK flags; + const void* pView; +} VkMacOSSurfaceCreateInfoMVK; + + +typedef VkResult (VKAPI_PTR *PFN_vkCreateMacOSSurfaceMVK)(VkInstance instance, const VkMacOSSurfaceCreateInfoMVK* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkSurfaceKHR* pSurface); + +#ifndef VK_NO_PROTOTYPES +VKAPI_ATTR VkResult VKAPI_CALL vkCreateMacOSSurfaceMVK( + VkInstance instance, + const VkMacOSSurfaceCreateInfoMVK* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkSurfaceKHR* pSurface); +#endif +#endif /* VK_USE_PLATFORM_MACOS_MVK */ + +#define VK_EXT_sampler_filter_minmax 1 +#define VK_EXT_SAMPLER_FILTER_MINMAX_SPEC_VERSION 1 +#define VK_EXT_SAMPLER_FILTER_MINMAX_EXTENSION_NAME "VK_EXT_sampler_filter_minmax" + + +typedef enum VkSamplerReductionModeEXT { + VK_SAMPLER_REDUCTION_MODE_WEIGHTED_AVERAGE_EXT = 0, + VK_SAMPLER_REDUCTION_MODE_MIN_EXT = 1, + VK_SAMPLER_REDUCTION_MODE_MAX_EXT = 2, + VK_SAMPLER_REDUCTION_MODE_BEGIN_RANGE_EXT = VK_SAMPLER_REDUCTION_MODE_WEIGHTED_AVERAGE_EXT, + VK_SAMPLER_REDUCTION_MODE_END_RANGE_EXT = VK_SAMPLER_REDUCTION_MODE_MAX_EXT, + VK_SAMPLER_REDUCTION_MODE_RANGE_SIZE_EXT = (VK_SAMPLER_REDUCTION_MODE_MAX_EXT - VK_SAMPLER_REDUCTION_MODE_WEIGHTED_AVERAGE_EXT + 1), + VK_SAMPLER_REDUCTION_MODE_MAX_ENUM_EXT = 0x7FFFFFFF +} VkSamplerReductionModeEXT; + +typedef struct VkSamplerReductionModeCreateInfoEXT { + VkStructureType sType; + const void* pNext; + VkSamplerReductionModeEXT reductionMode; +} VkSamplerReductionModeCreateInfoEXT; + +typedef struct VkPhysicalDeviceSamplerFilterMinmaxPropertiesEXT { + VkStructureType sType; + void* pNext; + VkBool32 filterMinmaxSingleComponentFormats; + VkBool32 filterMinmaxImageComponentMapping; +} VkPhysicalDeviceSamplerFilterMinmaxPropertiesEXT; + + + +#define VK_AMD_gpu_shader_int16 1 +#define VK_AMD_GPU_SHADER_INT16_SPEC_VERSION 1 +#define VK_AMD_GPU_SHADER_INT16_EXTENSION_NAME "VK_AMD_gpu_shader_int16" + + +#define VK_AMD_mixed_attachment_samples 1 +#define VK_AMD_MIXED_ATTACHMENT_SAMPLES_SPEC_VERSION 1 +#define VK_AMD_MIXED_ATTACHMENT_SAMPLES_EXTENSION_NAME "VK_AMD_mixed_attachment_samples" + + +#define VK_EXT_shader_stencil_export 1 +#define VK_EXT_SHADER_STENCIL_EXPORT_SPEC_VERSION 1 +#define VK_EXT_SHADER_STENCIL_EXPORT_EXTENSION_NAME "VK_EXT_shader_stencil_export" + + +#define 
VK_EXT_blend_operation_advanced 1 +#define VK_EXT_BLEND_OPERATION_ADVANCED_SPEC_VERSION 2 +#define VK_EXT_BLEND_OPERATION_ADVANCED_EXTENSION_NAME "VK_EXT_blend_operation_advanced" + + +typedef enum VkBlendOverlapEXT { + VK_BLEND_OVERLAP_UNCORRELATED_EXT = 0, + VK_BLEND_OVERLAP_DISJOINT_EXT = 1, + VK_BLEND_OVERLAP_CONJOINT_EXT = 2, + VK_BLEND_OVERLAP_BEGIN_RANGE_EXT = VK_BLEND_OVERLAP_UNCORRELATED_EXT, + VK_BLEND_OVERLAP_END_RANGE_EXT = VK_BLEND_OVERLAP_CONJOINT_EXT, + VK_BLEND_OVERLAP_RANGE_SIZE_EXT = (VK_BLEND_OVERLAP_CONJOINT_EXT - VK_BLEND_OVERLAP_UNCORRELATED_EXT + 1), + VK_BLEND_OVERLAP_MAX_ENUM_EXT = 0x7FFFFFFF +} VkBlendOverlapEXT; + +typedef struct VkPhysicalDeviceBlendOperationAdvancedFeaturesEXT { + VkStructureType sType; + void* pNext; + VkBool32 advancedBlendCoherentOperations; +} VkPhysicalDeviceBlendOperationAdvancedFeaturesEXT; + +typedef struct VkPhysicalDeviceBlendOperationAdvancedPropertiesEXT { + VkStructureType sType; + void* pNext; + uint32_t advancedBlendMaxColorAttachments; + VkBool32 advancedBlendIndependentBlend; + VkBool32 advancedBlendNonPremultipliedSrcColor; + VkBool32 advancedBlendNonPremultipliedDstColor; + VkBool32 advancedBlendCorrelatedOverlap; + VkBool32 advancedBlendAllOperations; +} VkPhysicalDeviceBlendOperationAdvancedPropertiesEXT; + +typedef struct VkPipelineColorBlendAdvancedStateCreateInfoEXT { + VkStructureType sType; + const void* pNext; + VkBool32 srcPremultiplied; + VkBool32 dstPremultiplied; + VkBlendOverlapEXT blendOverlap; +} VkPipelineColorBlendAdvancedStateCreateInfoEXT; + + + +#define VK_NV_fragment_coverage_to_color 1 +#define VK_NV_FRAGMENT_COVERAGE_TO_COLOR_SPEC_VERSION 1 +#define VK_NV_FRAGMENT_COVERAGE_TO_COLOR_EXTENSION_NAME "VK_NV_fragment_coverage_to_color" + +typedef VkFlags VkPipelineCoverageToColorStateCreateFlagsNV; + +typedef struct VkPipelineCoverageToColorStateCreateInfoNV { + VkStructureType sType; + const void* pNext; + VkPipelineCoverageToColorStateCreateFlagsNV flags; + VkBool32 coverageToColorEnable; + uint32_t coverageToColorLocation; +} VkPipelineCoverageToColorStateCreateInfoNV; + + + +#define VK_NV_framebuffer_mixed_samples 1 +#define VK_NV_FRAMEBUFFER_MIXED_SAMPLES_SPEC_VERSION 1 +#define VK_NV_FRAMEBUFFER_MIXED_SAMPLES_EXTENSION_NAME "VK_NV_framebuffer_mixed_samples" + + +typedef enum VkCoverageModulationModeNV { + VK_COVERAGE_MODULATION_MODE_NONE_NV = 0, + VK_COVERAGE_MODULATION_MODE_RGB_NV = 1, + VK_COVERAGE_MODULATION_MODE_ALPHA_NV = 2, + VK_COVERAGE_MODULATION_MODE_RGBA_NV = 3, + VK_COVERAGE_MODULATION_MODE_BEGIN_RANGE_NV = VK_COVERAGE_MODULATION_MODE_NONE_NV, + VK_COVERAGE_MODULATION_MODE_END_RANGE_NV = VK_COVERAGE_MODULATION_MODE_RGBA_NV, + VK_COVERAGE_MODULATION_MODE_RANGE_SIZE_NV = (VK_COVERAGE_MODULATION_MODE_RGBA_NV - VK_COVERAGE_MODULATION_MODE_NONE_NV + 1), + VK_COVERAGE_MODULATION_MODE_MAX_ENUM_NV = 0x7FFFFFFF +} VkCoverageModulationModeNV; + +typedef VkFlags VkPipelineCoverageModulationStateCreateFlagsNV; + +typedef struct VkPipelineCoverageModulationStateCreateInfoNV { + VkStructureType sType; + const void* pNext; + VkPipelineCoverageModulationStateCreateFlagsNV flags; + VkCoverageModulationModeNV coverageModulationMode; + VkBool32 coverageModulationTableEnable; + uint32_t coverageModulationTableCount; + const float* pCoverageModulationTable; +} VkPipelineCoverageModulationStateCreateInfoNV; + + + +#define VK_NV_fill_rectangle 1 +#define VK_NV_FILL_RECTANGLE_SPEC_VERSION 1 +#define VK_NV_FILL_RECTANGLE_EXTENSION_NAME "VK_NV_fill_rectangle" + + +#define VK_EXT_post_depth_coverage 1 +#define 
VK_EXT_POST_DEPTH_COVERAGE_SPEC_VERSION 1 +#define VK_EXT_POST_DEPTH_COVERAGE_EXTENSION_NAME "VK_EXT_post_depth_coverage" + +#define VK_EXT_shader_viewport_index_layer 1 +#define VK_EXT_SHADER_VIEWPORT_INDEX_LAYER_SPEC_VERSION 1 +#define VK_EXT_SHADER_VIEWPORT_INDEX_LAYER_EXTENSION_NAME "VK_EXT_shader_viewport_index_layer" #ifdef __cplusplus From 5375a479aa467028f5f287f4864845b2e7deddd9 Mon Sep 17 00:00:00 2001 From: Guilherme Leobas Date: Wed, 9 Dec 2020 17:10:23 -0800 Subject: [PATCH 093/250] Add type annotations to conv-relu (#47680) Summary: Fixes https://github.com/pytorch/pytorch/issues/47679 Pull Request resolved: https://github.com/pytorch/pytorch/pull/47680 Reviewed By: zhangguanheng66 Differential Revision: D25416628 Pulled By: malfet fbshipit-source-id: 103bea1e8c300990f74689787a71b1cfe916cfef --- mypy.ini | 15 +++------------ torch/_lobpcg.py | 8 ++++---- torch/nn/intrinsic/quantized/modules/conv_relu.py | 6 +++--- 3 files changed, 10 insertions(+), 19 deletions(-) diff --git a/mypy.ini b/mypy.ini index 0b9f5497162c..d5b1ed20e081 100644 --- a/mypy.ini +++ b/mypy.ini @@ -101,15 +101,15 @@ ignore_errors = True [mypy-torch.nn.quantized.modules.conv] ignore_errors = True -[mypy-torch._lobpcg] -ignore_errors = True - [mypy-torch._appdirs] ignore_errors = True [mypy-torch._utils] ignore_errors = True +[mypy-torch._overrides] +ignore_errors = True + [mypy-torch.utils.tensorboard._caffe2_graph] ignore_errors = True @@ -131,15 +131,6 @@ ignore_errors = True [mypy-torch.nn.quantized.modules.batchnorm] ignore_errors = True -[mypy-torch.nn.intrinsic.quantized.modules.conv_relu] -ignore_errors = True - -[mypy-torch.nn.intrinsic.quantized.modules.bn_relu] -ignore_errors = True - -[mypy-torch.nn.intrinsic.quantized.modules.linear_relu] -ignore_errors = True - [mypy-torch.nn.intrinsic.qat.modules.conv_fused] ignore_errors = True diff --git a/torch/_lobpcg.py b/torch/_lobpcg.py index ec0ad81dced0..02b666493f9a 100644 --- a/torch/_lobpcg.py +++ b/torch/_lobpcg.py @@ -262,7 +262,7 @@ def _symeig_backward(D_grad, U_grad, A, D, U, largest): class LOBPCGAutogradFunction(torch.autograd.Function): @staticmethod - def forward(ctx, + def forward(ctx, # type: ignore[override] A: Tensor, k: Optional[int] = None, B: Optional[Tensor] = None, @@ -606,7 +606,7 @@ def _lobpcg(A: Tensor, bparams['ortho_use_drop'] = bparams.get('ortho_use_drop', False) if not torch.jit.is_scripting(): - LOBPCG.call_tracker = LOBPCG_call_tracker + LOBPCG.call_tracker = LOBPCG_call_tracker # type: ignore if len(A.shape) > 2: N = int(torch.prod(torch.tensor(A.shape[:-2]))) @@ -628,7 +628,7 @@ def _lobpcg(A: Tensor, bXret[i] = worker.X[:, :k] if not torch.jit.is_scripting(): - LOBPCG.call_tracker = LOBPCG_call_tracker_orig + LOBPCG.call_tracker = LOBPCG_call_tracker_orig # type: ignore return bE.reshape(A.shape[:-2] + (k,)), bXret.reshape(A.shape[:-2] + (m, k)) @@ -640,7 +640,7 @@ def _lobpcg(A: Tensor, worker.run() if not torch.jit.is_scripting(): - LOBPCG.call_tracker = LOBPCG_call_tracker_orig + LOBPCG.call_tracker = LOBPCG_call_tracker_orig # type: ignore return worker.E[:k], worker.X[:, :k] diff --git a/torch/nn/intrinsic/quantized/modules/conv_relu.py b/torch/nn/intrinsic/quantized/modules/conv_relu.py index 76407062511f..8dd931ff05a8 100644 --- a/torch/nn/intrinsic/quantized/modules/conv_relu.py +++ b/torch/nn/intrinsic/quantized/modules/conv_relu.py @@ -16,7 +16,7 @@ class ConvReLU1d(nnq.Conv1d): Same as torch.nn.quantized.Conv1d """ - _FLOAT_MODULE = torch.nn.intrinsic.ConvReLU1d + _FLOAT_MODULE = 
torch.nn.intrinsic.ConvReLU1d # type: ignore[assignment] def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True, @@ -55,7 +55,7 @@ class ConvReLU2d(nnq.Conv2d): Same as torch.nn.quantized.Conv2d """ - _FLOAT_MODULE = torch.nn.intrinsic.ConvReLU2d + _FLOAT_MODULE = torch.nn.intrinsic.ConvReLU2d # type: ignore[assignment] def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True, @@ -94,7 +94,7 @@ class ConvReLU3d(nnq.Conv3d): Attributes: Same as torch.nn.quantized.Conv3d """ - _FLOAT_MODULE = torch.nn.intrinsic.ConvReLU3d + _FLOAT_MODULE = torch.nn.intrinsic.ConvReLU3d # type: ignore[assignment] def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True, From e69c2f85f6ff14598614fe364b753b01581d723f Mon Sep 17 00:00:00 2001 From: Jonas Haag Date: Wed, 9 Dec 2020 17:42:55 -0800 Subject: [PATCH 094/250] Add version_info tuple (#48414) Summary: Add a `version_info` similar to `sys.version_info` for being able to make version tests. Example generated `version.py`: ``` __version__ = '1.8.0a0' version_info = (1, 8, 0, 'a0') # or version_info = (1, 8, 0, 'a0', 'deadbeef') if you're in a Git checkout debug = False cuda = None git_version = '671ee71ad4b6f507218d1cad278a8e743780b716' hip = None ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/48414 Reviewed By: zhangguanheng66 Differential Revision: D25416620 Pulled By: malfet fbshipit-source-id: 20b561a0c76ac0b16ff92f4bd43f8b724971e444 --- tools/generate_torch_version.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/tools/generate_torch_version.py b/tools/generate_torch_version.py index 8129f38eb0ef..17d0141105cd 100644 --- a/tools/generate_torch_version.py +++ b/tools/generate_torch_version.py @@ -24,7 +24,15 @@ def get_torch_version(sha=None): version += '.post' + str(build_number) elif sha != 'Unknown': version += '+' + sha[:7] - return version + + first_non_numeric = min(i for i, c in enumerate(version) if c not in "0123456789.") + version_suffix = version[first_non_numeric:] + version_info = tuple( + [int(part) for part in version[:first_non_numeric].split(".")] + + version_suffix.split("+") + ) + + return version, version_info if __name__ == "__main__": parser = argparse.ArgumentParser(description="Generate torch/version.py from build and environment metadata.") @@ -41,10 +49,11 @@ def get_torch_version(sha=None): pytorch_root = Path(__file__).parent.parent version_path = pytorch_root / "torch" / "version.py" sha = get_sha() - version = get_torch_version(sha) + version, version_info = get_torch_version(sha) with open(version_path, 'w') as f: f.write("__version__ = '{}'\n".format(version)) + f.write("version_info = {}\n".format(version_info)) # NB: This is not 100% accurate, because you could have built the # library code with DEBUG, but csrc without DEBUG (in which case # this would claim to be a release build when it's not.) From 3123f878dd848e56f9a54057a753c8e00c2703da Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 9 Dec 2020 17:43:51 -0800 Subject: [PATCH 095/250] [PyTorch] Avoid storage refcount bump in copy_tensor_metadata (#48877) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48877 Setting `Storage` in the TensorImpl ctor only to set it again in `copy_tensor_metadata` wastes one refcount bump. ghstack-source-id: 117937872 Test Plan: internal benchmark. 
compared results with perf, saw 0.15% reduction in percent of total time spent in `TensorImpl::shallow_copy_and_detach`. Reviewed By: bhosmer Differential Revision: D25353529 fbshipit-source-id: e85d3a139ccd44cbd059c14edb19b22b962881a9 --- c10/core/TensorImpl.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/c10/core/TensorImpl.cpp b/c10/core/TensorImpl.cpp index a9e8f1f6853f..e305f352d7cb 100644 --- a/c10/core/TensorImpl.cpp +++ b/c10/core/TensorImpl.cpp @@ -293,7 +293,8 @@ c10::intrusive_ptr TensorImpl::shallow_copy_and_detach( const c10::VariableVersion& version_counter, bool allow_tensor_metadata_change) const { auto impl = c10::make_intrusive( - Storage(storage()), key_set_, data_type_); + // No need to populate Storage; copy_tensor_metadata will do it for us. + key_set_, data_type_, device_opt_); copy_tensor_metadata( /*src_impl=*/this, /*dest_impl=*/impl.get(), @@ -308,7 +309,8 @@ c10::intrusive_ptr TensorImpl::shallow_copy_and_detach( c10::VariableVersion&& version_counter, bool allow_tensor_metadata_change) const { auto impl = c10::make_intrusive( - Storage(storage()), key_set_, data_type_); + // No need to populate Storage; copy_tensor_metadata will do it for us. + key_set_, data_type_, device_opt_); copy_tensor_metadata( /*src_impl=*/this, /*dest_impl=*/impl.get(), From 7a2abbd8fdcbb51fba0fd420f2861a4b56adb45b Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Wed, 9 Dec 2020 18:39:34 -0800 Subject: [PATCH 096/250] Revert D25416620: [pytorch][PR] Add version_info tuple Test Plan: revert-hammer Differential Revision: D25416620 (https://github.com/pytorch/pytorch/commit/e69c2f85f6ff14598614fe364b753b01581d723f) Original commit changeset: 20b561a0c76a fbshipit-source-id: 4d73c7ed9191137d5be92236c18c312ce25a1471 --- tools/generate_torch_version.py | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/tools/generate_torch_version.py b/tools/generate_torch_version.py index 17d0141105cd..8129f38eb0ef 100644 --- a/tools/generate_torch_version.py +++ b/tools/generate_torch_version.py @@ -24,15 +24,7 @@ def get_torch_version(sha=None): version += '.post' + str(build_number) elif sha != 'Unknown': version += '+' + sha[:7] - - first_non_numeric = min(i for i, c in enumerate(version) if c not in "0123456789.") - version_suffix = version[first_non_numeric:] - version_info = tuple( - [int(part) for part in version[:first_non_numeric].split(".")] - + version_suffix.split("+") - ) - - return version, version_info + return version if __name__ == "__main__": parser = argparse.ArgumentParser(description="Generate torch/version.py from build and environment metadata.") @@ -49,11 +41,10 @@ def get_torch_version(sha=None): pytorch_root = Path(__file__).parent.parent version_path = pytorch_root / "torch" / "version.py" sha = get_sha() - version, version_info = get_torch_version(sha) + version = get_torch_version(sha) with open(version_path, 'w') as f: f.write("__version__ = '{}'\n".format(version)) - f.write("version_info = {}\n".format(version_info)) # NB: This is not 100% accurate, because you could have built the # library code with DEBUG, but csrc without DEBUG (in which case # this would claim to be a release build when it's not.) From c5bc6b40abc1a8875e21391961066a5f5d1e5054 Mon Sep 17 00:00:00 2001 From: Nick Gibson Date: Wed, 9 Dec 2020 18:42:30 -0800 Subject: [PATCH 097/250] [NNC] Dead Store Elimination (#49030) Summary: Adds a new optimization method to LoopNest which eliminates stores that do not contribute to any output. 
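A condensed sketch of the intended use, adapted from the new test added below (includes, namespaces, and the usual KernelScope/test boilerplate are elided, and the buffer sizes are illustrative only):

```cpp
// f is a kernel output; g is written but never read by any output.
KernelScope kernel_scope;
VarHandle x("x", kInt);
VarHandle z("z", kInt);
BufHandle f("f", {10}, kFloat);
BufHandle g("g", {10}, kFloat);
For* writes_f = For::make(x, 0, 10, Store::make(f, {x}, x));
For* writes_g = For::make(z, 0, 10, Store::make(g, {z}, z + 1));
Stmt* stmt = Block::make({writes_f, writes_g});

// Only f is registered as an output of the LoopNest.
LoopNest loop(stmt, {f.node()}, {}, {});
loop.eliminateDeadStores();
// After the pass, the loop writing g is removed; the loop writing f remains.
```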
It's unlikely any of the lowerings of aten operators produce these stores yet, but this creates some wiggle room for transformations in the future. Pull Request resolved: https://github.com/pytorch/pytorch/pull/49030 Reviewed By: tugsbayasgalan Differential Revision: D25434538 Pulled By: nickgg fbshipit-source-id: fa1ead82e6f7440cc783c6116b23d0b7a5b5db4b --- test/cpp/tensorexpr/test_loopnest.cpp | 108 +++++++++++++++++++++++++ torch/csrc/jit/tensorexpr/loopnest.cpp | 57 +++++++++++++ torch/csrc/jit/tensorexpr/loopnest.h | 1 + 3 files changed, 166 insertions(+) diff --git a/test/cpp/tensorexpr/test_loopnest.cpp b/test/cpp/tensorexpr/test_loopnest.cpp index cc43c41bc180..19b466dfb604 100644 --- a/test/cpp/tensorexpr/test_loopnest.cpp +++ b/test/cpp/tensorexpr/test_loopnest.cpp @@ -3541,5 +3541,113 @@ TEST(LoopNest, CacheWritesSimple) { assertAllEqual(c_data, c_ref); } +TEST(LoopNest, DeadStoreElimination) { + KernelScope kernel_scope; + VarHandle y("y", kInt); + VarHandle x("x_tail", kInt); + BufHandle f("f", {26, 5}, kFloat); + BufHandle g("g", {26, 5}, kFloat); + ExprHandle x_outer_end = 5; + ExprHandle x_2 = x + x_outer_end * 4; + For* stmt1 = For::make( + x, + 0, + 5, + For::make( + y, + 0, + 5, + Block::make({ + Store::make(f, {x_2, y}, (x_2 + y), 1), + Store::make(g, {x_2, y}, (x_2 * y), 1), + }))); + Stmt* stmt = Block::make({stmt1}); + + // Will eliminate if not used by an output. + LoopNest loop(stmt, {f.node()}, {}, {}); + loop.eliminateDeadStores(); + + std::ostringstream oss; + oss << *loop.root_stmt(); + + const std::string& expected_ir = + R"IR( +#CHECK: f[x_tail + 5 * 4, y] +#CHECK-NOT: g[x_tail + 5 * 4, y] + )IR"; + torch::jit::testing::FileCheck().run(expected_ir, oss.str()); + + // But won't eliminate if used by different outputs. + LoopNest loop2(stmt, {f.node(), g.node()}, {}, {}); + loop2.eliminateDeadStores(); + + oss.clear(); + oss << *loop2.root_stmt(); + + const std::string& expected_ir2 = + R"IR( +#CHECK: f[x_tail + 5 * 4, y] +#CHECK: g[x_tail + 5 * 4, y] + )IR"; + torch::jit::testing::FileCheck().run(expected_ir2, oss.str()); +} + +TEST(LoopNest, DeadStoreEliminationWithIntermediates) { + KernelScope kernel_scope; + VarHandle x("x", kInt); + VarHandle y("y", kInt); + VarHandle z("z", kInt); + BufHandle f("f", {26 * 5}, kFloat); + BufHandle g("g", {26 * 5}, kFloat); + BufHandle h("h", {26, 5}, kFloat); + ExprHandle x_outer_end = 5; + ExprHandle x_2 = x + x_outer_end * 4; + For* stmt1 = For::make(x, 0, 26 * 5, Store::make(f, {x}, x)); + For* stmt2 = For::make(z, 0, 26 * 5, Store::make(g, {z}, z + 1)); + For* stmt3 = For::make( + x, + 0, + 5, + For::make( + y, + 0, + 5, + Block::make({ + Store::make(h, {x, y}, Load::make(f, {x * y}, 1), 1), + }))); + Stmt* stmt = Block::make({stmt1, stmt2, stmt3}); + + // Will eliminate the write to g, but not f since it used by the producer of + // h. + LoopNest loop(stmt, {h.node()}, {}, {}); + loop.eliminateDeadStores(); + + std::ostringstream oss; + oss << *loop.root_stmt(); + + const std::string& expected_ir = + R"IR( + #CHECK: f[x] = x; + #CHECK-NOT: g[z] = + #CHECK: h[x, y] = f[x * y]; + )IR"; + torch::jit::testing::FileCheck().run(expected_ir, oss.str()); + + // Sanity check won't eliminate if g is an output. 
+ LoopNest loop2(stmt, {h.node(), g.node()}, {}, {}); + loop2.eliminateDeadStores(); + + oss.clear(); + oss << *loop2.root_stmt(); + + const std::string& expected_ir2 = + R"IR( + #CHECK: f[x] = x; + #CHECK: g[z] = z + 1; + #CHECK: h[x, y] = f[x * y]; + )IR"; + torch::jit::testing::FileCheck().run(expected_ir2, oss.str()); +} + } // namespace jit } // namespace torch diff --git a/torch/csrc/jit/tensorexpr/loopnest.cpp b/torch/csrc/jit/tensorexpr/loopnest.cpp index c9e17c9fc896..96df28625bec 100644 --- a/torch/csrc/jit/tensorexpr/loopnest.cpp +++ b/torch/csrc/jit/tensorexpr/loopnest.cpp @@ -868,6 +868,63 @@ Stmt* LoopNest::insertAllocFree(Stmt* stmt) { return b; } +class StmtDeleter : public IRMutator { + public: + StmtDeleter(const std::unordered_set& targets) + : targets_(targets) {} + + private: + Stmt* mutate(const Block* v) override { + std::vector stmts; + + for (auto* s : v->stmts()) { + if (targets_.count(s) == 0) { + Stmt* ns = s->accept_mutator(this); + if (ns) { + stmts.push_back(Stmt::clone(ns)); + } + } + } + + return Block::make(stmts); + } + + const std::unordered_set& targets_; +}; + +void LoopNest::eliminateDeadStores() { + using namespace analysis; + MemDependencyChecker checker(getInputBufs(), getOutputBufs()); + root_stmt_->accept(&checker); + + std::unordered_set deadStores; + std::vector> outputAccesses; + for (auto* o : getOutputBufs()) { + outputAccesses.push_back(checker.output(o)); + } + + for (auto& info : checker.getHistory()) { + if (!info->isWrite()) { + continue; + } + bool found = false; + + for (auto& output : outputAccesses) { + if (checker.dependsIndirectly(output, info)) { + found = true; + break; + } + } + + if (!found) { + deadStores.insert(info->stmt()); + } + } + + StmtDeleter deleter(deadStores); + root_stmt_ = root_stmt_->accept_mutator(&deleter); +} + void LoopNest::prepareForCodegen() { // Expand reduction ops. ReductionExpander reduceExpander; diff --git a/torch/csrc/jit/tensorexpr/loopnest.h b/torch/csrc/jit/tensorexpr/loopnest.h index af0f28884f5a..540b7fa889a9 100644 --- a/torch/csrc/jit/tensorexpr/loopnest.h +++ b/torch/csrc/jit/tensorexpr/loopnest.h @@ -107,6 +107,7 @@ class TORCH_API LoopNest { For* f, const std::unordered_map& map); + void eliminateDeadStores(); void prepareForCodegen(); // Find the inner-most loops and vectorize them. 
Currently, this only works From 9417e92722f1b6f75e162cbe661d3ed788d8d37a Mon Sep 17 00:00:00 2001 From: Xiaohan Wei Date: Wed, 9 Dec 2020 19:12:17 -0800 Subject: [PATCH 098/250] op to gen quant params from min-max thresholds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: Adding support to gen qparams to quantize a tensor from min and max thresholds of a tensor Test Plan: ``` buck test mode/opt caffe2/caffe2/quantization/server:int8_gen_quant_params_min_max_test ``` ``` Started reporting to test run: https://our.intern.facebook.com/intern/testinfra/testrun/5629499573509506 ✓ ListingSuccess: caffe2/caffe2/quantization/server:int8_gen_quant_params_min_max_test - main (2.522) ✓ Pass: caffe2/caffe2/quantization/server:int8_gen_quant_params_min_max_test - test_int8_gen_quant_params_min_max_op (caffe2.caffe2.quantization.server.int8_gen_quant_params_min_max_test.TestInt8GenQuantParamsMinMaxOperator) (1.977) Summary Pass: 1 ListingSuccess: 1 ``` Reviewed By: hx89 Differential Revision: D24485985 fbshipit-source-id: 18dee193f7895295d85d31dc013570e5d5d97357 --- .../server/int8_gen_quant_params_min_max.cc | 37 +++++++++ .../server/int8_gen_quant_params_min_max.h | 50 +++++++++++ .../int8_gen_quant_params_min_max_test.py | 83 +++++++++++++++++++ 3 files changed, 170 insertions(+) create mode 100644 caffe2/quantization/server/int8_gen_quant_params_min_max.cc create mode 100644 caffe2/quantization/server/int8_gen_quant_params_min_max.h create mode 100644 caffe2/quantization/server/int8_gen_quant_params_min_max_test.py diff --git a/caffe2/quantization/server/int8_gen_quant_params_min_max.cc b/caffe2/quantization/server/int8_gen_quant_params_min_max.cc new file mode 100644 index 000000000000..76a2bb747242 --- /dev/null +++ b/caffe2/quantization/server/int8_gen_quant_params_min_max.cc @@ -0,0 +1,37 @@ +// Copyright 2004-present Facebook. All Rights Reserved. + +#include "caffe2/quantization/server/int8_gen_quant_params_min_max.h" +#include +#include "caffe2/quantization/server/int8_gen_quant_params.h" + +namespace caffe2 { +using namespace std; +using namespace dnnlowp; + +REGISTER_CPU_OPERATOR( + Int8GenQuantParamsMinMax, + Int8GenQuantParamsMinMaxOp); +OPERATOR_SCHEMA(Int8GenQuantParamsMinMax) + .NumInputs(2, 3) + .NumOutputs(1) + .TensorInferenceFunction([](const OperatorDef& /* def */, + const vector& /* in */) { + vector out(1); + out[0].set_data_type(TensorProto_DataType_FLOAT); + out[0].add_dims(1); + return out; + }) + .Input(0, "min", "The lower bound of the tensor to be quantized.") + .Input(1, "max", "The upper bound of the tensor to be quantized.") + .Input( + 2, + "quant_scheme", + "(Optional) Int8QuantSchemeBlob that specifies the quantization kind and preserve_sparsity options when generating the quant params. We only use preserve_sparsity in this op which is default to be false.") + .Output( + 0, + "quant_param", + "Int8QuantParamsBlob that contains the scale and zero_point info in TensorQuantizationParams type.") + .SetDoc( + R"DOC(Operator wrapper for generating int8 tensor quantization parameters given lower and upper bound of the input tensor)DOC"); + +} // namespace caffe2 diff --git a/caffe2/quantization/server/int8_gen_quant_params_min_max.h b/caffe2/quantization/server/int8_gen_quant_params_min_max.h new file mode 100644 index 000000000000..ada6a46a8dec --- /dev/null +++ b/caffe2/quantization/server/int8_gen_quant_params_min_max.h @@ -0,0 +1,50 @@ +// Copyright 2004-present Facebook. All Rights Reserved. 
+ +#pragma once +#include "caffe2/quantization/server/caffe2_dnnlowp_utils.h" +#include "caffe2/quantization/server/dnnlowp.h" +#include "caffe2/quantization/server/int8_gen_quant_params.h" +#include + + +namespace caffe2 { +using namespace std; +using dnnlowp::TensorQuantizationParams; + +template +class Int8GenQuantParamsMinMaxOp final : public Operator { + public: + USE_OPERATOR_CONTEXT_FUNCTIONS; + Int8GenQuantParamsMinMaxOp(const OperatorDef& operator_def, Workspace* ws) + : Operator(operator_def, ws) {} + bool RunOnDevice() override { + // Generate Int8 quant params based on the input data (last N samples of the + // activations) and the quant scheme + const float min = + OperatorBase::Input(0, CPU).template data()[0]; + const float max = + OperatorBase::Input(1, CPU).template data()[0]; + bool preserve_sparsity = false; + if (InputSize() == 3){ + const auto* quant_scheme = + this->template Input>(2).get(); + preserve_sparsity = quant_scheme->preserve_sparsity_; + } + dnnlowp::QuantizationFactory* qfactory = + dnnlowp::QuantizationFactory::GetDefaultInstance(); + TensorQuantizationParams qparam = qfactory->ChooseQuantizationParams( + min, + max, + 8, + preserve_sparsity); + auto* output_qparam = + this->template Output>(0); + output_qparam->reset( + new Int8QuantParamsBlob(qparam.scale, qparam.zero_point)); + LOG_EVERY_N(INFO, 1) << "scale and bias are " << qparam.scale << "," << qparam.zero_point; + return true; + } + +}; // class Int8GenQuantParamsOp + +} // namespace caffe2 diff --git a/caffe2/quantization/server/int8_gen_quant_params_min_max_test.py b/caffe2/quantization/server/int8_gen_quant_params_min_max_test.py new file mode 100644 index 000000000000..dd27074db5c4 --- /dev/null +++ b/caffe2/quantization/server/int8_gen_quant_params_min_max_test.py @@ -0,0 +1,83 @@ +# Copyright (c) 2016-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+############################################################################## + + + +import caffe2.python.hypothesis_test_util as hu +import hypothesis.strategies as st +import numpy as np +from caffe2.python import core, workspace +from caffe2.quantization.server import dnnlowp_pybind11 +from hypothesis import given, settings + + +class TestInt8GenQuantParamsMinMaxOperator(hu.HypothesisTestCase): + @settings(max_examples=20, deadline=None) + @given( + n=st.integers(10, 10), + m=st.integers(10, 10), + preserve_sparsity=st.booleans(), + rnd_seed=st.integers(1, 5), + **hu.gcs_cpu_only + ) + def test_int8_gen_quant_params_min_max_op( + self, n, m, preserve_sparsity, rnd_seed, gc, dc + ): + X_min = 0 if preserve_sparsity else -77 + X_max = X_min + 255 + np.random.seed(rnd_seed) + X = np.round(np.random.rand(n, m) * (X_max - X_min) + X_min).astype( + np.float32 + ) + # Calculate X_qparam + hist, bin_edges = np.histogram(X.flatten(), bins=2048) + X_qparam = dnnlowp_pybind11.ChooseStaticQuantizationParams( + np.min(X), np.max(X), hist, preserve_sparsity, 8, "MIN_MAX_QUANTIZATION" + ) + + # Build a net to generate X's qparam using the Int8GenQuantParamsMinMax op + workspace.FeedBlob("X", X, device_option=gc) + workspace.FeedBlob("X_min", np.array([np.min(X)]), device_option=gc) + workspace.FeedBlob("X_max", np.array([np.max(X)]), device_option=gc) + dnnlowp_pybind11.CreateInt8QuantSchemeBlob( + "quant_scheme", "MIN_MAX_QUANTIZATION", preserve_sparsity + ) + assert workspace.HasBlob( + "quant_scheme" + ), "Failed to create the quant_scheme blob in current workspace" + + gen_quant_params_net = core.Net("gen_quant_params_min_max") + gen_quant_params_op = core.CreateOperator( + "Int8GenQuantParamsMinMax", + ["X_min", "X_max", "quant_scheme"], + ["quant_param"], + device_option=gc, + ) + gen_quant_params_net.Proto().op.extend([gen_quant_params_op]) + assert workspace.RunNetOnce( + gen_quant_params_net + ), "Failed to run the gen_quant_params net" + scale, zero_point = dnnlowp_pybind11.ObserveInt8QuantParamsBlob("quant_param") + + shapes, types = workspace.InferShapesAndTypes( + [gen_quant_params_net], + blob_dimensions={"X": [n, m], "X_min": [1], "X_max": [1], "quant_scheme": [1]}, + blob_types={"X": core.DataType.FLOAT, "X_min": core.DataType.FLOAT, "X_max": core.DataType.FLOAT, "quant_scheme": core.DataType.STRING} + ) + self.assertEqual(shapes["quant_param"], [1]) + self.assertEqual(types["quant_param"], core.DataType.FLOAT) + + np.testing.assert_equal(scale, X_qparam.scale) + np.testing.assert_equal(zero_point, X_qparam.zero_point) From 95233870f284565df602471bf09b8aad565540b8 Mon Sep 17 00:00:00 2001 From: Xiong Zhang Date: Wed, 9 Dec 2020 22:50:03 -0800 Subject: [PATCH 099/250] [PyTorch Mobile] Preserve bundled input related methods when calling optimize_for_mobile Summary: Added an extra step to **always** preserve the bundled inputs methods if they are present in the input module. Also added a check to see if all the methods in the `preseved_methods` exist. If not, we will now throw an exception. This can hopefully stop hard-to-debug inputs from getting into downstream functions. ~~Add an optional argument `preserve_bundled_inputs_methods=False` to the `optimize_for_mobile` function. 
If set to be True, the function will now add three additional functions related with bundled inputs to be preserved: `get_all_bundled_inputs`, `get_num_bundled_inputs` and `run_on_bundled_input`.~~ Test Plan: `buck test mode/dev //caffe2/test:mobile -- 'test_preserve_bundled_inputs_methods \(test_mobile_optimizer\.TestOptimizer\)'` or `buck test caffe2/test:mobile` to run some other related tests as well. Reviewed By: dhruvbird Differential Revision: D25433268 fbshipit-source-id: 0bf9b4afe64b79ed1684a3db4c0baea40ed3cdd5 --- test/test_mobile_optimizer.py | 64 +++++++++++++++++++++++++++++++++ torch/utils/mobile_optimizer.py | 12 +++++++ 2 files changed, 76 insertions(+) diff --git a/test/test_mobile_optimizer.py b/test/test_mobile_optimizer.py index 0af74eabdf2b..9bfe5465a458 100644 --- a/test/test_mobile_optimizer.py +++ b/test/test_mobile_optimizer.py @@ -8,6 +8,7 @@ from torch.nn import functional as F from torch._C import MobileOptimizerType from torch.testing._internal.common_quantized import override_quantized_engine +from torch.nn.modules.module import ModuleAttributeError FileCheck = torch._C.FileCheck @@ -268,6 +269,69 @@ def get_lint_count_by_type(lint_type, module_lint_List): bi_module_lint_list = generate_mobile_module_lints(bi_module) self.assertEqual(len(bi_module_lint_list), 0) + def test_preserve_bundled_inputs_methods(self): + class MyBundledInputModule(torch.nn.Module): + def __init__(self): + super(MyBundledInputModule, self).__init__() + + def forward(self, inputs): + return inputs + + class MyIncompleteBundledInputModule(torch.nn.Module): + def __init__(self): + super(MyIncompleteBundledInputModule, self).__init__() + + def forward(self, inputs): + return inputs + + @torch.jit.export + def get_all_bundled_inputs(self): + pass + + bi_module = torch.jit.script(MyBundledInputModule()) + module_optim_bi_not_preserved = optimize_for_mobile(bi_module) + + # Expected to be False since no bundled inputs methods were added + self.assertFalse( + hasattr(module_optim_bi_not_preserved, 'get_all_bundled_inputs') or + hasattr(module_optim_bi_not_preserved, 'get_num_bundled_inputs') or + hasattr(module_optim_bi_not_preserved, 'run_on_bundled_input') + ) + + # We expect an exception here + with self.assertRaises(ModuleAttributeError): + module_optim_bi_not_preserved.run_on_bundled_input(0) + + # Add bundled inputs methods to the module + torch.utils.bundled_inputs.augment_model_with_bundled_inputs( + bi_module, [(torch.tensor([1]),)], []) + # Now they should be preserved + module_optim_bi_preserved = optimize_for_mobile(bi_module) + + # All of the bundled inputs methods were preserved + self.assertTrue( + hasattr(module_optim_bi_preserved, 'get_all_bundled_inputs') and + hasattr(module_optim_bi_preserved, 'get_num_bundled_inputs') and + hasattr(module_optim_bi_preserved, 'run_on_bundled_input') + ) + + # We do not expect an exception here + module_optim_bi_preserved.run_on_bundled_input(0) + + bundled_input = module_optim_bi_preserved.get_all_bundled_inputs()[0] + module_optim_bi_preserved(*bundled_input) + + # If not all 3 bundled inputs methods are present in the module, + # we will not try to preserve them unless specified by the user. + incomplete_bi_module = torch.jit.script(MyIncompleteBundledInputModule()) + incomplete_bi_module_optim = optimize_for_mobile(incomplete_bi_module) + self.assertFalse(hasattr(incomplete_bi_module_optim, 'get_all_bundled_inputs')) + + # Specifically preserve get_all_bundled_inputs even if it's the only one + # bundled inputs method available. 
+ incomplete_bi_module_optim = optimize_for_mobile(incomplete_bi_module, preserved_methods=['get_all_bundled_inputs']) + self.assertTrue(hasattr(incomplete_bi_module_optim, 'get_all_bundled_inputs')) + @unittest.skipUnless(torch.backends.xnnpack.enabled, " XNNPACK must be enabled for these tests." " Please build with USE_XNNPACK=1.") diff --git a/torch/utils/mobile_optimizer.py b/torch/utils/mobile_optimizer.py index a9bbbfb9e6ac..8029084a77a7 100644 --- a/torch/utils/mobile_optimizer.py +++ b/torch/utils/mobile_optimizer.py @@ -39,6 +39,18 @@ def optimize_for_mobile( if preserved_methods is None: preserved_methods = [] + bundled_inputs_methods = ['get_all_bundled_inputs', 'get_num_bundled_inputs', 'run_on_bundled_input'] + if all([hasattr(script_module, method) for method in bundled_inputs_methods]): + preserved_methods = list(set(preserved_methods + bundled_inputs_methods)) + + non_exist_methods = [] + for method in preserved_methods: + if not hasattr(script_module, method): + non_exist_methods.append(method) + if non_exist_methods: + raise AttributeError( + 'The following methods to preserve do not exist in script_module: {}'.format(', '.join(non_exist_methods))) + backend = backend.lower() if backend == 'cpu': optimized_cpp_module = torch._C._jit_pass_optimize_for_mobile(script_module._c, optimization_blocklist, preserved_methods) From 27f7d1c2865355c694fb964609df45974748615b Mon Sep 17 00:00:00 2001 From: Kurt Mohler Date: Wed, 9 Dec 2020 23:26:06 -0800 Subject: [PATCH 100/250] Port `eig` CPU from TH to ATen (#43215) Summary: Also consolidates shared logic between `eig` CPU and CUDA implementations Fixes https://github.com/pytorch/pytorch/issues/24693 Pull Request resolved: https://github.com/pytorch/pytorch/pull/43215 Reviewed By: VitalyFedyunin, zhangguanheng66 Differential Revision: D23862622 Pulled By: ngimel fbshipit-source-id: ca1002428850520cd74cd5b7ed8cb4d12dbd9c52 --- aten/src/ATen/LegacyTHFunctionsCPU.cpp | 47 ----------- aten/src/ATen/LegacyTHFunctionsCPU.h | 2 - aten/src/ATen/native/BatchLinearAlgebra.cpp | 53 ++++++++++++ aten/src/ATen/native/BatchLinearAlgebra.h | 24 ++++++ .../ATen/native/BatchLinearAlgebraKernel.cpp | 77 +++++++++++++++++ .../ATen/native/cuda/BatchLinearAlgebra.cu | 47 +---------- aten/src/ATen/native/native_functions.yaml | 6 +- aten/src/TH/generic/THLapack.cpp | 17 ---- aten/src/TH/generic/THLapack.h | 2 - aten/src/TH/generic/THTensorLapack.cpp | 82 ------------------- aten/src/TH/generic/THTensorLapack.h | 1 - aten/src/THC/generic/THCTensorMathMagma.h | 1 - test/test_linalg.py | 16 ++-- test/test_torch.py | 2 +- 14 files changed, 169 insertions(+), 208 deletions(-) create mode 100644 aten/src/ATen/native/BatchLinearAlgebra.h create mode 100644 aten/src/ATen/native/BatchLinearAlgebraKernel.cpp diff --git a/aten/src/ATen/LegacyTHFunctionsCPU.cpp b/aten/src/ATen/LegacyTHFunctionsCPU.cpp index 5fcb5ede9cc5..a9198f7b2548 100644 --- a/aten/src/ATen/LegacyTHFunctionsCPU.cpp +++ b/aten/src/ATen/LegacyTHFunctionsCPU.cpp @@ -832,53 +832,6 @@ std::tuple _th_gels(const Tensor & self, const Tensor & A) { } return std::tuple(res1, res2); } -std::tuple _th_eig_out(Tensor & res1, Tensor & res2, const Tensor & self, bool eigenvectors) { - // DeviceGuard omitted - auto dispatch_scalar_type = infer_scalar_type(self); - - switch (dispatch_scalar_type) { - case ScalarType::Double: { - auto res1_ = checked_dense_tensor_unwrap(res1, "res1", 0, "_th_eig_out", false, DeviceType::CPU, dispatch_scalar_type); - auto res2_ = checked_dense_tensor_unwrap(res2, "res2", 0, 
"_th_eig_out", false, DeviceType::CPU, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_eig_out", false, DeviceType::CPU, dispatch_scalar_type); - THDoubleTensor_geev(res1_, res2_, self_, eigenvectors); - break; - } - case ScalarType::Float: { - auto res1_ = checked_dense_tensor_unwrap(res1, "res1", 0, "_th_eig_out", false, DeviceType::CPU, dispatch_scalar_type); - auto res2_ = checked_dense_tensor_unwrap(res2, "res2", 0, "_th_eig_out", false, DeviceType::CPU, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_eig_out", false, DeviceType::CPU, dispatch_scalar_type); - THFloatTensor_geev(res1_, res2_, self_, eigenvectors); - break; - } - default: - AT_ERROR("_th_eig_out not supported on CPUType for ", dispatch_scalar_type); - } - return std::tuple(res1, res2); -} -std::tuple _th_eig(const Tensor & self, bool eigenvectors) { - // DeviceGuard omitted - auto dispatch_scalar_type = infer_scalar_type(self); - auto res1_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CPU, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); - auto res1 = Tensor(c10::intrusive_ptr::reclaim(res1_)); - auto res2_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CPU, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); - auto res2 = Tensor(c10::intrusive_ptr::reclaim(res2_)); - switch (dispatch_scalar_type) { - case ScalarType::Double: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_eig", false, DeviceType::CPU, dispatch_scalar_type); - THDoubleTensor_geev(res1_, res2_, self_, eigenvectors); - break; - } - case ScalarType::Float: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_eig", false, DeviceType::CPU, dispatch_scalar_type); - THFloatTensor_geev(res1_, res2_, self_, eigenvectors); - break; - } - default: - AT_ERROR("_th_eig not supported on CPUType for ", dispatch_scalar_type); - } - return std::tuple(res1, res2); -} Tensor & _th_potri_out(Tensor & output, const Tensor & self, bool upper) { // DeviceGuard omitted auto dispatch_scalar_type = infer_scalar_type(self); diff --git a/aten/src/ATen/LegacyTHFunctionsCPU.h b/aten/src/ATen/LegacyTHFunctionsCPU.h index 1aca02539311..6e02db8075e2 100644 --- a/aten/src/ATen/LegacyTHFunctionsCPU.h +++ b/aten/src/ATen/LegacyTHFunctionsCPU.h @@ -38,8 +38,6 @@ Tensor & _th_histc_out(Tensor & result, const Tensor & self, int64_t bins, Scala Tensor _th_histc(const Tensor & self, int64_t bins, Scalar min, Scalar max); std::tuple _th_gels_out(Tensor & res1, Tensor & res2, const Tensor & self, const Tensor & A); std::tuple _th_gels(const Tensor & self, const Tensor & A); -std::tuple _th_eig_out(Tensor & res1, Tensor & res2, const Tensor & self, bool eigenvectors); -std::tuple _th_eig(const Tensor & self, bool eigenvectors); Tensor & _th_potri_out(Tensor & output, const Tensor & self, bool upper); Tensor _th_potri(const Tensor & self, bool upper); std::tuple _th_geqrf_out(Tensor & res1, Tensor & res2, const Tensor & self); diff --git a/aten/src/ATen/native/BatchLinearAlgebra.cpp b/aten/src/ATen/native/BatchLinearAlgebra.cpp index 9cc040b4dc8f..f8191c633d8b 100644 --- a/aten/src/ATen/native/BatchLinearAlgebra.cpp +++ b/aten/src/ATen/native/BatchLinearAlgebra.cpp @@ -4,6 +4,7 @@ #include #include +#include #include #include #include @@ -78,6 +79,10 @@ extern "C" void cheevd_(char *jobz, char *uplo, int *n, std::complex *a, extern "C" void dsyevd_(char *jobz, char 
*uplo, int *n, double *a, int *lda, double *w, double *work, int *lwork, int *iwork, int *liwork, int *info); extern "C" void ssyevd_(char *jobz, char *uplo, int *n, float *a, int *lda, float *w, float *work, int *lwork, int *iwork, int *liwork, int *info); +// geev +extern "C" void dgeev_(char *jobvl, char *jobvr, int *n, double *a, int *lda, double *wr, double *wi, double* vl, int *ldvl, double *vr, int *ldvr, double *work, int *lwork, int *info); +extern "C" void sgeev_(char *jobvl, char *jobvr, int *n, float *a, int *lda, float *wr, float *wi, float* vl, int *ldvl, float *vr, int *ldvr, float *work, int *lwork, int *info); + // gesdd extern "C" void zgesdd_(char *jobz, int *m, int *n, std::complex *a, int *lda, double *s, std::complex *u, int *ldu, std::complex *vt, int *ldvt, std::complex *work, int *lwork, double *rwork, int *iwork, int *info); @@ -305,6 +310,14 @@ template<> void lapackSyevd(char jobz, char uplo, int n, float *a, int ld ssyevd_(&jobz, &uplo, &n, a, &lda, w, work, &lwork, iwork, &liwork, info); } +template<> void lapackEig(char jobvl, char jobvr, int n, double *a, int lda, double *wr, double *wi, double* vl, int ldvl, double *vr, int ldvr, double *work, int lwork, int *info) { + dgeev_(&jobvl, &jobvr, &n, a, &lda, wr, wi, vl, &ldvl, vr, &ldvr, work, &lwork, info); +} + +template<> void lapackEig(char jobvl, char jobvr, int n, float *a, int lda, float *wr, float *wi, float* vl, int ldvl, float *vr, int ldvr, float *work, int lwork, int *info) { + sgeev_(&jobvl, &jobvr, &n, a, &lda, wr, wi, vl, &ldvl, vr, &ldvr, work, &lwork, info); +} + template<> void lapackSvd, double>(char jobz, int m, int n, c10::complex *a, int lda, double *s, c10::complex *u, int ldu, c10::complex *vt, int ldvt, c10::complex *work, int lwork, double *rwork, int *iwork, int *info) { zgesdd_(&jobz, &m, &n, reinterpret_cast*>(a), &lda, s, reinterpret_cast*>(u), &ldu, @@ -1155,6 +1168,46 @@ std::tuple symeig_out(Tensor& vals, Tensor& vecs, const Tensor return std::tuple(vals, vecs); } +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ eig ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +DEFINE_DISPATCH(eig_stub); + +std::tuple eig_out(Tensor& e, Tensor& v, const Tensor& self, bool eigenvectors) { + TORCH_CHECK(self.dim() == 2, "input should be 2 dimensional"); + TORCH_CHECK(self.size(0) == self.size(1), "input should be square"); + TORCH_CHECK(self.isfinite().all().item(), "input should not contain infs or NaNs"); + TORCH_CHECK(e.dtype() == self.dtype(), "Expected 'e' to have dtype ", self.dtype(), " but got ", e.dtype()); + if (eigenvectors) + TORCH_CHECK(v.dtype() == self.dtype(), "Expected 'v' to have dtype ", self.dtype(), " but got ", v.dtype()); + int64_t n = self.size(-1); + + at::native::resize_output(e, {n, 2}); + if (eigenvectors) { + at::native::resize_output(v, self.sizes()); + } + + // optimization: if self is empty, we can immediately return the empty + // tensors, instead of getting empty tensors from eig_helper + if (self.numel() == 0) { + return std::tuple(e, v); + } + + Tensor vals_, vecs_; + std::tie(vals_, vecs_) = eig_stub(self.device().type(), self, eigenvectors); + e.copy_(vals_); + if (eigenvectors) { + v.copy_(vecs_); + } + return std::tuple(e, v); +} + +std::tuple eig(const Tensor& self, bool eigenvectors) { + Tensor e = at::empty({0}, self.options()); + Tensor v = at::empty({0}, self.options()); + at::eig_out(e, v, self, eigenvectors); + return std::tuple(e, v); +} + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ svd ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ template diff --git 
a/aten/src/ATen/native/BatchLinearAlgebra.h b/aten/src/ATen/native/BatchLinearAlgebra.h new file mode 100644 index 000000000000..95fc2c6097ce --- /dev/null +++ b/aten/src/ATen/native/BatchLinearAlgebra.h @@ -0,0 +1,24 @@ +#pragma once + +#include +#include + +#include // for USE_LAPACK + + +namespace at { namespace native { + +#ifdef USE_LAPACK +// Define per-batch functions to be used in the implementation of batched +// linear algebra operations + +template +void lapackEig(char jobvl, char jobvr, int n, scalar_t *a, int lda, scalar_t *wr, scalar_t *wi, scalar_t* vl, int ldvl, scalar_t *vr, int ldvr, scalar_t *work, int lwork, int *info); + +#endif + +using eig_fn = std::tuple (*)(const Tensor&, bool&); + +DECLARE_DISPATCH(eig_fn, eig_stub); + +}} // namespace at::native diff --git a/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp b/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp new file mode 100644 index 000000000000..d251245c60c5 --- /dev/null +++ b/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp @@ -0,0 +1,77 @@ +#include +#include +#include +#include + +#include // for USE_LAPACK + +namespace at { namespace native { + +namespace { + +template +void apply_eig(const Tensor& self, bool eigenvectors, Tensor& vals_, Tensor& vecs_, int64_t* info_ptr) { +#ifndef USE_LAPACK + TORCH_CHECK(false, "Calling torch.eig on a CPU tensor requires compiling ", + "PyTorch with LAPACK. Please use PyTorch built with LAPACK support."); +#else + char jobvr = eigenvectors ? 'V' : 'N'; + int64_t n = self.size(-1); + auto self_data = self.data_ptr(); + + auto vals_data = vals_.data_ptr(); + scalar_t* wr = vals_data; + scalar_t* wi = vals_data + n; + + scalar_t* vecs_data = eigenvectors ? vecs_.data_ptr() : nullptr; + int ldvr = eigenvectors ? n : 1; + + if (n > 0) { + // call lapackEig once to get the optimal size for work data + scalar_t wkopt; + int info; + lapackEig('N', jobvr, n, self_data, n, wr, wi, + nullptr, 1, vecs_data, ldvr, &wkopt, -1, &info); + int lwork = static_cast(wkopt); + + // call again to do the actual work + Tensor work = at::empty({lwork}, self.dtype()); + lapackEig('N', jobvr, n, self_data, n, wr, wi, + nullptr, 1, vecs_data, ldvr, work.data_ptr(), lwork, &info); + *info_ptr = info; + } +#endif +} + +std::tuple eig_kernel_impl(const Tensor& self, bool& eigenvectors) { + int64_t n = self.size(-1); + // lapackEig function expects the input to be column major, or stride {1, n}, + // so we must set the stride manually since the default stride for tensors is + // row major, {n, 1} + Tensor self_ = at::empty_strided( + {n, n}, + {1, n}, + at::TensorOptions(self.dtype())); + self_.copy_(self); + + auto options = self.options().memory_format(LEGACY_CONTIGUOUS_MEMORY_FORMAT); + Tensor vals_ = at::empty_strided({n, 2}, {1, n}, options); + Tensor vecs_ = eigenvectors + ? 
at::empty_strided({n, n}, {1, n}, options) + : Tensor(); + + int64_t info; + AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "eig_cpu", [&]{ + apply_eig(self_, eigenvectors, vals_, vecs_, &info); + }); + singleCheckErrors(info, "eig_cpu"); + return std::tuple(vals_, vecs_); +} + +} // anonymous namespace + +REGISTER_ARCH_DISPATCH(eig_stub, DEFAULT, &eig_kernel_impl); +REGISTER_AVX_DISPATCH(eig_stub, &eig_kernel_impl); +REGISTER_AVX2_DISPATCH(eig_stub, &eig_kernel_impl); + +}} // namespace at::native diff --git a/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu b/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu index 5b16adaa2e5f..f49ddd288eb7 100644 --- a/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu +++ b/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -1941,7 +1942,8 @@ TORCH_CHECK(false, "Calling torch.eig on a CUDA tensor requires compiling PyTorc * 2. return CPU tensors (because this is what magmaEig returns), which will be copied to GPU memory * by the caller */ -static std::tuple eig_cuda_helper(const Tensor& self, int64_t n, bool eigenvectors) { +std::tuple eig_kernel_impl(const Tensor& self, bool& eigenvectors) { + int64_t n = self.size(-1); // copy self to pinned CPU memory auto self_working_copy = at::empty_strided( {n, n}, // square matrix @@ -1965,48 +1967,7 @@ static std::tuple eig_cuda_helper(const Tensor& self, int64_t n, return std::tuple(out_eigvals, out_eigvecs); } -std::tuple eig_cuda_out(Tensor& e, Tensor& v, const Tensor& self, bool eigenvectors) { - TORCH_CHECK(self.dim() == 2, "Expected a two-dimensional input but got ", self.dim(), " dimensions"); - TORCH_CHECK(e.dtype() == self.dtype(), "Expected 'e' to have dtype ", self.dtype(), " but got ", e.dtype()); - if (eigenvectors) - TORCH_CHECK(v.dtype() == self.dtype(), "Expected 'v' to have dtype ", self.dtype(), " but got ", v.dtype()); - squareCheckInputs(self); - int64_t n = self.size(-1); - - at::native::resize_output(e, {n, 2}); - if (eigenvectors) { - at::native::resize_output(v, self.sizes()); - } - - // optimization: if self is empty, we can immediately return the empty - // GPU tensors, instead of getting empty CPU tensors from eig_cuda_helper - // and copying them to GPU - if (self.numel() == 0) { - return std::tuple(e, v); - } - - Tensor cpu_vals, cpu_vecs; - std::tie(cpu_vals, cpu_vecs) = eig_cuda_helper(self, n, eigenvectors); - e.copy_(cpu_vals); - if (eigenvectors) { - v.copy_(cpu_vecs); - } - return std::tuple(e, v); -} - -std::tuple eig_cuda(const Tensor& self, bool eigenvectors) { - TORCH_CHECK(self.dim() == 2, "Expected a two-dimensional input but got ", self.dim(), " dimensions"); - squareCheckInputs(self); - int64_t n = self.size(-1); - - Tensor e, v; - e = at::empty({n, 2}, self.options()); - if (eigenvectors) { - v = at::empty({n, n}, self.options()); - } - eig_cuda_out(e, v, self, eigenvectors); - return std::tuple(e, v); -} +REGISTER_DISPATCH(eig_stub, &eig_kernel_impl); // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ syevd ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index e39fce8e75aa..cc6b0e30258e 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -6253,15 +6253,13 @@ - func: eig.e(Tensor self, bool eigenvectors=False, *, Tensor(a!) e, Tensor(b!) v) -> (Tensor(a!) eigenvalues, Tensor(b!) 
eigenvectors) dispatch: - CPU: legacy::cpu::_th_eig_out - CUDA: eig_cuda_out + DefaultBackend: eig_out - func: eig(Tensor self, bool eigenvectors=False) -> (Tensor eigenvalues, Tensor eigenvectors) use_c10_dispatcher: full variants: method, function dispatch: - CPU: legacy::cpu::_th_eig - CUDA: eig_cuda + DefaultBackend: eig - func: svd.U(Tensor self, bool some=True, bool compute_uv=True, *, Tensor(a!) U, Tensor(b!) S, Tensor(c!) V) -> (Tensor(a!) U, Tensor(b!) S, Tensor(c!) V) dispatch: diff --git a/aten/src/TH/generic/THLapack.cpp b/aten/src/TH/generic/THLapack.cpp index c0fb51f53e45..5b4ef15e7c2c 100644 --- a/aten/src/TH/generic/THLapack.cpp +++ b/aten/src/TH/generic/THLapack.cpp @@ -4,8 +4,6 @@ TH_EXTERNC void dgels_(char *trans, int *m, int *n, int *nrhs, double *a, int *lda, double *b, int *ldb, double *work, int *lwork, int *info); TH_EXTERNC void sgels_(char *trans, int *m, int *n, int *nrhs, float *a, int *lda, float *b, int *ldb, float *work, int *lwork, int *info); -TH_EXTERNC void dgeev_(char *jobvl, char *jobvr, int *n, double *a, int *lda, double *wr, double *wi, double* vl, int *ldvl, double *vr, int *ldvr, double *work, int *lwork, int *info); -TH_EXTERNC void sgeev_(char *jobvl, char *jobvr, int *n, float *a, int *lda, float *wr, float *wi, float* vl, int *ldvl, float *vr, int *ldvr, float *work, int *lwork, int *info); TH_EXTERNC void dpotri_(char *uplo, int *n, double *a, int *lda, int *info); TH_EXTERNC void spotri_(char *uplo, int *n, float *a, int *lda, int *info); TH_EXTERNC void sgeqrf_(int *m, int *n, float *a, int *lda, float *tau, float *work, int *lwork, int *info); @@ -31,21 +29,6 @@ void THLapack_(gels)(char trans, int m, int n, int nrhs, scalar_t *a, int lda, s #endif } -/* Compute for an N-by-N real nonsymmetric matrix A, the eigenvalues and, -optionally, the left and/or right eigenvectors */ -void THLapack_(geev)(char jobvl, char jobvr, int n, scalar_t *a, int lda, scalar_t *wr, scalar_t *wi, scalar_t* vl, int ldvl, scalar_t *vr, int ldvr, scalar_t *work, int lwork, int *info) -{ -#ifdef USE_LAPACK -#if defined(TH_REAL_IS_DOUBLE) - dgeev_(&jobvl, &jobvr, &n, a, &lda, wr, wi, vl, &ldvl, vr, &ldvr, work, &lwork, info); -#else - sgeev_(&jobvl, &jobvr, &n, a, &lda, wr, wi, vl, &ldvl, vr, &ldvr, work, &lwork, info); -#endif -#else - THError("geev : Lapack library not found in compile time\n"); -#endif -} - /* Cholesky factorization based Matrix Inverse */ void THLapack_(potri)(char uplo, int n, scalar_t *a, int lda, int *info) { diff --git a/aten/src/TH/generic/THLapack.h b/aten/src/TH/generic/THLapack.h index 287915c74d26..121eee871c67 100644 --- a/aten/src/TH/generic/THLapack.h +++ b/aten/src/TH/generic/THLapack.h @@ -4,8 +4,6 @@ /* ||AX-B|| */ TH_API void THLapack_(gels)(char trans, int m, int n, int nrhs, scalar_t *a, int lda, scalar_t *b, int ldb, scalar_t *work, int lwork, int *info); -/* Non-sym eigenvals */ -TH_API void THLapack_(geev)(char jobvl, char jobvr, int n, scalar_t *a, int lda, scalar_t *wr, scalar_t *wi, scalar_t* vl, int ldvl, scalar_t *vr, int ldvr, scalar_t *work, int lwork, int *info); /* Positive Definite matrices */ /* Matrix inverse based on Cholesky factorization */ diff --git a/aten/src/TH/generic/THTensorLapack.cpp b/aten/src/TH/generic/THTensorLapack.cpp index 76d7d7bc48d8..e6c200169191 100644 --- a/aten/src/TH/generic/THTensorLapack.cpp +++ b/aten/src/TH/generic/THTensorLapack.cpp @@ -191,88 +191,6 @@ void THTensor_(gels)(THTensor *rb_, THTensor *ra_, THTensor *b, THTensor *a) if (free_b) c10::raw::intrusive_ptr::decref(b); } -void 
THTensor_(geev)(THTensor *re_, THTensor *rv_, THTensor *a_, bool eigenvectors) -{ - char jobvr = eigenvectors ? 'V' : 'N'; - int n, lda, lwork, info, ldvr; - THTensor *work=nullptr, *wi, *wr, *a; - scalar_t wkopt; - scalar_t *rv_data; - int64_t i; - - THTensor *re__ = NULL; - THTensor *rv__ = NULL; - - THArgCheck(a_->dim() == 2, 1, "A should be 2 dimensional"); - THArgCheck(a_->size(0) == a_->size(1), 1,"A should be square"); - THArgCheck(THTensor_(isFinite)(a_), 1, "A should not contain infs or NaNs"); - - /* we want to definitely clone a_ for geev*/ - a = THTensor_(cloneColumnMajor)(NULL, a_); - - n = a->size(0); - lda = n; - - wi = THTensor_(newWithSize1d)(n); - wr = THTensor_(newWithSize1d)(n); - - rv_data = NULL; - ldvr = 1; - if (jobvr == 'V') - { - THTensor_(resize2d)(rv_,n,n); - /* guard against someone passing a correct size, but wrong stride */ - rv__ = THTensor_(newTransposedContiguous)(rv_); - rv_data = rv__->data(); - ldvr = n; - } - THTensor_(resize2d)(re_,n,2); - re__ = THTensor_(newContiguous)(re_); - - if (n > 0) { // lapack doesn't work with size 0 - /* get optimal workspace size */ - THLapack_(geev)('N', jobvr, n, a->data(), lda, wr->data(), wi->data(), - NULL, 1, rv_data, ldvr, &wkopt, -1, &info); - - lwork = (int)wkopt; - work = THTensor_(newWithSize1d)(lwork); - - THLapack_(geev)('N', jobvr, n, a->data(), lda, wr->data(), wi->data(), - NULL, 1, rv_data, ldvr, work->data(), lwork, &info); - - THLapackCheckWithCleanup(" Lapack Error in %s : %d off-diagonal elements of an didn't converge to zero", - THCleanup(c10::raw::intrusive_ptr::decref(re__); - c10::raw::intrusive_ptr::decref(rv__); - c10::raw::intrusive_ptr::decref(a); - c10::raw::intrusive_ptr::decref(wi); - c10::raw::intrusive_ptr::decref(wr); - c10::raw::intrusive_ptr::decref(work);), - "geev", info,""); - } - - { - scalar_t *re_data = re__->data(); - scalar_t *wi_data = wi->data(); - scalar_t *wr_data = wr->data(); - for (i=0; i Date: Thu, 10 Dec 2020 00:10:10 -0800 Subject: [PATCH 101/250] [numpy] `torch.exp{2, m1}`: promote integer inputs to float (#48926) Summary: Reference: https://github.com/pytorch/pytorch/issues/42515 Pull Request resolved: https://github.com/pytorch/pytorch/pull/48926 Reviewed By: zhangguanheng66 Differential Revision: D25392344 Pulled By: mruberry fbshipit-source-id: ddbabcfd58cc4c944153b1a224cc232efa022104 --- aten/src/ATen/native/UnaryOps.cpp | 8 ++-- aten/src/ATen/native/cuda/UnaryOpsKernel.cu | 4 +- test/test_torch.py | 3 -- test/test_unary_ufuncs.py | 1 - torch/csrc/jit/tensorexpr/kernel.cpp | 5 ++- .../_internal/common_methods_invocations.py | 42 ++++++++++--------- 6 files changed, 31 insertions(+), 32 deletions(-) diff --git a/aten/src/ATen/native/UnaryOps.cpp b/aten/src/ATen/native/UnaryOps.cpp index 9522d2a1e271..9c91821aed80 100644 --- a/aten/src/ATen/native/UnaryOps.cpp +++ b/aten/src/ATen/native/UnaryOps.cpp @@ -250,12 +250,12 @@ Tensor& exp_out(Tensor& result, const Tensor& self) { return unary_op_impl_out(r Tensor exp(const Tensor& self) { return unary_op_impl(self, at::exp_out); } Tensor& exp_(Tensor& self) { return unary_op_impl_(self, at::exp_out); } -Tensor& exp2_out(Tensor& result, const Tensor& self) { return unary_op_impl_out(result, self, exp2_stub); } -Tensor exp2(const Tensor& self) { return unary_op_impl(self, at::exp2_out); } +Tensor& exp2_out(Tensor& result, const Tensor& self) { return unary_op_impl_float_out(result, self, exp2_stub); } +Tensor exp2(const Tensor& self) { return unary_op_impl_float(self, exp2_stub); } Tensor& exp2_(Tensor& self) { return 
unary_op_impl_(self, at::exp2_out); } -Tensor& expm1_out(Tensor& result, const Tensor& self) { return unary_op_impl_out(result, self, expm1_stub); } -Tensor expm1(const Tensor& self) { return unary_op_impl(self, at::expm1_out); } +Tensor& expm1_out(Tensor& result, const Tensor& self) { return unary_op_impl_float_out(result, self, expm1_stub); } +Tensor expm1(const Tensor& self) { return unary_op_impl_float(self, expm1_stub); } Tensor& expm1_(Tensor& self) { return unary_op_impl_(self, at::expm1_out); } Tensor& erf_out(Tensor& result, const Tensor& self) { return unary_op_impl_float_out(result, self, erf_stub); } diff --git a/aten/src/ATen/native/cuda/UnaryOpsKernel.cu b/aten/src/ATen/native/cuda/UnaryOpsKernel.cu index 512154fd02df..4d676181be79 100644 --- a/aten/src/ATen/native/cuda/UnaryOpsKernel.cu +++ b/aten/src/ATen/native/cuda/UnaryOpsKernel.cu @@ -41,7 +41,7 @@ void exp_kernel_cuda(TensorIterator& iter) { } void exp2_kernel_cuda(TensorIterator& iter) { - AT_DISPATCH_FLOATING_TYPES_AND_HALF(iter.dtype(), "exp2_cuda", [&]() { + AT_DISPATCH_FLOATING_TYPES_AND_HALF(iter.common_dtype(), "exp2_cuda", [&]() { gpu_kernel(iter, [] GPU_LAMBDA(scalar_t a) -> scalar_t { return ::exp2(a); }); @@ -49,7 +49,7 @@ void exp2_kernel_cuda(TensorIterator& iter) { } void expm1_kernel_cuda(TensorIterator& iter) { - AT_DISPATCH_FLOATING_TYPES_AND_HALF(iter.dtype(), "expm1_cuda", [&]() { + AT_DISPATCH_FLOATING_TYPES_AND_HALF(iter.common_dtype(), "expm1_cuda", [&]() { gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { return ::expm1(a); }); diff --git a/test/test_torch.py b/test/test_torch.py index 6ed3594c3d3b..16e011645899 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -6905,9 +6905,6 @@ def inner(self, device, dtype): ('exp', '', _small_3d, lambda t, d: [], 1e-2, 5e-2, 1e-5, torch.testing.get_all_fp_dtypes()), ('exp', 'small', lambda t, d: _small_3d(t, d).clamp(-1, 1), lambda t, d: [], 1e-2, 5e-2, 1e-5, torch.testing.get_all_fp_dtypes(), [torch.bfloat16]), - ('expm1', '', _small_3d, lambda t, d: [], 1e-2, 1e-2, 1e-5, _float_types), - ('expm1', 'small', lambda t, d: _small_3d(t, d).clamp(-1, 1), - lambda t, d: [], 1e-2, 1e-2, 1e-5, _float_types, [torch.bfloat16]), ('rad2deg', '', _small_3d, lambda t, d: [], 1e-1, 1e-0, 1e-5, torch.testing.get_all_fp_dtypes(), [torch.bfloat16]), ('deg2rad', '', _small_3d, lambda t, d: [], 1e-1, 1e-1, 1e-5, torch.testing.get_all_fp_dtypes(), [torch.bfloat16]), ('reciprocal', '', _small_3d, lambda t, d: [], 1e-1, 1e-1, 1e-5, torch.testing.get_all_fp_dtypes(), [torch.bfloat16]), diff --git a/test/test_unary_ufuncs.py b/test/test_unary_ufuncs.py index 656845598a49..f08c5341b399 100644 --- a/test/test_unary_ufuncs.py +++ b/test/test_unary_ufuncs.py @@ -1770,7 +1770,6 @@ def _medium_2d(dtype, device): # TODO: all these should be replaced with OpInfos torch_op_tests = [ _TorchMathTestMeta('exp'), - _TorchMathTestMeta('expm1'), _TorchMathTestMeta('floor'), _TorchMathTestMeta('ceil'), _TorchMathTestMeta('rad2deg'), diff --git a/torch/csrc/jit/tensorexpr/kernel.cpp b/torch/csrc/jit/tensorexpr/kernel.cpp index ea28264246ec..2eb2a81b69eb 100644 --- a/torch/csrc/jit/tensorexpr/kernel.cpp +++ b/torch/csrc/jit/tensorexpr/kernel.cpp @@ -989,8 +989,9 @@ Tensor* TensorExprKernel::computeValue(const torch::jit::Value* v) { } break; case aten::expm1: { - return computeOneOperand( - "aten_expm1", v, [](const ExprHandle& a) { return expm1(a); }); + return computeOneOperand("aten_expm1", v, [](const ExprHandle& a) { + return expm1(promoteIntegerToFloat(a)); + }); } break; case 
aten::erf: { diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 9fc31f6caed4..b88dcaaccb33 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -759,10 +759,26 @@ def sample_inputs(self, device, dtype, requires_grad=False): active_if=(IS_MACOS or IS_WINDOWS)), )), UnaryUfuncInfo('exp2', - ref=np.exp2, - dtypes=floating_types_and(torch.half), - dtypesIfCPU=None, - dtypesIfCUDA=None), + ref=np_unary_ufunc_integer_promotion_wrapper(np.exp2), + dtypes=all_types_and(torch.bool, torch.half), + dtypesIfCPU=all_types_and(torch.bool, torch.half), + dtypesIfCUDA=all_types_and(torch.bool, torch.half), + promotes_integers_to_float=True), + UnaryUfuncInfo('expm1', + ref=np_unary_ufunc_integer_promotion_wrapper(np.expm1), + dtypes=all_types_and(torch.bool, torch.half), + dtypesIfCPU=all_types_and(torch.bool, torch.bfloat16), + dtypesIfCUDA=all_types_and(torch.bool, torch.half), + promotes_integers_to_float=True, + assert_autodiffed=True, + skips=( + # Reference: https://github.com/pytorch/pytorch/pull/48926#issuecomment-739734774 + SkipInfo('TestUnaryUfuncs', 'test_reference_numerics', + device_type='cpu', dtypes=[torch.bfloat16]), + # RuntimeError: "isfinite" not implemented for 'BFloat16' + SkipInfo('TestCommon', 'test_variant_consistency_jit', + device_type='cpu', dtypes=[torch.bfloat16]), + )), UnaryUfuncInfo('nan_to_num', ref=np.nan_to_num, dtypes=all_types_and(torch.half, torch.bool), @@ -785,25 +801,13 @@ def sample_inputs(self, device, dtype, requires_grad=False): # Reference: https://github.com/pytorch/pytorch/pull/47293#issuecomment-721774436 SkipInfo('TestUnaryUfuncs', 'test_reference_numerics', dtypes=[torch.bfloat16]), - # RuntimeError: sqrt does not support automatic differentiation for outputs with complex dtype. 
- SkipInfo('TestGradients', 'test_fn_grad', - dtypes=[torch.cdouble]), - SkipInfo('TestGradients', 'test_fn_gradgrad', - dtypes=[torch.cdouble]), - SkipInfo('TestGradients', 'test_method_grad', - dtypes=[torch.cdouble]), - SkipInfo('TestGradients', 'test_method_gradgrad', - dtypes=[torch.cdouble]), - SkipInfo('TestGradients', 'test_inplace_grad', - dtypes=[torch.cdouble]), - SkipInfo('TestGradients', 'test_inplace_gradgrad', - dtypes=[torch.cdouble]), SkipInfo('TestCommon', 'test_variant_consistency_eager', dtypes=[torch.cfloat, torch.cdouble]), SkipInfo('TestCommon', 'test_variant_consistency_jit', dtypes=[torch.cfloat, torch.cdouble])), promotes_integers_to_float=True, - handles_complex_extremals=False), + handles_complex_extremals=False, + test_complex_grad=False), ] if TEST_SCIPY: @@ -1124,8 +1128,6 @@ def method_tests(): ('expand_as', (S, 1, 1), (torch.rand(S, S, S),), '', (False,)), ('exp', (S, S, S), NO_ARGS, '', (True,)), ('exp', (), NO_ARGS, 'scalar', (True,)), - ('expm1', (S, S, S), NO_ARGS, '', (True,)), - ('expm1', (), NO_ARGS, 'scalar', (True,)), ('erfinv', torch.rand(S, S, S).clamp(-0.9, 0.9), NO_ARGS), ('erfinv', normal_scalar_clamp(-0.9, 0.9, requires_grad=True), NO_ARGS, 'scalar'), ('logit', torch.randn(S, S, S).clamp(0.1, 0.9).requires_grad_(True), NO_ARGS, ''), From 7f7f0fa335fd8edeabb6ad6d4c97586885345bd5 Mon Sep 17 00:00:00 2001 From: Luca Wehrstedt Date: Thu, 10 Dec 2020 03:45:30 -0800 Subject: [PATCH 102/250] Avoid using FutureNCCL before it's ready (#48561) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48561 This commit is part of a stack that reworks FutureNCCL in order to extract a generic CUDA-aware Future subclass. The stack deliberately breaks up this transition into elementary changes, to make it easier to verify that the behavior is preserved (or to highlight how it gets changed). --- WorkNCCL allows to extract a FutureNCCL through getFuture(). There is one instance of this method being called by ProcessGroupNCCL itself, in order to attach a callback to it. This was happening _before_ the work was actually launched, however FutureNCCL does _always_ invoke its callbacks immediately inline. The events that the FutureNCCL was using hadn't been recorded yet, thus blocking on them was a no-op. Moreover, the function that was being called was installed by the generic ProcessGroup superclass, which is not CUDA-aware, and thus probably didn't make any use of the CUDA events or streams. https://github.com/pytorch/pytorch/blob/383abf1f0c1f74e0f471d47e505895d1b0e6bb20/torch/lib/c10d/ProcessGroup.cpp#L66 In short: I believe that creating a FutureNCCL and attaching a callback was equivalent to just invoking that function directly, without any CUDA-specific thing. I'm thus converting the code to do just that, in order to simplify it. Note that, given the comment, I don't think this was the original intention of that code. It seems that the function was intended to be run once the work finished. However, I am not familiar with this code, and I don't want to introduce any functional changes. 
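To make the equivalence concrete, here is a minimal sketch with stand-in types (InlineFuture is hypothetical, not the real c10::ivalue::Future or FutureNCCL): a future that is already complete and runs its callbacks inline makes addCallback(cb) behave exactly like calling cb() directly on the caller's thread.

```cpp
// Toy illustration only: InlineFuture is a stand-in, not the real c10 class.
#include <functional>
#include <iostream>

struct InlineFuture {
  // Like the FutureNCCL described above, this future never defers work:
  // addCallback() invokes the callback immediately, inline, on the calling thread.
  void addCallback(std::function<void()> cb) { cb(); }
};

int main() {
  std::function<void()> recordFunctionEnd = [] { std::cout << "profiling range closed\n"; };

  InlineFuture fut;
  fut.addCallback(recordFunctionEnd); // indirect, but still runs right here, right now
  recordFunctionEnd();                // the equivalent direct call this patch switches to
  return 0;
}
```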
ghstack-source-id: 118180037 Test Plan: Unit tests Reviewed By: mrshenli Differential Revision: D25210337 fbshipit-source-id: 54033c814ac77641cbbe79b4d01686dfc2b45495 --- torch/lib/c10d/ProcessGroupNCCL.cpp | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/torch/lib/c10d/ProcessGroupNCCL.cpp b/torch/lib/c10d/ProcessGroupNCCL.cpp index 1aa161511b01..4b5fb5f59aae 100644 --- a/torch/lib/c10d/ProcessGroupNCCL.cpp +++ b/torch/lib/c10d/ProcessGroupNCCL.cpp @@ -462,7 +462,7 @@ ProcessGroupNCCL::ProcessGroupNCCL( if (blockingWait_ && asyncErrorHandling_) { LOG(INFO) << "[Rank " << rank_ << "] NCCL_BLOCKING_WAIT and NCCL_ASYNC_ERROR_HANDLING " - << "should not both be enabled. " + << "should not both be enabled. " << "Only NCCL_BLOCKING_WAIT is being used in this process."; asyncErrorHandling_ = false; } @@ -1073,15 +1073,13 @@ c10::intrusive_ptr ProcessGroupNCCL::collective( if (work->recordFunctionEndCallback_) { // recordFunctionEndCallback_ is normally called in fininsh() function by - // base class, but since finish is not called by WorkNCCL, we schedule this - // function to be run when work is done. + // base class, but since finish is not called by WorkNCCL, we run this + // function now. // Note when can_profile is false, profilingTitle is not provided and so, // recordFunctionEndCallback_ is not set. - work->getFuture()->addCallback(std::move(work->recordFunctionEndCallback_)); + work->recordFunctionEndCallback_(); } - - at::cuda::OptionalCUDAGuard gpuGuard; pre(ncclStreams_[key]); From b7f5aa9890ea06ece779153614132e69e20313f0 Mon Sep 17 00:00:00 2001 From: Luca Wehrstedt Date: Thu, 10 Dec 2020 03:45:30 -0800 Subject: [PATCH 103/250] Remove NCCL dependency from PythonFutureWrapper (#48495) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48495 This commit is part of a stack that reworks FutureNCCL in order to extract a generic CUDA-aware Future subclass. The stack deliberately breaks up this transition into elementary changes, to make it easier to verify that the behavior is preserved (or to highlight how it gets changed). --- PythonFutureWrapper needs to provide a GIL-aware way to extract tensors from an IValue of type PyObject. Since this was only used by FutureNCCL it was guarded by #ifdef USE_C10D_NCCL. However, we will need to use it with CUDA-aware futures other than the NCCL one. This might have been achieved simply by replacing USE_C10D_NCCL with USE_CUDA, but I wanted to clean this up better. We're dealing with two independent dimensions: C++-vs-Python and CPU-vs-CUDA. To make the code more modular, the two dimensions should be dealt with by orthogonal solutions: the user setting a custom callback to handle Python, and the subclass being CUDA-aware. Mixing these two axes makes it more complicated. Another reason for changing how this works is that later on, when we'll introduce multi-device support, we'll need to extract dataptrs for other reasons too (rather than just recording streams with the caching allocator), namely to inspect the value to determine which devices it resides on. 
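As a rough illustration of that split, the sketch below uses placeholder types (Value and DataPtr stand in for at::IValue and at::DataPtr; the real signatures are in the diff that follows): the CUDA-aware future only sees a generic extractor hook with a tensor-only default, and only the Python wrapper installs a GIL-aware implementation.

```cpp
// Simplified sketch of the DataPtrExtractor hook; all types here are placeholders.
#include <functional>
#include <utility>
#include <vector>

struct DataPtr {};
struct Value { std::vector<DataPtr> tensor_storage; };

struct CudaAwareFuture {
  using DataPtrExtractor = std::function<std::vector<DataPtr>(const Value&)>;

  // Default path: assume the value already holds tensors; no Python involved.
  static std::vector<DataPtr> defaultDataPtrExtractor(const Value& v) {
    return v.tensor_storage;
  }

  // The Python wrapper overrides this with an extractor that acquires the GIL and
  // unpacks the PyObject into tensors first; the future itself stays Python-free.
  void setDataPtrExtractor(DataPtrExtractor e) { extractor_ = std::move(e); }

  std::vector<DataPtr> extractDataPtrs(const Value& v) const {
    return extractor_ ? extractor_(v) : defaultDataPtrExtractor(v);
  }

 private:
  DataPtrExtractor extractor_;
};

int main() {
  CudaAwareFuture fut; // C++-only users never call setDataPtrExtractor(); Python users would.
  Value v{{DataPtr{}, DataPtr{}}};
  return fut.extractDataPtrs(v).size() == 2 ? 0 : 1;
}
```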
ghstack-source-id: 118180038 Test Plan: Unit tests Reviewed By: mrshenli Differential Revision: D25177560 fbshipit-source-id: 3a424610c1ea191e8371ffee0a26d62639895884 --- aten/src/ATen/core/ivalue_inl.h | 34 +++++++++++++++---- torch/csrc/jit/python/pybind_utils.h | 45 +++++++++++------------- torch/lib/c10d/ProcessGroupNCCL.hpp | 51 +++++++++++++--------------- 3 files changed, 71 insertions(+), 59 deletions(-) diff --git a/aten/src/ATen/core/ivalue_inl.h b/aten/src/ATen/core/ivalue_inl.h index 3068bda5f5a5..32ff811f05bd 100644 --- a/aten/src/ATen/core/ivalue_inl.h +++ b/aten/src/ATen/core/ivalue_inl.h @@ -417,12 +417,34 @@ struct C10_EXPORT ivalue::Future : c10::intrusive_ptr_target { return fut; } - // Since this file cannot import CUDA depedency, the type of the seocond arg - // in the callback is c10::Stream instead of at::cuda::CUDAStream, and - // CUDAStream is constructed on the fly. The default implementation - // is a no-op, since it does not deal with any CUDA streams. - virtual void setRecordStreamCallback( - std::function record_stream_cb) {} + // Some subclasses deal with CUDA tensors and must inform the CUDA caching + // allocator of which CUDA streams each DataPtr is used in. If the value held + // by the future is a Python object we need to acquire the GIL when extracting + // these DataPtrs. Since this file cannot depend on Python, we allow users to + // provide a "custom" extractor. Look for example at the PythonFutureWrapper. + using DataPtrExtractor = + std::function>( + const at::IValue&)>; + virtual void setDataPtrExtractor(DataPtrExtractor data_ptr_extractor) {} + + // Expose the default implementation so that external ones can defer to it. + static std::vector> + defaultDataPtrExtractor(const at::IValue& value) { + // FIXME Should we support more types than just tensors and tensor lists? + TORCH_INTERNAL_ASSERT( + value.isTensorList() || value.isTensor(), + "the future value must be either a tensor list or a tensor."); + at::Tensor tensor; + if (value.isTensorList()) { + const auto tensors = value.toTensorVector(); + TORCH_INTERNAL_ASSERT(tensors.size() == 1, "expected exactly 1 tensor"); + tensor = tensors[0]; + } else { + tensor = value.toTensor(); + } + + return {tensor.storage().data_ptr()}; + }; // Tries to retrieve the error message from std::exception_ptr. std::string tryRetrieveErrorMessage() { diff --git a/torch/csrc/jit/python/pybind_utils.h b/torch/csrc/jit/python/pybind_utils.h index 99b439aa185f..ab77726c11af 100644 --- a/torch/csrc/jit/python/pybind_utils.h +++ b/torch/csrc/jit/python/pybind_utils.h @@ -119,32 +119,7 @@ struct VISIBILITY_HIDDEN PythonFutureWrapper // vector, but Future does not acquire GIL on destruction. auto pf = std::make_shared(std::move(cb)); -#ifdef USE_C10D_NCCL - // This callback is only used by NCCL backend, so skip this code on other - // backends and avoid importing cuda dependency. - // By default, assume that the input value is or can be casted into a tensor - // vector that has exactly one tensor. 
- auto record_stream_cb = [](const at::IValue& value, - const c10::Stream& stream) { - if (value.isTensorList() || value.isPyObject()) { - std::vector tensors; - if (value.isTensorList()) { - tensors = value.toTensorVector(); - } else { - pybind11::gil_scoped_acquire gil; - py::object obj = torch::jit::toPyObject(value); - tensors = torch::jit::toIValue( - obj, c10::ListType::create(c10::TensorType::get())) - .toTensorVector(); - } - TORCH_INTERNAL_ASSERT(tensors.size() == 1, "expected exactly 1 tensor"); - at::cuda::CUDAStream cuda_stream(stream); - c10::cuda::CUDACachingAllocator::recordStream( - tensors[0].storage().data_ptr(), cuda_stream); - } - }; - fut->setRecordStreamCallback(record_stream_cb); -#endif + fut->setDataPtrExtractor(&PythonFutureWrapper::dataPtrExtractor); return std::make_shared(fut->then( // Capture a copy of the ivalue::Future instead of the `this` pointer @@ -241,6 +216,24 @@ struct VISIBILITY_HIDDEN PythonFutureWrapper std::shared_ptr getPtr() { return shared_from_this(); } + + // This callback is only used by subclasses of Future that deal with CUDA, + // in order to register the pointers on the right streams with the caching + // allocator. + // By default, assume that the input value is or can be casted into a tensor + // vector that has exactly one tensor. + static std::vector> dataPtrExtractor( + const at::IValue& value) { + if (value.isPyObject()) { + pybind11::gil_scoped_acquire gil; + py::object obj = torch::jit::toPyObject(value); + // FIXME Should we support more types than just tensor lists? + auto new_value = torch::jit::toIValue( + obj, c10::ListType::create(c10::TensorType::get())); + return at::ivalue::Future::defaultDataPtrExtractor(new_value); + } + return at::ivalue::Future::defaultDataPtrExtractor(value); + }; }; // error reporting: when reporting user-caused errors, these functions should diff --git a/torch/lib/c10d/ProcessGroupNCCL.hpp b/torch/lib/c10d/ProcessGroupNCCL.hpp index fd57f105df0b..7f202cdd80c0 100644 --- a/torch/lib/c10d/ProcessGroupNCCL.hpp +++ b/torch/lib/c10d/ProcessGroupNCCL.hpp @@ -314,28 +314,9 @@ class ProcessGroupNCCL : public ProcessGroup { // Do not free the underlying data storage of value_ before its // usage on futureNCCLCallbackStream_ finish. - if (record_stream_cb_ != nullptr) { - // If a Python communication hook is used, record_stream_cb_ will be - // set in torch/csrc/jit/python/pybind_utils.h, which allows Python - // dependency to be imported. - record_stream_cb_(value_, futureNCCLCallbackStream_->unwrap()); - } else { - // If a C++ communication hook is used, create and set a record stream - // callback. - TORCH_INTERNAL_ASSERT( - value_.isTensorList() || value_.isTensor(), - "the future value must be either a tensor list or a tensor."); - at::Tensor tensor; - if (value_.isTensorList()) { - const auto tensors = value_.toTensorVector(); - TORCH_INTERNAL_ASSERT( - tensors.size() == 1, "expected exactly 1 tensor"); - tensor = tensors[0]; - } else { - tensor = value_.toTensor(); - } + for (const at::DataPtr& data_ptr : extractDataPtrs(value_)) { c10::cuda::CUDACachingAllocator::recordStream( - tensor.storage().data_ptr(), *futureNCCLCallbackStream_); + data_ptr, *futureNCCLCallbackStream_); } // Use the dedicated callback stream to run callback. 
@@ -372,10 +353,9 @@ class ProcessGroupNCCL : public ProcessGroup { return !value_.isNone(); } - void setRecordStreamCallback( - std::function - record_stream_cb) override { - record_stream_cb_ = std::move(record_stream_cb); + void setDataPtrExtractor(DataPtrExtractor dataPtrExtractor) override { + std::unique_lock lock(dataPtrExtractorMutex_); + dataPtrExtractor_ = std::move(dataPtrExtractor); } private: @@ -383,9 +363,26 @@ class ProcessGroupNCCL : public ProcessGroup { c10::DeviceIndex deviceIndex_; std::shared_ptr> cudaEvents_; std::shared_ptr futureNCCLCallbackStream_; - std::function - record_stream_cb_; + DataPtrExtractor dataPtrExtractor_; + std::mutex dataPtrExtractorMutex_; c10::optional error_; + + std::vector> extractDataPtrs( + const at::IValue& value) { + std::unique_lock lock(dataPtrExtractorMutex_); + std::vector> data_ptrs; + if (dataPtrExtractor_ != nullptr) { + // If a Python communication hook is used, dataPtrExtractor_ will be + // set in torch/csrc/jit/python/pybind_utils.h, which allows Python + // dependency to be imported. + data_ptrs = dataPtrExtractor_(value); + } else { + // If a C++ communication hook is used, use the default extractor. + data_ptrs = at::ivalue::Future::defaultDataPtrExtractor(value); + } + TORCH_INTERNAL_ASSERT(data_ptrs.size() == 1, "expected exactly 1 tensor"); + return data_ptrs; + } }; // If you wish to create multiple process groups, each with a potentially From 868a1a48c68114c53d4e920dead36d3f401d4440 Mon Sep 17 00:00:00 2001 From: Luca Wehrstedt Date: Thu, 10 Dec 2020 03:45:30 -0800 Subject: [PATCH 104/250] Add some safeguards to FutureNCCL (#48562) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48562 This commit is part of a stack that reworks FutureNCCL in order to extract a generic CUDA-aware Future subclass. The stack deliberately breaks up this transition into elementary changes, to make it easier to verify that the behavior is preserved (or to highlight how it gets changed). --- In this commit I'm adding a few asserts to the constructors of FutureNCCL to make sure that what's passed in is what we expect (fun fact: until two commits ago that wasn't the case, as we were passed some empty events). I'm also making the second constructor private, as it's only supposed to be used by the then() method. ghstack-source-id: 118180036 Test Plan: Unit tests Reviewed By: mrshenli Differential Revision: D25210333 fbshipit-source-id: d2eacf0f7de5cc763e3cdd1ae5fd521fd2eec317 --- torch/lib/c10d/ProcessGroupNCCL.hpp | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/torch/lib/c10d/ProcessGroupNCCL.hpp b/torch/lib/c10d/ProcessGroupNCCL.hpp index 7f202cdd80c0..f9c4dadc578a 100644 --- a/torch/lib/c10d/ProcessGroupNCCL.hpp +++ b/torch/lib/c10d/ProcessGroupNCCL.hpp @@ -231,8 +231,16 @@ class ProcessGroupNCCL : public ProcessGroup { TORCH_INTERNAL_ASSERT( cudaEvents_->size() == 1, "FutureNCCL only supports single-process single-device mode."); + for (const at::cuda::CUDAEvent& event : *cudaEvents_) { + TORCH_INTERNAL_ASSERT(event.isCreated()); + TORCH_INTERNAL_ASSERT(event.device_index() == deviceIndex_); + } + for (const at::DataPtr& data_ptr : extractDataPtrs(value_)) { + TORCH_INTERNAL_ASSERT(data_ptr.device().index() == deviceIndex_); + } } + private: // This constructor is used by then callback, it skips setting the value at // the beginning. Later, the value will be set using markCompleted with the // return value of callback. 
@@ -248,7 +256,12 @@ class ProcessGroupNCCL : public ProcessGroup { cudaEvents_->size() == 1, "FutureNCCL only supports single-process single-device mode."); } + // We need this because it will be the ::make() static method that actually + // creates the instance. This is a brittle approach and the passkey idiom + // would be a more robust solution. However, this will go away in #48505. + friend c10::intrusive_ptr; + public: // Gets the current stream of the device and synchronizes recorded streams // with that. It will return after synchronizing the correct GPU streams to // ensure we can have async CUDA execution and it does not wait for the @@ -270,6 +283,9 @@ class ProcessGroupNCCL : public ProcessGroup { "Attempting to set value of a FutureNCCL which has a value." "FutureNCCL's value was internally set to NCCL collective's " "outputs or the return value of the callback."); + for (const at::DataPtr& data_ptr : extractDataPtrs(value)) { + TORCH_INTERNAL_ASSERT(data_ptr.device().index() == deviceIndex_); + } value_ = std::move(value); } @@ -311,6 +327,13 @@ class ProcessGroupNCCL : public ProcessGroup { // Create a FutureNCCL without setting a value. auto fut = c10::make_intrusive( deviceIndex_, thenFutCudaEvents, futureNCCLCallbackStream_); + // The new future needs the DataPtr extractor when it gets marked complete + // but this might happen immediately inline or in parallel by another + // thread. In both these cases this would/might happen before the user has + // time to set their own DataPtr extractor, which might lead to failures + // if the default extractor can't handle some of the user's types. + // Therefore we propagate our extractor. + fut->setDataPtrExtractor(dataPtrExtractor_); // Do not free the underlying data storage of value_ before its // usage on futureNCCLCallbackStream_ finish. From e4267eb424ba9a97a945a5b3ed9e3a95d85cd002 Mon Sep 17 00:00:00 2001 From: Luca Wehrstedt Date: Thu, 10 Dec 2020 03:45:30 -0800 Subject: [PATCH 105/250] Have FutureNCCL record streams w/ allocator in addCallback (#48496) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48496 This commit is part of a stack that reworks FutureNCCL in order to extract a generic CUDA-aware Future subclass. The stack deliberately breaks up this transition into elementary changes, to make it easier to verify that the behavior is preserved (or to highlight how it gets changed). --- There are two ways to add a callback to a Future: `then` and `addCallback` (with the former deferring to the latter). FutureNCCL only "patched" `then`, which caused `addCallback` to be unsupported. By patching `addCallback`, on the other hand, we cover both. The high-level goal of this change though is to remove all CUDA-specific stuff from `then`, and move it to either `markCompleted` or to a wrapper around the callback. This will take a few more steps to achieve. ghstack-source-id: 118180031 Test Plan: Unit tests Reviewed By: mrshenli Differential Revision: D25177558 fbshipit-source-id: ee0ad24eb2e56494c353db700319858ef9dcf32b --- torch/lib/c10d/ProcessGroupNCCL.hpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/torch/lib/c10d/ProcessGroupNCCL.hpp b/torch/lib/c10d/ProcessGroupNCCL.hpp index f9c4dadc578a..e64f735ef63b 100644 --- a/torch/lib/c10d/ProcessGroupNCCL.hpp +++ b/torch/lib/c10d/ProcessGroupNCCL.hpp @@ -307,7 +307,15 @@ class ProcessGroupNCCL : public ProcessGroup { // this callback. 
This new FutureNCCL's cudaEvents will record the // callback's stream and will have the result value of the callback. void addCallback(std::function callback) override { + // Do not free the underlying data storage of value_ before its + // usage on futureNCCLCallbackStream_ finish. + for (const at::DataPtr& data_ptr : extractDataPtrs(value_)) { + c10::cuda::CUDACachingAllocator::recordStream( + data_ptr, *futureNCCLCallbackStream_); + } + (*cudaEvents_)[0].block(*futureNCCLCallbackStream_); + // Use the dedicated callback stream to run callback. c10::OptionalStreamGuard streamGuard{ c10::Stream(*futureNCCLCallbackStream_)}; callback(); @@ -335,14 +343,6 @@ class ProcessGroupNCCL : public ProcessGroup { // Therefore we propagate our extractor. fut->setDataPtrExtractor(dataPtrExtractor_); - // Do not free the underlying data storage of value_ before its - // usage on futureNCCLCallbackStream_ finish. - for (const at::DataPtr& data_ptr : extractDataPtrs(value_)) { - c10::cuda::CUDACachingAllocator::recordStream( - data_ptr, *futureNCCLCallbackStream_); - } - - // Use the dedicated callback stream to run callback. // Cannot move capture std::function in lambda, because it cannot deduce // the template type for std::function. Hence use std::bind to explicitly // specify types. From 8fb52e7fa2f7b50846e4dd8ca6e06028d5538457 Mon Sep 17 00:00:00 2001 From: Luca Wehrstedt Date: Thu, 10 Dec 2020 03:45:30 -0800 Subject: [PATCH 106/250] Make FutureNCCL record events in current stream (#48497) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48497 This commit is part of a stack that reworks FutureNCCL in order to extract a generic CUDA-aware Future subclass. The stack deliberately breaks up this transition into elementary changes, to make it easier to verify that the behavior is preserved (or to highlight how it gets changed). --- When we record the events to mark a "follow-up" future complete (for a callback), we used to record them onto the dedicated stream, but that streams is the current stream at that time, so instead we could just record them onto the current stream. This introduces no functional differences. The reason I'm adding such an additional layer of indirection is so that the dedicated stream is only referenced inside the `addCallback` method, which will later allow us to more easily change how that stream works. ghstack-source-id: 118180035 Test Plan: Unit tests Reviewed By: mrshenli Differential Revision: D25177553 fbshipit-source-id: c6373eddd34bd399df09fd4861915bf98fd50681 --- torch/lib/c10d/ProcessGroupNCCL.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/torch/lib/c10d/ProcessGroupNCCL.hpp b/torch/lib/c10d/ProcessGroupNCCL.hpp index e64f735ef63b..fe6c113e9f22 100644 --- a/torch/lib/c10d/ProcessGroupNCCL.hpp +++ b/torch/lib/c10d/ProcessGroupNCCL.hpp @@ -316,8 +316,7 @@ class ProcessGroupNCCL : public ProcessGroup { (*cudaEvents_)[0].block(*futureNCCLCallbackStream_); // Use the dedicated callback stream to run callback. - c10::OptionalStreamGuard streamGuard{ - c10::Stream(*futureNCCLCallbackStream_)}; + c10::StreamGuard streamGuard{*futureNCCLCallbackStream_}; callback(); } @@ -352,7 +351,8 @@ class ProcessGroupNCCL : public ProcessGroup { fut->markCompleted(at::IValue(cb())); // In case of chained then callback calls, thenFutCudaEvents // records callback's stream. 
- (*thenFutCudaEvents)[0].record(*futureNCCLCallbackStream_); + (*thenFutCudaEvents)[0].record( + at::cuda::getCurrentCUDAStream(deviceIndex_)); } catch (const std::exception& e) { fut->setError(std::current_exception()); } From 6157f8aeb5bb8e4ab103c8afd0ec45563c66d3d2 Mon Sep 17 00:00:00 2001 From: Luca Wehrstedt Date: Thu, 10 Dec 2020 03:45:30 -0800 Subject: [PATCH 107/250] Use fresh stream from pool for each FutureNCCL callback (#48498) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48498 This commit is part of a stack that reworks FutureNCCL in order to extract a generic CUDA-aware Future subclass. The stack deliberately breaks up this transition into elementary changes, to make it easier to verify that the behavior is preserved (or to highlight how it gets changed). --- FutureNCCL has a dedicated CUDA stream that it sets as current when running callbacks. This stream is initialized by the ProcessGroupNCCL by extracting it from the global ATen pool. In order to decouple FutureNCCL from that specific ProcessGroup and make it more generic, in this commit we make FutureNCCL extract a fresh stream from the ATen pool each time it needs one. This introduces a functional change, because it removes the implicit synchronization and ordering between the callbacks of a same Future. In fact, such an ordering is hard to guarantee in the general case as, for example, a user could attach a new callback just after the future becomes completed, and thus that callback would be run inline, immediately, out-of-order wrt the other callbacks. (There are ways to "fix" this but they are complicated). NCCL got around this because its futures are already marked complete when they're returned, but in fact it could also run into issues if multiple threads were adding callbacks simultaneously. Note that it remains still possible to enforce ordering between callbacks, but one must now do so explicitly. Namely, instead of this: ``` fut.then(cb1) fut.then(cb2) ``` one must now do: ``` fut.then(cb1).then(cb2) ``` ghstack-source-id: 118180029 Test Plan: Unit tests Reviewed By: mrshenli Differential Revision: D25177559 fbshipit-source-id: 4d4e73ea7bda0ea65066548109b9ea6d5b465599 --- torch/lib/c10d/ProcessGroupNCCL.cpp | 21 ++--------- torch/lib/c10d/ProcessGroupNCCL.hpp | 57 +++++++++-------------------- 2 files changed, 21 insertions(+), 57 deletions(-) diff --git a/torch/lib/c10d/ProcessGroupNCCL.cpp b/torch/lib/c10d/ProcessGroupNCCL.cpp index 4b5fb5f59aae..18948c80d242 100644 --- a/torch/lib/c10d/ProcessGroupNCCL.cpp +++ b/torch/lib/c10d/ProcessGroupNCCL.cpp @@ -452,7 +452,6 @@ ProcessGroupNCCL::ProcessGroupNCCL( ncclCommCounter_(0), terminateProcessGroup_(false), opTimeout_(options->opTimeout), - futureNCCLCallbackStreams_(c10::cuda::device_count()), isHighPriorityStream_(options->isHighPriorityStream) { TORCH_CHECK(at::cuda::getNumGPUs() != 0, "ProcessGroupNCCL is only supported with GPUs, no GPUs found!"); @@ -867,15 +866,6 @@ std::vector>& ProcessGroupNCCL::getNCCLComm( // Creates the NCCL streams streamVal.push_back(at::cuda::getStreamFromPool(isHighPriorityStream_)); - - // If not set before, get a dedicated stream for the device to run - // FutureNCCL then callbacks. 
- std::lock_guard lock(mutex_); - if (futureNCCLCallbackStreams_[deviceIndex] == nullptr) { - futureNCCLCallbackStreams_[deviceIndex] = - std::make_shared( - at::cuda::getStreamFromPool(isHighPriorityStream_)); - } } // [Note 2 ] @@ -1027,8 +1017,7 @@ c10::intrusive_ptr ProcessGroupNCCL::WorkNCCL:: return c10::make_intrusive( at::IValue(*outputs_), deviceIndex, - cudaEvents_, - futureNCCLCallbackStreams_[deviceIndex]); + cudaEvents_); } void ProcessGroupNCCL::workEnqueue( @@ -1066,10 +1055,8 @@ c10::intrusive_ptr ProcessGroupNCCL::collective( bool can_profile = outputs.size() == 1; auto work = initWork(devices, rank_, opType, can_profile ? profilingTitle : nullptr); - // Store references to outputs and futureNCCLCallbackStream to be used by - // WorkNCCL::getFuture. + // Store references to outputs to be used by WorkNCCL::getFuture. work->outputs_ = std::make_shared>(outputs); - work->futureNCCLCallbackStreams_ = futureNCCLCallbackStreams_; if (work->recordFunctionEndCallback_) { // recordFunctionEndCallback_ is normally called in fininsh() function by @@ -1152,10 +1139,8 @@ c10::intrusive_ptr ProcessGroupNCCL::pointToPoint( auto work = initWork(devices, rank_, opType); if (opType == OpType::RECV) { - // Store references to outputs and futureNCCLCallbackStream to be used by - // WorkNCCL::getFuture. + // Store references to outputs to be used by WorkNCCL::getFuture. work->outputs_ = std::make_shared>(tensors); - work->futureNCCLCallbackStreams_ = futureNCCLCallbackStreams_; } at::cuda::OptionalCUDAGuard gpuGuard; diff --git a/torch/lib/c10d/ProcessGroupNCCL.hpp b/torch/lib/c10d/ProcessGroupNCCL.hpp index fe6c113e9f22..c1b67cfe2af1 100644 --- a/torch/lib/c10d/ProcessGroupNCCL.hpp +++ b/torch/lib/c10d/ProcessGroupNCCL.hpp @@ -171,10 +171,6 @@ class ProcessGroupNCCL : public ProcessGroup { // Store a reference to NCCL collective's outputs to be used by getFuture. std::shared_ptr> outputs_; - // Store streams that run FutureNCCL then callbacks. - std::vector> - futureNCCLCallbackStreams_; - friend class ProcessGroupNCCL; }; @@ -200,10 +196,8 @@ class ProcessGroupNCCL : public ProcessGroup { // or NCCL's barrier(). // // If created by WorkNCCL's getFuture API, FutureNCCL has a reference to - // WorkNCCL's cudaEvents, NCCL collective's outputs, device index of - // outputs' device, and the ProcesGroupNCCL's dedicated - // futureNCCLCallbackStream for outputs' device that runs all the then - // callbacks called from this FutureNCCL. Its value is NCCL collective's + // WorkNCCL's cudaEvents, NCCL collective's outputs, and the device index of + // outputs' device. Its value is NCCL collective's // outputs. FutureNCCL only supports single-process single-device mode where // the size of outputs is equal to 1. // @@ -213,21 +207,17 @@ class ProcessGroupNCCL : public ProcessGroup { // own cudaEvents with the stream that runs the callback. This design // enables synchronizing the appropriate streams and avoids stalling PyTorch's // default stream while running the callback. In case of multiple then - // callbacks, the design will work like a chain such that FutureNCCL n will - // wait on the cudaEvents from FutureNCCL n - 1. All callbacks are executed on - // outputs' device's dedicated futureNCCLCallbackStream. + // callbacks, each will be executed on its own fresh stream. 
struct FutureNCCL : at::ivalue::Future { public: explicit FutureNCCL( at::IValue value, c10::DeviceIndex deviceIndex, - std::shared_ptr> cudaEvents, - std::shared_ptr futureNCCLCallbackStream) + std::shared_ptr> cudaEvents) : at::ivalue::Future(c10::ListType::create(c10::TensorType::get())), value_(std::move(value)), deviceIndex_(deviceIndex), - cudaEvents_(cudaEvents), - futureNCCLCallbackStream_(futureNCCLCallbackStream) { + cudaEvents_(std::move(cudaEvents)) { TORCH_INTERNAL_ASSERT( cudaEvents_->size() == 1, "FutureNCCL only supports single-process single-device mode."); @@ -246,12 +236,10 @@ class ProcessGroupNCCL : public ProcessGroup { // return value of callback. explicit FutureNCCL( c10::DeviceIndex deviceIndex, - std::shared_ptr> cudaEvents, - std::shared_ptr futureNCCLCallbackStream) + std::shared_ptr> cudaEvents) : at::ivalue::Future(c10::ListType::create(c10::TensorType::get())), deviceIndex_(deviceIndex), - cudaEvents_(cudaEvents), - futureNCCLCallbackStream_(futureNCCLCallbackStream) { + cudaEvents_(std::move(cudaEvents)) { TORCH_INTERNAL_ASSERT( cudaEvents_->size() == 1, "FutureNCCL only supports single-process single-device mode."); @@ -307,16 +295,19 @@ class ProcessGroupNCCL : public ProcessGroup { // this callback. This new FutureNCCL's cudaEvents will record the // callback's stream and will have the result value of the callback. void addCallback(std::function callback) override { + // FIXME Should we find a way to allow to change the priority of streams? + at::cuda::CUDAStream stream = + at::cuda::getStreamFromPool(/*isHighPriority=*/false, deviceIndex_); + // Do not free the underlying data storage of value_ before its - // usage on futureNCCLCallbackStream_ finish. + // usage on the stream finishes. for (const at::DataPtr& data_ptr : extractDataPtrs(value_)) { - c10::cuda::CUDACachingAllocator::recordStream( - data_ptr, *futureNCCLCallbackStream_); + c10::cuda::CUDACachingAllocator::recordStream(data_ptr, stream); } - (*cudaEvents_)[0].block(*futureNCCLCallbackStream_); + (*cudaEvents_)[0].block(stream); // Use the dedicated callback stream to run callback. - c10::StreamGuard streamGuard{*futureNCCLCallbackStream_}; + c10::StreamGuard streamGuard{stream}; callback(); } @@ -326,14 +317,13 @@ class ProcessGroupNCCL : public ProcessGroup { c10::intrusive_ptr then( std::function callback, at::TypePtr /* unused */) override { - // Create a new cudaEvents object of size 1 that will record - // futureNCCLCallbackStream_ after callback and will be passed to the new - // FutureNCCL. + // Create a new cudaEvents object of size 1 that will record the current + // stream after callback and will be passed to the new FutureNCCL. auto thenFutCudaEvents = std::make_shared>(1); // Create a FutureNCCL without setting a value. auto fut = c10::make_intrusive( - deviceIndex_, thenFutCudaEvents, futureNCCLCallbackStream_); + deviceIndex_, thenFutCudaEvents); // The new future needs the DataPtr extractor when it gets marked complete // but this might happen immediately inline or in parallel by another // thread. In both these cases this would/might happen before the user has @@ -385,7 +375,6 @@ class ProcessGroupNCCL : public ProcessGroup { at::IValue value_; c10::DeviceIndex deviceIndex_; std::shared_ptr> cudaEvents_; - std::shared_ptr futureNCCLCallbackStream_; DataPtrExtractor dataPtrExtractor_; std::mutex dataPtrExtractorMutex_; c10::optional error_; @@ -743,16 +732,6 @@ class ProcessGroupNCCL : public ProcessGroup { // set contains the string representation of ncclUniqueId. 
std::unordered_set abortedComms_; - // In single-process single-device mode, WorkNCCL::getFuture is supported. - // Depending on the device index of collective outputs, WorkNCCL will pass - // the corresponding device's then callback stream to FutureNCCL. - // We just inititalize futureNCCLCallbackStreams_ inside the constructor and - // set its size to the total number of available devices and depending on the - // device of the NCCL collective's outputs, we later set the callback stream - // of the corresponding device inside ProcessGroupNCCL::getNCCLComm if not set - // before. - std::vector> futureNCCLCallbackStreams_; - // Schedule NCCL operations on high priority CUDA streams. bool isHighPriorityStream_ = false; From b91b0872a18cdf4e898e4a330d2a17ab9836c1b4 Mon Sep 17 00:00:00 2001 From: Luca Wehrstedt Date: Thu, 10 Dec 2020 03:45:30 -0800 Subject: [PATCH 108/250] Record CUDA events for "follow-up" FutureNCCL inside markCompleted (#48499) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48499 This commit is part of a stack that reworks FutureNCCL in order to extract a generic CUDA-aware Future subclass. The stack deliberately breaks up this transition into elementary changes, to make it easier to verify that the behavior is preserved (or to highlight how it gets changed). --- We can merge and "hide" a whole bunch of CUDA-related logic if we store and record the CUDA events that correspond to the completion of a FutureNCCL when we call markCompleted (rather than splitting it between the constructor, the `then` method, and a wrapper around the callback). A more concrete reason for this change is that soon I'll add support for multi-device, and in that case we can't necessarily know in advance which devices a value will be on until we get that value (and we don't want to record an event on all devices as then we might "over-synchronize"). To me, this also makes more conceptual sense: the moment when we store a value on the future, which is the "signal" that the future is now ready, should also be time at which we record the events needed to synchronize with that value. Though this may just be personal preference. ghstack-source-id: 118180034 Test Plan: Unit tests Reviewed By: mrshenli Differential Revision: D25177557 fbshipit-source-id: 53d4bcdfb89fa0d11bb7b1b94db5d652edeb3b7b --- torch/lib/c10d/ProcessGroupNCCL.hpp | 34 ++++++++++------------------- 1 file changed, 11 insertions(+), 23 deletions(-) diff --git a/torch/lib/c10d/ProcessGroupNCCL.hpp b/torch/lib/c10d/ProcessGroupNCCL.hpp index c1b67cfe2af1..383fc21fa0d8 100644 --- a/torch/lib/c10d/ProcessGroupNCCL.hpp +++ b/torch/lib/c10d/ProcessGroupNCCL.hpp @@ -231,19 +231,9 @@ class ProcessGroupNCCL : public ProcessGroup { } private: - // This constructor is used by then callback, it skips setting the value at - // the beginning. Later, the value will be set using markCompleted with the - // return value of callback. - explicit FutureNCCL( - c10::DeviceIndex deviceIndex, - std::shared_ptr> cudaEvents) + explicit FutureNCCL(c10::DeviceIndex deviceIndex) : at::ivalue::Future(c10::ListType::create(c10::TensorType::get())), - deviceIndex_(deviceIndex), - cudaEvents_(std::move(cudaEvents)) { - TORCH_INTERNAL_ASSERT( - cudaEvents_->size() == 1, - "FutureNCCL only supports single-process single-device mode."); - } + deviceIndex_(deviceIndex) {} // We need this because it will be the ::make() static method that actually // creates the instance. 
This is a brittle approach and the passkey idiom // would be a more robust solution. However, this will go away in #48505. @@ -275,6 +265,14 @@ class ProcessGroupNCCL : public ProcessGroup { TORCH_INTERNAL_ASSERT(data_ptr.device().index() == deviceIndex_); } value_ = std::move(value); + + TORCH_INTERNAL_ASSERT(cudaEvents_ == nullptr); + // Create a new cudaEvents object of size 1 that will record the current + // stream after callback and will be passed to the new FutureNCCL. + cudaEvents_ = std::make_shared>(1); + // In case of chained then callback calls, cudaEvents + // records callback's stream. + (*cudaEvents_)[0].record(at::cuda::getCurrentCUDAStream(deviceIndex_)); } // Just returns FutureNCCL's value after wait returns. @@ -317,13 +315,7 @@ class ProcessGroupNCCL : public ProcessGroup { c10::intrusive_ptr then( std::function callback, at::TypePtr /* unused */) override { - // Create a new cudaEvents object of size 1 that will record the current - // stream after callback and will be passed to the new FutureNCCL. - auto thenFutCudaEvents = - std::make_shared>(1); - // Create a FutureNCCL without setting a value. - auto fut = c10::make_intrusive( - deviceIndex_, thenFutCudaEvents); + auto fut = c10::make_intrusive(deviceIndex_); // The new future needs the DataPtr extractor when it gets marked complete // but this might happen immediately inline or in parallel by another // thread. In both these cases this would/might happen before the user has @@ -339,10 +331,6 @@ class ProcessGroupNCCL : public ProcessGroup { [&](std::function cb) { try { fut->markCompleted(at::IValue(cb())); - // In case of chained then callback calls, thenFutCudaEvents - // records callback's stream. - (*thenFutCudaEvents)[0].record( - at::cuda::getCurrentCUDAStream(deviceIndex_)); } catch (const std::exception& e) { fut->setError(std::current_exception()); } From 003c30ba824861603b2ea99395771af995c51eef Mon Sep 17 00:00:00 2001 From: Luca Wehrstedt Date: Thu, 10 Dec 2020 03:45:30 -0800 Subject: [PATCH 109/250] Fix FutureNCCL's completed() disagreeing with wait() (#48503) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48503 This commit is part of a stack that reworks FutureNCCL in order to extract a generic CUDA-aware Future subclass. The stack deliberately breaks up this transition into elementary changes, to make it easier to verify that the behavior is preserved (or to highlight how it gets changed). --- My impression is that one property of the upstream Future class is that once .wait() returns, or once a callback is invoked, then .completed() should return True. This was not the case for FutureNCCL because .wait() would return immediately, and callbacks would be invoked inline, but .completed() could return False if the CUDA async operations hadn't completed yet. That was odd and confusing. Since there are other ways for users to check the status of CUDA operations (if they really need, and typically I don't think it's so common), perhaps it's best to avoid checking the status of CUDA events in .completed(). 
ghstack-source-id: 118180028 Test Plan: Unit tests Reviewed By: mrshenli Differential Revision: D25180531 fbshipit-source-id: e1207f6b91f010f278923cc5fec1190d0fcdab30 --- torch/lib/c10d/ProcessGroupNCCL.hpp | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/torch/lib/c10d/ProcessGroupNCCL.hpp b/torch/lib/c10d/ProcessGroupNCCL.hpp index 383fc21fa0d8..8f19cb280d82 100644 --- a/torch/lib/c10d/ProcessGroupNCCL.hpp +++ b/torch/lib/c10d/ProcessGroupNCCL.hpp @@ -339,15 +339,8 @@ class ProcessGroupNCCL : public ProcessGroup { return fut; } - // Checks cudaEventQuery with cudaEvents. Returns true if a FutureError was - // recorded or the entire operation is completed on the GPU. bool completed() const override { - if (error_) { - return true; - } - // Checking the work's corresponding CUDA events' status - auto ret = cudaEventQuery((*cudaEvents_)[0]); - return ret != cudaErrorNotReady || ret == cudaSuccess; + return true; } bool hasValue() const override { From 91ad3ed8312ec84ed6487725b75f190deb3fdf7e Mon Sep 17 00:00:00 2001 From: Luca Wehrstedt Date: Thu, 10 Dec 2020 03:45:30 -0800 Subject: [PATCH 110/250] Fix FutureNCCL not recording dataptrs with caching alloc in wait() (#48563) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48563 This commit is part of a stack that reworks FutureNCCL in order to extract a generic CUDA-aware Future subclass. The stack deliberately breaks up this transition into elementary changes, to make it easier to verify that the behavior is preserved (or to highlight how it gets changed). --- The CUDA caching allocator requires us to register all streams in which a DataPtr is used. We already do so when we invoke a callback, for which we obtain streams from the ATen pool. However, we didn't do so when the user waits for the Future and then uses the results in their current streams. This was probably fine in most cases, because the outputs of the NCCL ops (which is the tensors we're dealing with here) were user-provided, and thus already registered in some user streams, but in principle the user could use different streams when waiting than the ones they used to create the tensors. (If they use the same streams, registering becomes a no-op). But, more importantly, this change will help us turn FutureNCCL into a more general-purpose class as for example in RPC the tensors of the result are allocated by PyTorch itself and thus we need to record their usage on the user's streams with the caching allocator. 
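
As a rough sketch of the usage pattern this protects (names such as `fut` are placeholders, not code from this patch): a caller may wait on the future from a non-default stream and consume the outputs there, so the caching allocator must be told about that stream.

    at::cuda::CUDAStream side = at::cuda::getStreamFromPool();
    c10::StreamGuard guard{side};              // make `side` the current stream
    fut->wait();                               // blocks `side` on the NCCL events and, with this
                                               // change, records the outputs' storage on `side`
    auto out = fut->value().toTensorVector();
    auto doubled = out[0] * 2;                 // safe: the allocator will not reuse that storage
                                               // until the work queued on `side` has finished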
ghstack-source-id: 118180033

Test Plan: Unit tests

Reviewed By: mrshenli

Differential Revision: D25210338

fbshipit-source-id: e0a4ba157653b74dd84cf5665c992ccce2dea188
---
 torch/lib/c10d/ProcessGroupNCCL.hpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/torch/lib/c10d/ProcessGroupNCCL.hpp b/torch/lib/c10d/ProcessGroupNCCL.hpp
index 8f19cb280d82..9b1145854374 100644
--- a/torch/lib/c10d/ProcessGroupNCCL.hpp
+++ b/torch/lib/c10d/ProcessGroupNCCL.hpp
@@ -250,6 +250,10 @@ class ProcessGroupNCCL : public ProcessGroup {
       }
       auto stream = at::cuda::getCurrentCUDAStream(deviceIndex_);
       (*cudaEvents_)[0].block(stream);
+
+      for (const at::DataPtr& data_ptr : extractDataPtrs(value_)) {
+        c10::cuda::CUDACachingAllocator::recordStream(data_ptr, stream);
+      }
     }

     // If FutureNCCL was created by FutureNCCL::then, its value would be empty

From e294c2d841f77719b1e55e810d4c5ae0a64f272e Mon Sep 17 00:00:00 2001
From: Luca Wehrstedt
Date: Thu, 10 Dec 2020 03:45:30 -0800
Subject: [PATCH 111/250] Add multi-GPU support to FutureNCCL (#48500)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/48500

This commit is part of a stack that reworks FutureNCCL in order to extract a generic CUDA-aware Future subclass. The stack deliberately breaks up this transition into elementary changes, to make it easier to verify that the behavior is preserved (or to highlight how it gets changed).

---

After the previous changes, this is now much simpler than it sounds. For the most part it just consists of repeating some operations multiple times, once per device (e.g., recording and blocking on events). Funnily, we already had a vector of events, even though we only ever stored one element in it (this probably comes from the fact that this is shared with WorkNCCL, which can hold more than one event). Here, we now also store a vector of device indices.

Perhaps the only non-trivial part of this is that now, for "follow-up" Futures (for callbacks), we can't know in advance which device the result will be on, so we must determine it dynamically when we receive the result, by inspecting it. That's also easier than it sounds because we already have a dataptr extractor.

ghstack-source-id: 118180022

Test Plan: Unit tests (I should probably add new ones)

Reviewed By: mrshenli

Differential Revision: D25177556

fbshipit-source-id: 41ef39ec0dc458e341aa1564f2b9f2b573d7fa9f
---
 torch/lib/c10d/ProcessGroupNCCL.cpp |  14 ++--
 torch/lib/c10d/ProcessGroupNCCL.hpp | 116 ++++++++++++++++++++--------
 2 files changed, 89 insertions(+), 41 deletions(-)

diff --git a/torch/lib/c10d/ProcessGroupNCCL.cpp b/torch/lib/c10d/ProcessGroupNCCL.cpp
index 18948c80d242..2d95834fc447 100644
--- a/torch/lib/c10d/ProcessGroupNCCL.cpp
+++ b/torch/lib/c10d/ProcessGroupNCCL.cpp
@@ -1008,15 +1008,15 @@ std::vector ProcessGroupNCCL::WorkNCCL::result() {

 c10::intrusive_ptr ProcessGroupNCCL::WorkNCCL::
     getFuture() {
-  TORCH_INTERNAL_ASSERT(
-      outputs_->size() == 1,
-      "WorkNCCL's getFuture API is only supported for single-process single-device mode.");
-  auto deviceIndex = (*outputs_)[0].device().index();
-  // Create a new FutureNCCL object after checking for single-process
-  // single-device mode.
+ std::vector deviceIndices; + for (const c10::Device& device : devices_) { + TORCH_INTERNAL_ASSERT(device.is_cuda()); + deviceIndices.push_back(device.index()); + } + return c10::make_intrusive( at::IValue(*outputs_), - deviceIndex, + std::move(deviceIndices), cudaEvents_); } diff --git a/torch/lib/c10d/ProcessGroupNCCL.hpp b/torch/lib/c10d/ProcessGroupNCCL.hpp index 9b1145854374..02d262c068a6 100644 --- a/torch/lib/c10d/ProcessGroupNCCL.hpp +++ b/torch/lib/c10d/ProcessGroupNCCL.hpp @@ -13,6 +13,7 @@ #include #include +#include #include #include #include @@ -196,10 +197,8 @@ class ProcessGroupNCCL : public ProcessGroup { // or NCCL's barrier(). // // If created by WorkNCCL's getFuture API, FutureNCCL has a reference to - // WorkNCCL's cudaEvents, NCCL collective's outputs, and the device index of - // outputs' device. Its value is NCCL collective's - // outputs. FutureNCCL only supports single-process single-device mode where - // the size of outputs is equal to 1. + // WorkNCCL's cudaEvents, NCCL collective's outputs, and the device indices of + // outputs' devices. Its value is NCCL collective's outputs. // // If created by FutureNCCL's then callback, its value becomes the value of // callback() and its cudaEvents will record the NCCL stream that runs that @@ -212,28 +211,46 @@ class ProcessGroupNCCL : public ProcessGroup { public: explicit FutureNCCL( at::IValue value, - c10::DeviceIndex deviceIndex, + std::vector deviceIndices, std::shared_ptr> cudaEvents) : at::ivalue::Future(c10::ListType::create(c10::TensorType::get())), value_(std::move(value)), - deviceIndex_(deviceIndex), + deviceIndices_(std::move(deviceIndices)), cudaEvents_(std::move(cudaEvents)) { + // Check that the device indices are distinct + std::unordered_set uniqueDeviceIndices; + for (const auto& deviceIndex : deviceIndices_) { + uniqueDeviceIndices.insert(deviceIndex); + } + TORCH_INTERNAL_ASSERT( + deviceIndices_.size() == uniqueDeviceIndices.size(), + "Got ", deviceIndices_.size(), " devices, but only ", + uniqueDeviceIndices.size(), " distinct ones"); TORCH_INTERNAL_ASSERT( - cudaEvents_->size() == 1, - "FutureNCCL only supports single-process single-device mode."); + cudaEvents_->size() == deviceIndices_.size(), + "The device indices and the events must be paired up. Got ", + deviceIndices_.size(), " devices and ", cudaEvents_->size(), + " events."); for (const at::cuda::CUDAEvent& event : *cudaEvents_) { TORCH_INTERNAL_ASSERT(event.isCreated()); - TORCH_INTERNAL_ASSERT(event.device_index() == deviceIndex_); + TORCH_INTERNAL_ASSERT( + std::find( + deviceIndices_.begin(), + deviceIndices_.end(), + event.device_index()) != deviceIndices_.end()); } for (const at::DataPtr& data_ptr : extractDataPtrs(value_)) { - TORCH_INTERNAL_ASSERT(data_ptr.device().index() == deviceIndex_); + TORCH_INTERNAL_ASSERT( + std::find( + deviceIndices_.begin(), + deviceIndices_.end(), + data_ptr.device().index()) != deviceIndices_.end()); } } private: - explicit FutureNCCL(c10::DeviceIndex deviceIndex) - : at::ivalue::Future(c10::ListType::create(c10::TensorType::get())), - deviceIndex_(deviceIndex) {} + FutureNCCL() + : at::ivalue::Future(c10::ListType::create(c10::TensorType::get())) {} // We need this because it will be the ::make() static method that actually // creates the instance. This is a brittle approach and the passkey idiom // would be a more robust solution. However, this will go away in #48505. 
@@ -248,11 +265,17 @@ class ProcessGroupNCCL : public ProcessGroup { if (error_) { throw *error_; } - auto stream = at::cuda::getCurrentCUDAStream(deviceIndex_); - (*cudaEvents_)[0].block(stream); + + for (int i = 0; i < deviceIndices_.size(); i++) { + (*cudaEvents_)[i].block( + at::cuda::getCurrentCUDAStream(deviceIndices_[i])); + } for (const at::DataPtr& data_ptr : extractDataPtrs(value_)) { - c10::cuda::CUDACachingAllocator::recordStream(data_ptr, stream); + if (data_ptr.device().is_cuda()) { + c10::cuda::CUDACachingAllocator::recordStream( + data_ptr, at::cuda::getCurrentCUDAStream(data_ptr.device().index())); + } } } @@ -265,18 +288,25 @@ class ProcessGroupNCCL : public ProcessGroup { "Attempting to set value of a FutureNCCL which has a value." "FutureNCCL's value was internally set to NCCL collective's " "outputs or the return value of the callback."); - for (const at::DataPtr& data_ptr : extractDataPtrs(value)) { - TORCH_INTERNAL_ASSERT(data_ptr.device().index() == deviceIndex_); - } value_ = std::move(value); TORCH_INTERNAL_ASSERT(cudaEvents_ == nullptr); - // Create a new cudaEvents object of size 1 that will record the current - // stream after callback and will be passed to the new FutureNCCL. - cudaEvents_ = std::make_shared>(1); - // In case of chained then callback calls, cudaEvents - // records callback's stream. - (*cudaEvents_)[0].record(at::cuda::getCurrentCUDAStream(deviceIndex_)); + std::vector isCudaDeviceUsed(c10::cuda::device_count(), false); + for (const at::DataPtr& data_ptr : extractDataPtrs(value_)) { + if (data_ptr.device().is_cuda()) { + isCudaDeviceUsed[data_ptr.device().index()] = true; + } + } + + cudaEvents_ = std::make_shared>(); + for (c10::DeviceIndex idx = 0; idx < isCudaDeviceUsed.size(); idx++) { + if (isCudaDeviceUsed[idx]) { + at::cuda::CUDAEvent cudaEvent; + cudaEvent.record(at::cuda::getCurrentCUDAStream(idx)); + deviceIndices_.push_back(idx); + (*cudaEvents_).push_back(std::move(cudaEvent)); + } + } } // Just returns FutureNCCL's value after wait returns. @@ -297,19 +327,37 @@ class ProcessGroupNCCL : public ProcessGroup { // this callback. This new FutureNCCL's cudaEvents will record the // callback's stream and will have the result value of the callback. void addCallback(std::function callback) override { - // FIXME Should we find a way to allow to change the priority of streams? - at::cuda::CUDAStream stream = - at::cuda::getStreamFromPool(/*isHighPriority=*/false, deviceIndex_); + // We'd love to get a stream for all devices, even those that are not used + // by the value, because the callback could use those other devices, but + // unfortunately this could cause a deadlock with NCCL. See + // https://github.com/pytorch/pytorch/pull/48500#issuecomment-735395414 + // In general, if some devices haven't been used yet, by getting a stream + // for them we'd initialize them, and in addition to causing NCCL to + // misbehaving this also ends up using memory on those devices, which the + // user might not want. + std::vector streams; + for (int i = 0; i < deviceIndices_.size(); i++) { + c10::DeviceIndex idx = deviceIndices_[i]; + // FIXME Should we find a way to allow to change the priority of + // streams? + at::cuda::CUDAStream stream = + at::cuda::getStreamFromPool(/*isHighPriority=*/false, idx); + (*cudaEvents_)[i].block(stream); + streams.push_back(stream); + } + + // Use the dedicated callback stream to run callback. 
+ at::cuda::CUDAMultiStreamGuard streamGuard(streams); // Do not free the underlying data storage of value_ before its // usage on the stream finishes. for (const at::DataPtr& data_ptr : extractDataPtrs(value_)) { - c10::cuda::CUDACachingAllocator::recordStream(data_ptr, stream); + if (data_ptr.device().is_cuda()) { + c10::cuda::CUDACachingAllocator::recordStream( + data_ptr, at::cuda::getCurrentCUDAStream(data_ptr.device().index())); + } } - (*cudaEvents_)[0].block(stream); - // Use the dedicated callback stream to run callback. - c10::StreamGuard streamGuard{stream}; callback(); } @@ -319,7 +367,7 @@ class ProcessGroupNCCL : public ProcessGroup { c10::intrusive_ptr then( std::function callback, at::TypePtr /* unused */) override { - auto fut = c10::make_intrusive(deviceIndex_); + auto fut = c10::make_intrusive(); // The new future needs the DataPtr extractor when it gets marked complete // but this might happen immediately inline or in parallel by another // thread. In both these cases this would/might happen before the user has @@ -358,7 +406,7 @@ class ProcessGroupNCCL : public ProcessGroup { private: at::IValue value_; - c10::DeviceIndex deviceIndex_; + std::vector deviceIndices_; std::shared_ptr> cudaEvents_; DataPtrExtractor dataPtrExtractor_; std::mutex dataPtrExtractorMutex_; From 9fe3ac3650c51939492cef998cbd787f165b8deb Mon Sep 17 00:00:00 2001 From: Luca Wehrstedt Date: Thu, 10 Dec 2020 03:45:30 -0800 Subject: [PATCH 112/250] Don't store device indices separately on FutureNCCL (#48501) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48501 This commit is part of a stack that reworks FutureNCCL in order to extract a generic CUDA-aware Future subclass. The stack deliberately breaks up this transition into elementary changes, to make it easier to verify that the behavior is preserved (or to highlight how it gets changed). --- FutureNCCL stores a set of devices (on which the tensors in the data reside) and a CUDA event for each of those devices. In fact, each event instance also already contains the device it belongs to, which means we can avoid storing that information separately (with the risk that it'll be mismatched and/or inaccurate). 
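
Put differently, the device set can always be recomputed from the events themselves; a sketch of the idea (an illustrative helper, not something added by this patch):

    std::vector<c10::DeviceIndex> devicesOfEvents(
        const std::vector<at::cuda::CUDAEvent>& events) {
      std::vector<c10::DeviceIndex> devices;
      devices.reserve(events.size());
      for (const at::cuda::CUDAEvent& ev : events) {
        devices.push_back(ev.device_index());  // each event already knows where it was recorded
      }
      return devices;
    }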
ghstack-source-id: 118180024 Test Plan: Unit tests Reviewed By: mrshenli Differential Revision: D25177554 fbshipit-source-id: 64667c176efc2a7dafe99457a1fbba5d142cb06c --- torch/lib/c10d/ProcessGroupNCCL.cpp | 11 +------ torch/lib/c10d/ProcessGroupNCCL.hpp | 48 ++++++++++------------------- 2 files changed, 18 insertions(+), 41 deletions(-) diff --git a/torch/lib/c10d/ProcessGroupNCCL.cpp b/torch/lib/c10d/ProcessGroupNCCL.cpp index 2d95834fc447..d9e33ddc33c9 100644 --- a/torch/lib/c10d/ProcessGroupNCCL.cpp +++ b/torch/lib/c10d/ProcessGroupNCCL.cpp @@ -1008,16 +1008,7 @@ std::vector ProcessGroupNCCL::WorkNCCL::result() { c10::intrusive_ptr ProcessGroupNCCL::WorkNCCL:: getFuture() { - std::vector deviceIndices; - for (const c10::Device& device : devices_) { - TORCH_INTERNAL_ASSERT(device.is_cuda()); - deviceIndices.push_back(device.index()); - } - - return c10::make_intrusive( - at::IValue(*outputs_), - std::move(deviceIndices), - cudaEvents_); + return c10::make_intrusive(at::IValue(*outputs_), cudaEvents_); } void ProcessGroupNCCL::workEnqueue( diff --git a/torch/lib/c10d/ProcessGroupNCCL.hpp b/torch/lib/c10d/ProcessGroupNCCL.hpp index 02d262c068a6..bacb231739b3 100644 --- a/torch/lib/c10d/ProcessGroupNCCL.hpp +++ b/torch/lib/c10d/ProcessGroupNCCL.hpp @@ -211,40 +211,28 @@ class ProcessGroupNCCL : public ProcessGroup { public: explicit FutureNCCL( at::IValue value, - std::vector deviceIndices, std::shared_ptr> cudaEvents) : at::ivalue::Future(c10::ListType::create(c10::TensorType::get())), value_(std::move(value)), - deviceIndices_(std::move(deviceIndices)), cudaEvents_(std::move(cudaEvents)) { // Check that the device indices are distinct std::unordered_set uniqueDeviceIndices; - for (const auto& deviceIndex : deviceIndices_) { - uniqueDeviceIndices.insert(deviceIndex); - } - TORCH_INTERNAL_ASSERT( - deviceIndices_.size() == uniqueDeviceIndices.size(), - "Got ", deviceIndices_.size(), " devices, but only ", - uniqueDeviceIndices.size(), " distinct ones"); - TORCH_INTERNAL_ASSERT( - cudaEvents_->size() == deviceIndices_.size(), - "The device indices and the events must be paired up. 
Got ", - deviceIndices_.size(), " devices and ", cudaEvents_->size(), - " events."); for (const at::cuda::CUDAEvent& event : *cudaEvents_) { TORCH_INTERNAL_ASSERT(event.isCreated()); - TORCH_INTERNAL_ASSERT( - std::find( - deviceIndices_.begin(), - deviceIndices_.end(), - event.device_index()) != deviceIndices_.end()); + uniqueDeviceIndices.insert(event.device_index()); } + TORCH_INTERNAL_ASSERT( + cudaEvents_->size() == uniqueDeviceIndices.size(), + "Got ", cudaEvents_->size(), " events, but only ", + uniqueDeviceIndices.size(), " distinct devices"); for (const at::DataPtr& data_ptr : extractDataPtrs(value_)) { TORCH_INTERNAL_ASSERT( - std::find( - deviceIndices_.begin(), - deviceIndices_.end(), - data_ptr.device().index()) != deviceIndices_.end()); + std::find_if( + cudaEvents_->begin(), + cudaEvents_->end(), + [&](const at::cuda::CUDAEvent& ev) { + return ev.device_index() == data_ptr.device().index(); + }) != cudaEvents_->end()); } } @@ -266,9 +254,9 @@ class ProcessGroupNCCL : public ProcessGroup { throw *error_; } - for (int i = 0; i < deviceIndices_.size(); i++) { - (*cudaEvents_)[i].block( - at::cuda::getCurrentCUDAStream(deviceIndices_[i])); + for (at::cuda::CUDAEvent& cudaEvent : *cudaEvents_) { + cudaEvent.block( + at::cuda::getCurrentCUDAStream(cudaEvent.device_index())); } for (const at::DataPtr& data_ptr : extractDataPtrs(value_)) { @@ -303,7 +291,6 @@ class ProcessGroupNCCL : public ProcessGroup { if (isCudaDeviceUsed[idx]) { at::cuda::CUDAEvent cudaEvent; cudaEvent.record(at::cuda::getCurrentCUDAStream(idx)); - deviceIndices_.push_back(idx); (*cudaEvents_).push_back(std::move(cudaEvent)); } } @@ -336,13 +323,13 @@ class ProcessGroupNCCL : public ProcessGroup { // misbehaving this also ends up using memory on those devices, which the // user might not want. std::vector streams; - for (int i = 0; i < deviceIndices_.size(); i++) { - c10::DeviceIndex idx = deviceIndices_[i]; + for (at::cuda::CUDAEvent& cudaEvent : *cudaEvents_) { + c10::DeviceIndex idx = cudaEvent.device_index(); // FIXME Should we find a way to allow to change the priority of // streams? at::cuda::CUDAStream stream = at::cuda::getStreamFromPool(/*isHighPriority=*/false, idx); - (*cudaEvents_)[i].block(stream); + cudaEvent.block(stream); streams.push_back(stream); } @@ -406,7 +393,6 @@ class ProcessGroupNCCL : public ProcessGroup { private: at::IValue value_; - std::vector deviceIndices_; std::shared_ptr> cudaEvents_; DataPtrExtractor dataPtrExtractor_; std::mutex dataPtrExtractorMutex_; From a6778989d15baf6ae3aa1ce661ee18e5114bd697 Mon Sep 17 00:00:00 2001 From: Luca Wehrstedt Date: Thu, 10 Dec 2020 03:45:30 -0800 Subject: [PATCH 113/250] Support wider range of types in FutureNCCL (#48502) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48502 This commit is part of a stack that reworks FutureNCCL in order to extract a generic CUDA-aware Future subclass. The stack deliberately breaks up this transition into elementary changes, to make it easier to verify that the behavior is preserved (or to highlight how it gets changed). --- FutureNCCL restricted the values to be tensors, or (singleton) lists of tensors, or Python object that could be converted to either of those types. We need a CUDA future that can handle more generic types though. The main challenge is extracting all DataPtrs from an arbitrary object. I think I found some ways of doing so, but I'd like some JIT experts to look into this and tell me if there are better ways. 
I'll add inline comments for where their input would be appreciated. ghstack-source-id: 118180026 Test Plan: Unit tests (I should probably add new ones) Reviewed By: wanchaol Differential Revision: D25177562 fbshipit-source-id: 1ef18e67bf44543c70abb4ca152f1610dea4e533 --- aten/src/ATen/core/ivalue_inl.h | 24 +++++++++++------------- torch/csrc/jit/python/pybind_utils.h | 11 ++++++----- torch/lib/c10d/ProcessGroupNCCL.hpp | 8 +++----- 3 files changed, 20 insertions(+), 23 deletions(-) diff --git a/aten/src/ATen/core/ivalue_inl.h b/aten/src/ATen/core/ivalue_inl.h index 32ff811f05bd..d21ea89e4881 100644 --- a/aten/src/ATen/core/ivalue_inl.h +++ b/aten/src/ATen/core/ivalue_inl.h @@ -430,20 +430,18 @@ struct C10_EXPORT ivalue::Future : c10::intrusive_ptr_target { // Expose the default implementation so that external ones can defer to it. static std::vector> defaultDataPtrExtractor(const at::IValue& value) { - // FIXME Should we support more types than just tensors and tensor lists? - TORCH_INTERNAL_ASSERT( - value.isTensorList() || value.isTensor(), - "the future value must be either a tensor list or a tensor."); - at::Tensor tensor; - if (value.isTensorList()) { - const auto tensors = value.toTensorVector(); - TORCH_INTERNAL_ASSERT(tensors.size() == 1, "expected exactly 1 tensor"); - tensor = tensors[0]; - } else { - tensor = value.toTensor(); + at::IValue::HashAliasedIValues sub_values; + // Prefer getSubValues() over visit() as the latter is a silent no-op for + // some unsupported types, whereas the former at least fails loudly. + value.getSubValues(sub_values); + + std::vector> res; + for (const at::IValue& sub_value : sub_values) { + if (sub_value.isTensor()) { + res.emplace_back(sub_value.toTensor().storage().data_ptr()); + } } - - return {tensor.storage().data_ptr()}; + return res; }; // Tries to retrieve the error message from std::exception_ptr. diff --git a/torch/csrc/jit/python/pybind_utils.h b/torch/csrc/jit/python/pybind_utils.h index ab77726c11af..dc3b3b13adef 100644 --- a/torch/csrc/jit/python/pybind_utils.h +++ b/torch/csrc/jit/python/pybind_utils.h @@ -61,6 +61,8 @@ inline IValue toIValue( py::object toPyObject(IValue ivalue); +IValue toTypeInferredIValue(py::handle input); + // The PythonFutureWrapper for ivalue::Future // // NB: VISIBILITY_HIDDEN is for silencing compiling error, @@ -220,16 +222,15 @@ struct VISIBILITY_HIDDEN PythonFutureWrapper // This callback is only used by subclasses of Future that deal with CUDA, // in order to register the pointers on the right streams with the caching // allocator. - // By default, assume that the input value is or can be casted into a tensor - // vector that has exactly one tensor. static std::vector> dataPtrExtractor( const at::IValue& value) { if (value.isPyObject()) { pybind11::gil_scoped_acquire gil; py::object obj = torch::jit::toPyObject(value); - // FIXME Should we support more types than just tensor lists? - auto new_value = torch::jit::toIValue( - obj, c10::ListType::create(c10::TensorType::get())); + // FIXME This could fail. As a fallback we could try to pickle the + // object, since the pickler might support broader types and it is able + // to extract the tensors from the object as a vector. 
+ auto new_value = torch::jit::toTypeInferredIValue(obj); return at::ivalue::Future::defaultDataPtrExtractor(new_value); } return at::ivalue::Future::defaultDataPtrExtractor(value); diff --git a/torch/lib/c10d/ProcessGroupNCCL.hpp b/torch/lib/c10d/ProcessGroupNCCL.hpp index bacb231739b3..7e2d2275da8c 100644 --- a/torch/lib/c10d/ProcessGroupNCCL.hpp +++ b/torch/lib/c10d/ProcessGroupNCCL.hpp @@ -237,8 +237,7 @@ class ProcessGroupNCCL : public ProcessGroup { } private: - FutureNCCL() - : at::ivalue::Future(c10::ListType::create(c10::TensorType::get())) {} + FutureNCCL(at::TypePtr type) : at::ivalue::Future(std::move(type)) {} // We need this because it will be the ::make() static method that actually // creates the instance. This is a brittle approach and the passkey idiom // would be a more robust solution. However, this will go away in #48505. @@ -353,8 +352,8 @@ class ProcessGroupNCCL : public ProcessGroup { // stream that runs this callback. c10::intrusive_ptr then( std::function callback, - at::TypePtr /* unused */) override { - auto fut = c10::make_intrusive(); + at::TypePtr type) override { + auto fut = c10::make_intrusive(std::move(type)); // The new future needs the DataPtr extractor when it gets marked complete // but this might happen immediately inline or in parallel by another // thread. In both these cases this would/might happen before the user has @@ -411,7 +410,6 @@ class ProcessGroupNCCL : public ProcessGroup { // If a C++ communication hook is used, use the default extractor. data_ptrs = at::ivalue::Future::defaultDataPtrExtractor(value); } - TORCH_INTERNAL_ASSERT(data_ptrs.size() == 1, "expected exactly 1 tensor"); return data_ptrs; } }; From 9078088edbc20e7edabe72b87339ca67e3cbec75 Mon Sep 17 00:00:00 2001 From: Luca Wehrstedt Date: Thu, 10 Dec 2020 03:45:30 -0800 Subject: [PATCH 114/250] Split FutureNCCL's CUDA-specific parts from generic future logic (#48504) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48504 This commit is part of a stack that reworks FutureNCCL in order to extract a generic CUDA-aware Future subclass. The stack deliberately breaks up this transition into elementary changes, to make it easier to verify that the behavior is preserved (or to highlight how it gets changed). --- FutureNCCL isn't just adding CUDA support to ivalue::Future, it's also reimplementing a lot of the latter's logic (by overriding plenty of its methods). That's brittle, as whenever a new method is added to ivalue::Future there's a risk of forgetting to add it to FutureNCCL, and in such a case calling this method on FutureNCCL would defer to the base class and give inconsistent results (e.g., future not being completed when it actually is). This _is already happening_, for example with the waitAndThrow or hasError, which are not implemented by FutureNCCL. In addition, this creates duplication between the two classes, which could lead to inconsistencies of behavior, bugs, missing features, ... The best solution would be to keep the core future logic in ivalue::Future, and have _only_ the CUDA additions in FutureNCCL. That's what we're going to do, in two steps. In this commit, I'll split the CUDA features into separate hooks, which are called by FutureNCCL's other methods. In the next commit, I'll remove these latter methods, and invoke the hooks directly from ivalue::Future. 
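
Schematically, the split looks like the following simplified sketch (not the real class definitions; the actual hooks are postMarkCompletedHook, wrapCallback and postWaitHook):

    // Generic future logic stays in the base class; only the virtual hooks know about CUDA.
    struct GenericFuture {
      void markCompleted(at::IValue value) {
        value_ = std::move(value);
        postMarkCompletedHook(value_);  // CUDA subclass records events on the current streams
        // ... run callbacks, notify waiters ...
      }
      void wait() {
        // ... block until marked completed ...
        postWaitHook(value_);           // CUDA subclass blocks the caller's streams on those events
      }
     protected:
      virtual void postMarkCompletedHook(const at::IValue& /*value*/) {}
      virtual void postWaitHook(const at::IValue& /*value*/) {}
     private:
      at::IValue value_;
    };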
ghstack-source-id: 118180025 Test Plan: Unit tests Reviewed By: mrshenli Differential Revision: D25180534 fbshipit-source-id: 7b3cd374aee78f6c07104daec793c4d248404c61 --- torch/lib/c10d/ProcessGroupNCCL.hpp | 135 ++++++++++++++++------------ 1 file changed, 76 insertions(+), 59 deletions(-) diff --git a/torch/lib/c10d/ProcessGroupNCCL.hpp b/torch/lib/c10d/ProcessGroupNCCL.hpp index 7e2d2275da8c..9f339047e7e4 100644 --- a/torch/lib/c10d/ProcessGroupNCCL.hpp +++ b/torch/lib/c10d/ProcessGroupNCCL.hpp @@ -253,17 +253,7 @@ class ProcessGroupNCCL : public ProcessGroup { throw *error_; } - for (at::cuda::CUDAEvent& cudaEvent : *cudaEvents_) { - cudaEvent.block( - at::cuda::getCurrentCUDAStream(cudaEvent.device_index())); - } - - for (const at::DataPtr& data_ptr : extractDataPtrs(value_)) { - if (data_ptr.device().is_cuda()) { - c10::cuda::CUDACachingAllocator::recordStream( - data_ptr, at::cuda::getCurrentCUDAStream(data_ptr.device().index())); - } - } + postWaitHook(); } // If FutureNCCL was created by FutureNCCL::then, its value would be empty @@ -277,22 +267,7 @@ class ProcessGroupNCCL : public ProcessGroup { "outputs or the return value of the callback."); value_ = std::move(value); - TORCH_INTERNAL_ASSERT(cudaEvents_ == nullptr); - std::vector isCudaDeviceUsed(c10::cuda::device_count(), false); - for (const at::DataPtr& data_ptr : extractDataPtrs(value_)) { - if (data_ptr.device().is_cuda()) { - isCudaDeviceUsed[data_ptr.device().index()] = true; - } - } - - cudaEvents_ = std::make_shared>(); - for (c10::DeviceIndex idx = 0; idx < isCudaDeviceUsed.size(); idx++) { - if (isCudaDeviceUsed[idx]) { - at::cuda::CUDAEvent cudaEvent; - cudaEvent.record(at::cuda::getCurrentCUDAStream(idx)); - (*cudaEvents_).push_back(std::move(cudaEvent)); - } - } + postMarkCompletedHook(); } // Just returns FutureNCCL's value after wait returns. @@ -313,38 +288,9 @@ class ProcessGroupNCCL : public ProcessGroup { // this callback. This new FutureNCCL's cudaEvents will record the // callback's stream and will have the result value of the callback. void addCallback(std::function callback) override { - // We'd love to get a stream for all devices, even those that are not used - // by the value, because the callback could use those other devices, but - // unfortunately this could cause a deadlock with NCCL. See - // https://github.com/pytorch/pytorch/pull/48500#issuecomment-735395414 - // In general, if some devices haven't been used yet, by getting a stream - // for them we'd initialize them, and in addition to causing NCCL to - // misbehaving this also ends up using memory on those devices, which the - // user might not want. - std::vector streams; - for (at::cuda::CUDAEvent& cudaEvent : *cudaEvents_) { - c10::DeviceIndex idx = cudaEvent.device_index(); - // FIXME Should we find a way to allow to change the priority of - // streams? - at::cuda::CUDAStream stream = - at::cuda::getStreamFromPool(/*isHighPriority=*/false, idx); - cudaEvent.block(stream); - streams.push_back(stream); - } - - // Use the dedicated callback stream to run callback. - at::cuda::CUDAMultiStreamGuard streamGuard(streams); - - // Do not free the underlying data storage of value_ before its - // usage on the stream finishes. 
- for (const at::DataPtr& data_ptr : extractDataPtrs(value_)) { - if (data_ptr.device().is_cuda()) { - c10::cuda::CUDACachingAllocator::recordStream( - data_ptr, at::cuda::getCurrentCUDAStream(data_ptr.device().index())); - } - } - - callback(); + std::function wrappedCallback = + wrapCallback(std::move(callback)); + wrappedCallback(); } // Adds a callback to FutureNCCL, and returns another FutureNCCL to hold @@ -390,6 +336,77 @@ class ProcessGroupNCCL : public ProcessGroup { dataPtrExtractor_ = std::move(dataPtrExtractor); } + protected: + void postMarkCompletedHook() { + TORCH_INTERNAL_ASSERT(cudaEvents_ == nullptr); + std::vector isCudaDeviceUsed(c10::cuda::device_count(), false); + for (const at::DataPtr& data_ptr : extractDataPtrs(value_)) { + if (data_ptr.device().is_cuda()) { + isCudaDeviceUsed[data_ptr.device().index()] = true; + } + } + + cudaEvents_ = std::make_shared>(); + for (c10::DeviceIndex idx = 0; idx < isCudaDeviceUsed.size(); idx++) { + if (isCudaDeviceUsed[idx]) { + at::cuda::CUDAEvent cudaEvent; + cudaEvent.record(at::cuda::getCurrentCUDAStream(idx)); + (*cudaEvents_).push_back(std::move(cudaEvent)); + } + } + } + + std::function wrapCallback(std::function callback) { + return [this, callback{std::move(callback)}]() { + // We'd love to get a stream for all devices, even those that are not used + // by the value, because the callback could use those other devices, but + // unfortunately this could cause a deadlock with NCCL. See + // https://github.com/pytorch/pytorch/pull/48500#issuecomment-735395414 + // In general, if some devices haven't been used yet, by getting a stream + // for them we'd initialize them, and in addition to causing NCCL to + // misbehaving this also ends up using memory on those devices, which the + // user might not want. + std::vector streams; + for (at::cuda::CUDAEvent& cudaEvent : *cudaEvents_) { + c10::DeviceIndex idx = cudaEvent.device_index(); + // FIXME Should we find a way to allow to change the priority of + // streams? + at::cuda::CUDAStream stream = + at::cuda::getStreamFromPool(/*isHighPriority=*/false, idx); + cudaEvent.block(stream); + streams.push_back(stream); + } + + // Use the dedicated callback stream to run callback. + at::cuda::CUDAMultiStreamGuard streamGuard(streams); + + // Do not free the underlying data storage of value_ before its + // usage on the stream finishes. + for (const at::DataPtr& data_ptr : extractDataPtrs(value_)) { + if (data_ptr.device().is_cuda()) { + c10::cuda::CUDACachingAllocator::recordStream( + data_ptr, at::cuda::getCurrentCUDAStream(data_ptr.device().index())); + } + } + + callback(); + }; + } + + void postWaitHook() { + for (at::cuda::CUDAEvent& cudaEvent : *cudaEvents_) { + cudaEvent.block( + at::cuda::getCurrentCUDAStream(cudaEvent.device_index())); + } + + for (const at::DataPtr& data_ptr : extractDataPtrs(value_)) { + if (data_ptr.device().is_cuda()) { + c10::cuda::CUDACachingAllocator::recordStream( + data_ptr, at::cuda::getCurrentCUDAStream(data_ptr.device().index())); + } + } + } + private: at::IValue value_; std::shared_ptr> cudaEvents_; From 4c425e8da0e36355653b831b444c8f3c0518494d Mon Sep 17 00:00:00 2001 From: Luca Wehrstedt Date: Thu, 10 Dec 2020 03:45:30 -0800 Subject: [PATCH 115/250] Merge common parts of FutureNCCL into at::ivalue::Future (#48505) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48505 This commit is part of a stack that reworks FutureNCCL in order to extract a generic CUDA-aware Future subclass. 
The stack deliberately breaks up this transition into elementary changes, to make it easier to verify that the behavior is preserved (or to highlight how it gets changed). --- FutureNCCL isn't just adding CUDA support to ivalue::Future, it's also reimplementing a lot of the latter's logic (by overriding plenty of its methods). That's brittle, as whenever a new method is added to ivalue::Future there's a risk of forgetting to add it to FutureNCCL, and in such a case calling this method on FutureNCCL would defer to the base class and give inconsistent results (e.g., future not being completed when it actually is). This _is already happening_, for example with the waitAndThrow or hasError, which are not implemented by FutureNCCL. In addition, this creates duplication between the two classes, which could lead to inconsistencies of behavior, bugs, missing features, ... The best solution would be to keep the core future logic in ivalue::Future, and have _only_ the CUDA additions in FutureNCCL. That's what we're going to do, in two steps. In the previous commit, I split the CUDA features into separate hooks, which are called by FutureNCCL's other methods. In this commit, I'm removing these latter methods, and invoke the hooks directly from ivalue::Future. ghstack-source-id: 118180032 Test Plan: Unit tests Reviewed By: wanchaol Differential Revision: D25180535 fbshipit-source-id: 19181fe133152044eb677062a9e31e5e4ad3c03c --- aten/src/ATen/core/ivalue_inl.h | 78 +++++++--- .../ddp_comm_hooks/powerSGD_hook.py | 2 +- torch/lib/c10d/ProcessGroupNCCL.hpp | 135 ++++-------------- 3 files changed, 89 insertions(+), 126 deletions(-) diff --git a/aten/src/ATen/core/ivalue_inl.h b/aten/src/ATen/core/ivalue_inl.h index d21ea89e4881..8858d0047abd 100644 --- a/aten/src/ATen/core/ivalue_inl.h +++ b/aten/src/ATen/core/ivalue_inl.h @@ -290,18 +290,22 @@ struct C10_EXPORT ivalue::Future : c10::intrusive_ptr_target { /** * Wait on the future until it completes. */ - virtual void wait() { + void wait() { std::unique_lock lock(mutex_); while (!completed_) { finished_cv_.wait(lock); } + + if (!eptr_) { + postWaitHook(value_); + } } /** * Wait on the future until it completes and throw an * exception if an error exists. */ - virtual void waitAndThrow() { + void waitAndThrow() { std::unique_lock lock(mutex_); while (!completed_) { finished_cv_.wait(lock); @@ -310,12 +314,14 @@ struct C10_EXPORT ivalue::Future : c10::intrusive_ptr_target { if (eptr_) { std::rethrow_exception(eptr_); } + + postWaitHook(value_); } /** * Explicitly mark the future as completed with the output value. */ - virtual void markCompleted(IValue value) { + void markCompleted(IValue value) { std::unique_lock lock(mutex_); TORCH_CHECK( !completed(), @@ -324,6 +330,8 @@ struct C10_EXPORT ivalue::Future : c10::intrusive_ptr_target { completed_ = true; value_ = std::move(value); + postMarkCompletedHook(value_); + std::vector> cbs; cbs.swap(callbacks_); lock.unlock(); @@ -359,7 +367,7 @@ struct C10_EXPORT ivalue::Future : c10::intrusive_ptr_target { } // Get the result of the current future. - virtual IValue value() { + IValue value() { std::unique_lock lock(mutex_); AT_ASSERT(completed()); if (eptr_) { @@ -370,7 +378,7 @@ struct C10_EXPORT ivalue::Future : c10::intrusive_ptr_target { // This accessor should only be used if we know that the future is // completed() with no error. 
- virtual const IValue& constValue() { + const IValue& constValue() { std::unique_lock lock(mutex_); AT_ASSERT(completed()); AT_ASSERT(!eptr_); @@ -383,8 +391,9 @@ struct C10_EXPORT ivalue::Future : c10::intrusive_ptr_target { * If the future has already completed, * this function will execute the callback immediately. */ - virtual void addCallback(std::function callback) { + void addCallback(std::function callback) { std::unique_lock lock(mutex_); + callback = wrapCallback(std::move(callback)); if (completed()) { lock.unlock(); callback(); @@ -398,22 +407,18 @@ struct C10_EXPORT ivalue::Future : c10::intrusive_ptr_target { * value of the callback. This is necessary when the callback provider needs * to know for sure when the callback has finished. */ - virtual c10::intrusive_ptr then( + c10::intrusive_ptr then( std::function callback, TypePtr type) { - auto fut = c10::make_intrusive(type); - // Cannot move capture std::function in lambda, because it cannot deduce - // the template type for std::function. Hence use std::bind to explicitly - // specify types. - addCallback(std::bind( - [fut](std::function cb) { + auto fut = createInstance(std::move(type)); + addCallback( + [fut, cb = std::move(callback)]() { try { fut->markCompleted(cb()); } catch (std::exception&) { fut->setError(std::current_exception()); } - }, - std::move(callback))); + }); return fut; } @@ -452,11 +457,11 @@ struct C10_EXPORT ivalue::Future : c10::intrusive_ptr_target { } // Check if the current future has completed - virtual bool completed() const { + bool completed() const { return completed_; } - virtual bool hasValue() const { + bool hasValue() const { std::unique_lock lock(mutex_); return completed_ && !eptr_; } @@ -479,6 +484,43 @@ struct C10_EXPORT ivalue::Future : c10::intrusive_ptr_target { return type_; } + protected: + // This hook is called by this class's then() method when it prepares the + // instance it returns to the caller. It should be overridden by subclasses so + // that they can produce an instace of their own type. + virtual c10::intrusive_ptr createInstance(at::TypePtr type) { + return c10::make_intrusive(type); + } + + // This hook will be called by this class (the superclass) when the future is + // marked completed _with a value_ (hence not in case of error). This is done + // right away, while the mutex is still held, before any callbacks are run. + // It allows subclasses to further update their state if they so need. For + // example the CUDAFuture subclass uses it to determine what devices the value + // resides on and record an event in those devices' current streams. + virtual void postMarkCompletedHook(const at::IValue& value) {} + + // This hook will be called by the addCallback() and the then() methods before + // storing the callback for later execution (or before running it inline if + // the future is already complete). Note that this method could thus be called + // while the future is _not_ yet complete. By default this method does nothing + // but subclasses can override this method to add functionality. For example + // the CUDAFuture subclass ensures the callback runs with CUDA streams which + // are synchronized with the events recorded in the I/O streams. + virtual std::function wrapCallback( + std::function callback) { + return callback; + } + + // This hook will be called by this class after a user thread has completed + // waiting on a successful future. It will thus not be called if the future + // completes with an error. 
It will also not be called if the user accesses + // the future's value without synchronization. Subclasses can override this + // to add some synchronization to the wait. For example, the CUDAFuture + // subclass ensures the user's current CUDA streams synchronize with the I/O + // events stored by the future. + virtual void postWaitHook(const at::IValue& value) {} + private: void setErrorInternal( std::exception_ptr eptr, @@ -487,6 +529,8 @@ struct C10_EXPORT ivalue::Future : c10::intrusive_ptr_target { completed_ = true; eptr_ = std::move(eptr); + // Do not call postMarkCompletedHook() here as there isn't any value. + std::vector> cbs; cbs.swap(callbacks_); lock.unlock(); diff --git a/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py b/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py index 81b876685a3c..99ba72cc5868 100644 --- a/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py +++ b/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py @@ -191,7 +191,7 @@ def compute_q(fut): return [ dist.all_reduce(q, group=group_to_use, async_op=True) .get_future() - .value()[0] + .wait()[0] ] def decompress(fut): diff --git a/torch/lib/c10d/ProcessGroupNCCL.hpp b/torch/lib/c10d/ProcessGroupNCCL.hpp index 9f339047e7e4..23bc12390541 100644 --- a/torch/lib/c10d/ProcessGroupNCCL.hpp +++ b/torch/lib/c10d/ProcessGroupNCCL.hpp @@ -213,7 +213,6 @@ class ProcessGroupNCCL : public ProcessGroup { at::IValue value, std::shared_ptr> cudaEvents) : at::ivalue::Future(c10::ListType::create(c10::TensorType::get())), - value_(std::move(value)), cudaEvents_(std::move(cudaEvents)) { // Check that the device indices are distinct std::unordered_set uniqueDeviceIndices; @@ -225,7 +224,7 @@ class ProcessGroupNCCL : public ProcessGroup { cudaEvents_->size() == uniqueDeviceIndices.size(), "Got ", cudaEvents_->size(), " events, but only ", uniqueDeviceIndices.size(), " distinct devices"); - for (const at::DataPtr& data_ptr : extractDataPtrs(value_)) { + for (const at::DataPtr& data_ptr : extractDataPtrs(value)) { TORCH_INTERNAL_ASSERT( std::find_if( cudaEvents_->begin(), @@ -234,71 +233,18 @@ class ProcessGroupNCCL : public ProcessGroup { return ev.device_index() == data_ptr.device().index(); }) != cudaEvents_->end()); } + markCompleted(std::move(value)); } - private: - FutureNCCL(at::TypePtr type) : at::ivalue::Future(std::move(type)) {} - // We need this because it will be the ::make() static method that actually - // creates the instance. This is a brittle approach and the passkey idiom - // would be a more robust solution. However, this will go away in #48505. - friend c10::intrusive_ptr; + using at::ivalue::Future::Future; - public: - // Gets the current stream of the device and synchronizes recorded streams - // with that. It will return after synchronizing the correct GPU streams to - // ensure we can have async CUDA execution and it does not wait for the - // entire operation to complete on GPU. - void wait() override { - if (error_) { - throw *error_; - } - - postWaitHook(); - } - - // If FutureNCCL was created by FutureNCCL::then, its value would be empty - // initially. FutureNCCL::then will later use this method to set its value - // to the return value of the callback. - void markCompleted(at::IValue value) override { - TORCH_INTERNAL_ASSERT( - value_.isNone(), - "Attempting to set value of a FutureNCCL which has a value." 
- "FutureNCCL's value was internally set to NCCL collective's " - "outputs or the return value of the callback."); - value_ = std::move(value); - - postMarkCompletedHook(); - } - - // Just returns FutureNCCL's value after wait returns. - at::IValue value() override { - TORCH_INTERNAL_ASSERT(hasValue(), "FutureNCCL's value is None.") - wait(); - return value_; - } - - const at::IValue& constValue() override { - TORCH_INTERNAL_ASSERT(hasValue(), "FutureNCCL's value is None.") - wait(); - return value_; - } - - // Adds a callback to FutureNCCL. It invokes the callback inline after - // synchronizing FutureNCCL's own cudaEvents with the stream that runs - // this callback. This new FutureNCCL's cudaEvents will record the - // callback's stream and will have the result value of the callback. - void addCallback(std::function callback) override { - std::function wrappedCallback = - wrapCallback(std::move(callback)); - wrappedCallback(); + void setDataPtrExtractor(DataPtrExtractor dataPtrExtractor) override { + std::unique_lock lock(dataPtrExtractorMutex_); + dataPtrExtractor_ = std::move(dataPtrExtractor); } - // Adds a callback to FutureNCCL, and returns another FutureNCCL to hold - // the return value of the callback and new cudaEvents that recorded the - // stream that runs this callback. - c10::intrusive_ptr then( - std::function callback, - at::TypePtr type) override { + protected: + c10::intrusive_ptr createInstance(at::TypePtr type) override { auto fut = c10::make_intrusive(std::move(type)); // The new future needs the DataPtr extractor when it gets marked complete // but this might happen immediately inline or in parallel by another @@ -307,56 +253,31 @@ class ProcessGroupNCCL : public ProcessGroup { // if the default extractor can't handle some of the user's types. // Therefore we propagate our extractor. fut->setDataPtrExtractor(dataPtrExtractor_); - - // Cannot move capture std::function in lambda, because it cannot deduce - // the template type for std::function. Hence use std::bind to explicitly - // specify types. - addCallback(std::bind( - [&](std::function cb) { - try { - fut->markCompleted(at::IValue(cb())); - } catch (const std::exception& e) { - fut->setError(std::current_exception()); - } - }, - std::move(callback))); return fut; } - bool completed() const override { - return true; - } - - bool hasValue() const override { - return !value_.isNone(); - } - - void setDataPtrExtractor(DataPtrExtractor dataPtrExtractor) override { - std::unique_lock lock(dataPtrExtractorMutex_); - dataPtrExtractor_ = std::move(dataPtrExtractor); - } - - protected: - void postMarkCompletedHook() { - TORCH_INTERNAL_ASSERT(cudaEvents_ == nullptr); - std::vector isCudaDeviceUsed(c10::cuda::device_count(), false); - for (const at::DataPtr& data_ptr : extractDataPtrs(value_)) { - if (data_ptr.device().is_cuda()) { - isCudaDeviceUsed[data_ptr.device().index()] = true; + void postMarkCompletedHook(const at::IValue& value) override { + // Check whether the first or second constructor created this instance. 
+ if (cudaEvents_ == nullptr) { + std::vector isCudaDeviceUsed(c10::cuda::device_count(), false); + for (const at::DataPtr& data_ptr : extractDataPtrs(value)) { + if (data_ptr.device().is_cuda()) { + isCudaDeviceUsed[data_ptr.device().index()] = true; + } } - } - cudaEvents_ = std::make_shared>(); - for (c10::DeviceIndex idx = 0; idx < isCudaDeviceUsed.size(); idx++) { - if (isCudaDeviceUsed[idx]) { - at::cuda::CUDAEvent cudaEvent; - cudaEvent.record(at::cuda::getCurrentCUDAStream(idx)); - (*cudaEvents_).push_back(std::move(cudaEvent)); + cudaEvents_ = std::make_shared>(); + for (c10::DeviceIndex idx = 0; idx < isCudaDeviceUsed.size(); idx++) { + if (isCudaDeviceUsed[idx]) { + at::cuda::CUDAEvent cudaEvent; + cudaEvent.record(at::cuda::getCurrentCUDAStream(idx)); + (*cudaEvents_).push_back(std::move(cudaEvent)); + } } } } - std::function wrapCallback(std::function callback) { + std::function wrapCallback(std::function callback) override { return [this, callback{std::move(callback)}]() { // We'd love to get a stream for all devices, even those that are not used // by the value, because the callback could use those other devices, but @@ -382,7 +303,7 @@ class ProcessGroupNCCL : public ProcessGroup { // Do not free the underlying data storage of value_ before its // usage on the stream finishes. - for (const at::DataPtr& data_ptr : extractDataPtrs(value_)) { + for (const at::DataPtr& data_ptr : extractDataPtrs(constValue())) { if (data_ptr.device().is_cuda()) { c10::cuda::CUDACachingAllocator::recordStream( data_ptr, at::cuda::getCurrentCUDAStream(data_ptr.device().index())); @@ -393,13 +314,13 @@ class ProcessGroupNCCL : public ProcessGroup { }; } - void postWaitHook() { + void postWaitHook(const at::IValue& value) override { for (at::cuda::CUDAEvent& cudaEvent : *cudaEvents_) { cudaEvent.block( at::cuda::getCurrentCUDAStream(cudaEvent.device_index())); } - for (const at::DataPtr& data_ptr : extractDataPtrs(value_)) { + for (const at::DataPtr& data_ptr : extractDataPtrs(value)) { if (data_ptr.device().is_cuda()) { c10::cuda::CUDACachingAllocator::recordStream( data_ptr, at::cuda::getCurrentCUDAStream(data_ptr.device().index())); @@ -408,11 +329,9 @@ class ProcessGroupNCCL : public ProcessGroup { } private: - at::IValue value_; std::shared_ptr> cudaEvents_; DataPtrExtractor dataPtrExtractor_; std::mutex dataPtrExtractorMutex_; - c10::optional error_; std::vector> extractDataPtrs( const at::IValue& value) { From 030fa6cfba69da6342f89483ceca8c898b93f165 Mon Sep 17 00:00:00 2001 From: Luca Wehrstedt Date: Thu, 10 Dec 2020 03:45:30 -0800 Subject: [PATCH 116/250] Split out reusable CUDAFuture from FutureNCCL (#48506) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48506 This commit is part of a stack that reworks FutureNCCL in order to extract a generic CUDA-aware Future subclass. The stack deliberately breaks up this transition into elementary changes, to make it easier to verify that the behavior is preserved (or to highlight how it gets changed). --- FutureNCCL is now a general-purpose type-agnostic multi-device class, so in this commit I extract it from ProcessGroupNCCL to make it available for wider use (notably by the RPC module). We'll call this new class CUDAFuture. We'll keep FutureNCCL as a subclass of CUDAFuture to deal with some NCCL peculiarity, namely the fact that the future becomes complete immediately upon creation. We can clean this up for good once we're done merging Future and Work. I'm not exactly sure of where to put CUDAFuture. 
It needs to be available to both c10d and RPC (which lives under torch/csrc). If I figured CMake out correctly (and that's a big if) I think c10d can only depend on ATen (I'll maybe add a comment with how I tracked that down). Hence we cannot put CUDAFuture in torch/csrc. On the other hand, RPC currently depends on c10d, because RPC agents use ProcessGroups internally, so it would be "ok" to put CUDAFuture in c10d. However, we want to get rid of ProcessGroups in RPC, and at that point RPC should in principle not depend on c10d. In that case, the only shared dep between the two that I see is ATen itself. While I'm a bit wary of putting it right in ATen, I think it might actually make sense. CUDAFuture is intended to be a general-purpose component that can be reused in all settings and is not particularly tied to c10d or RPC. Moreover, ATen already contains ivalue::Future, and it contains a lot of CUDA helpers, so CUDAFuture definitely belongs to the "closure" of what's already there. ghstack-source-id: 118180030 Test Plan: Unit tests? Reviewed By: wanchaol Differential Revision: D25180532 fbshipit-source-id: 697f655240dbdd3be22a568d5102ab27691f86d4 --- aten/src/ATen/cuda/CUDAFuture.h | 153 ++++++++++++++++++++++++++++ torch/lib/c10d/ProcessGroupNCCL.hpp | 131 +++--------------------- 2 files changed, 165 insertions(+), 119 deletions(-) create mode 100644 aten/src/ATen/cuda/CUDAFuture.h diff --git a/aten/src/ATen/cuda/CUDAFuture.h b/aten/src/ATen/cuda/CUDAFuture.h new file mode 100644 index 000000000000..7db95ba3f734 --- /dev/null +++ b/aten/src/ATen/cuda/CUDAFuture.h @@ -0,0 +1,153 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { namespace cuda { + +struct TORCH_CUDA_API CUDAFuture : at::ivalue::Future { + public: + using at::ivalue::Future::Future; + + void setDataPtrExtractor(DataPtrExtractor dataPtrExtractor) override { + std::unique_lock lock(dataPtrExtractorMutex_); + dataPtrExtractor_ = std::move(dataPtrExtractor); + } + + protected: + c10::intrusive_ptr createInstance(at::TypePtr type) override { + auto fut = c10::make_intrusive(std::move(type)); + // The new future needs the DataPtr extractor when it gets marked complete + // but this might happen immediately inline or in parallel by another + // thread. In both these cases this would/might happen before the user has + // time to set their own DataPtr extractor, which might lead to failures + // if the default extractor can't handle some of the user's types. + // Therefore we propagate our extractor. 
+ fut->setDataPtrExtractor(dataPtrExtractor_); + return fut; + } + + void postMarkCompletedHook(const at::IValue& value) override { + std::vector isCudaDeviceUsed(c10::cuda::device_count(), false); + for (const at::DataPtr& data_ptr : extractDataPtrs(value)) { + if (data_ptr.device().is_cuda()) { + isCudaDeviceUsed[data_ptr.device().index()] = true; + } + } + + cudaEvents_ = std::make_shared>(); + for (c10::DeviceIndex idx = 0; idx < isCudaDeviceUsed.size(); idx++) { + if (isCudaDeviceUsed[idx]) { + at::cuda::CUDAEvent cudaEvent; + cudaEvent.record(at::cuda::getCurrentCUDAStream(idx)); + (*cudaEvents_).push_back(std::move(cudaEvent)); + } + } + } + + std::function wrapCallback( + std::function callback) override { + return [this, callback{std::move(callback)}]() { + // We'd love to get a stream for all devices, even those that are not used + // by the value, because the callback could use those other devices, but + // unfortunately this could cause a deadlock with NCCL. See + // https://github.com/pytorch/pytorch/pull/48500#issuecomment-735395414 + // In general, if some devices haven't been used yet, by getting a stream + // for them we'd initialize them, and in addition to causing NCCL to + // misbehaving this also ends up using memory on those devices, which the + // user might not want. + std::vector streams; + for (at::cuda::CUDAEvent& cudaEvent : *cudaEvents_) { + c10::DeviceIndex idx = cudaEvent.device_index(); + // FIXME Should we find a way to allow to change the priority of + // streams? + at::cuda::CUDAStream stream = + at::cuda::getStreamFromPool(/*isHighPriority=*/false, idx); + cudaEvent.block(stream); + streams.push_back(stream); + } + + // Use the dedicated callback stream to run callback. + at::cuda::CUDAMultiStreamGuard streamGuard(streams); + + // Do not free the underlying data storage of value_ before its + // usage on the stream finishes. + for (const at::DataPtr& data_ptr : extractDataPtrs(constValue())) { + if (data_ptr.device().is_cuda()) { + c10::cuda::CUDACachingAllocator::recordStream( + data_ptr, at::cuda::getCurrentCUDAStream(data_ptr.device().index())); + } + } + + callback(); + }; + } + + void postWaitHook(const at::IValue& value) override { + for (at::cuda::CUDAEvent& cudaEvent : *cudaEvents_) { + cudaEvent.block( + at::cuda::getCurrentCUDAStream(cudaEvent.device_index())); + } + + for (const at::DataPtr& data_ptr : extractDataPtrs(value)) { + if (data_ptr.device().is_cuda()) { + c10::cuda::CUDACachingAllocator::recordStream( + data_ptr, at::cuda::getCurrentCUDAStream(data_ptr.device().index())); + } + } + } + + // FIXME This field is protected (rather than private) and wrapped in a + // shared_ptr in order to support the FutureNCCL subclass, which wants to set + // the events on its own in order to use the same ones as its WorkNCCL class. + // Once WorkNCCL is gone (as part of the Future and Work merge) this should be + // fixed. + protected: + // The events that correspond to the completion of the async I/O kernels. They + // are recorded on the appropriate streams when the future is marked completed + // and can then be queried/waited/blocked on. There is one event for each + // distinct device on which the value's tensors reside. + std::shared_ptr> cudaEvents_; + + private: + DataPtrExtractor dataPtrExtractor_; + std::mutex dataPtrExtractorMutex_; + + // FIXME This too is protected so that it can be used by FutureNCCL. Please + // undo that once FutureNCCL is dropped in favor of a "vanilla" CUDAFuture. 
+ protected: + std::vector> extractDataPtrs( + const at::IValue& value) { + std::unique_lock lock(dataPtrExtractorMutex_); + std::vector> data_ptrs; + if (dataPtrExtractor_ != nullptr) { + // If a Python communication hook is used, dataPtrExtractor_ will be + // set in torch/csrc/jit/python/pybind_utils.h, which allows Python + // dependency to be imported. + data_ptrs = dataPtrExtractor_(value); + } else { + // If a C++ communication hook is used, use the default extractor. + data_ptrs = at::ivalue::Future::defaultDataPtrExtractor(value); + } + return data_ptrs; + } +}; + +} // namespace cuda +} // namespace at diff --git a/torch/lib/c10d/ProcessGroupNCCL.hpp b/torch/lib/c10d/ProcessGroupNCCL.hpp index 23bc12390541..0bd04645dac2 100644 --- a/torch/lib/c10d/ProcessGroupNCCL.hpp +++ b/torch/lib/c10d/ProcessGroupNCCL.hpp @@ -13,6 +13,7 @@ #include #include +#include #include #include #include @@ -207,146 +208,38 @@ class ProcessGroupNCCL : public ProcessGroup { // enables synchronizing the appropriate streams and avoids stalling PyTorch's // default stream while running the callback. In case of multiple then // callbacks, each will be executed on its own fresh stream. - struct FutureNCCL : at::ivalue::Future { + struct FutureNCCL : at::cuda::CUDAFuture { public: - explicit FutureNCCL( + FutureNCCL( at::IValue value, std::shared_ptr> cudaEvents) - : at::ivalue::Future(c10::ListType::create(c10::TensorType::get())), - cudaEvents_(std::move(cudaEvents)) { + : at::cuda::CUDAFuture(c10::ListType::create(c10::TensorType::get())){ // Check that the device indices are distinct std::unordered_set uniqueDeviceIndices; - for (const at::cuda::CUDAEvent& event : *cudaEvents_) { + for (const at::cuda::CUDAEvent& event : *cudaEvents) { TORCH_INTERNAL_ASSERT(event.isCreated()); uniqueDeviceIndices.insert(event.device_index()); } TORCH_INTERNAL_ASSERT( - cudaEvents_->size() == uniqueDeviceIndices.size(), - "Got ", cudaEvents_->size(), " events, but only ", + cudaEvents->size() == uniqueDeviceIndices.size(), + "Got ", cudaEvents->size(), " events, but only ", uniqueDeviceIndices.size(), " distinct devices"); for (const at::DataPtr& data_ptr : extractDataPtrs(value)) { TORCH_INTERNAL_ASSERT( std::find_if( - cudaEvents_->begin(), - cudaEvents_->end(), + cudaEvents->begin(), + cudaEvents->end(), [&](const at::cuda::CUDAEvent& ev) { return ev.device_index() == data_ptr.device().index(); - }) != cudaEvents_->end()); + }) != cudaEvents->end()); } + cudaEvents_ = std::move(cudaEvents); markCompleted(std::move(value)); } - using at::ivalue::Future::Future; - - void setDataPtrExtractor(DataPtrExtractor dataPtrExtractor) override { - std::unique_lock lock(dataPtrExtractorMutex_); - dataPtrExtractor_ = std::move(dataPtrExtractor); - } - protected: - c10::intrusive_ptr createInstance(at::TypePtr type) override { - auto fut = c10::make_intrusive(std::move(type)); - // The new future needs the DataPtr extractor when it gets marked complete - // but this might happen immediately inline or in parallel by another - // thread. In both these cases this would/might happen before the user has - // time to set their own DataPtr extractor, which might lead to failures - // if the default extractor can't handle some of the user's types. - // Therefore we propagate our extractor. - fut->setDataPtrExtractor(dataPtrExtractor_); - return fut; - } - void postMarkCompletedHook(const at::IValue& value) override { - // Check whether the first or second constructor created this instance. 
- if (cudaEvents_ == nullptr) { - std::vector isCudaDeviceUsed(c10::cuda::device_count(), false); - for (const at::DataPtr& data_ptr : extractDataPtrs(value)) { - if (data_ptr.device().is_cuda()) { - isCudaDeviceUsed[data_ptr.device().index()] = true; - } - } - - cudaEvents_ = std::make_shared>(); - for (c10::DeviceIndex idx = 0; idx < isCudaDeviceUsed.size(); idx++) { - if (isCudaDeviceUsed[idx]) { - at::cuda::CUDAEvent cudaEvent; - cudaEvent.record(at::cuda::getCurrentCUDAStream(idx)); - (*cudaEvents_).push_back(std::move(cudaEvent)); - } - } - } - } - - std::function wrapCallback(std::function callback) override { - return [this, callback{std::move(callback)}]() { - // We'd love to get a stream for all devices, even those that are not used - // by the value, because the callback could use those other devices, but - // unfortunately this could cause a deadlock with NCCL. See - // https://github.com/pytorch/pytorch/pull/48500#issuecomment-735395414 - // In general, if some devices haven't been used yet, by getting a stream - // for them we'd initialize them, and in addition to causing NCCL to - // misbehaving this also ends up using memory on those devices, which the - // user might not want. - std::vector streams; - for (at::cuda::CUDAEvent& cudaEvent : *cudaEvents_) { - c10::DeviceIndex idx = cudaEvent.device_index(); - // FIXME Should we find a way to allow to change the priority of - // streams? - at::cuda::CUDAStream stream = - at::cuda::getStreamFromPool(/*isHighPriority=*/false, idx); - cudaEvent.block(stream); - streams.push_back(stream); - } - - // Use the dedicated callback stream to run callback. - at::cuda::CUDAMultiStreamGuard streamGuard(streams); - - // Do not free the underlying data storage of value_ before its - // usage on the stream finishes. - for (const at::DataPtr& data_ptr : extractDataPtrs(constValue())) { - if (data_ptr.device().is_cuda()) { - c10::cuda::CUDACachingAllocator::recordStream( - data_ptr, at::cuda::getCurrentCUDAStream(data_ptr.device().index())); - } - } - - callback(); - }; - } - - void postWaitHook(const at::IValue& value) override { - for (at::cuda::CUDAEvent& cudaEvent : *cudaEvents_) { - cudaEvent.block( - at::cuda::getCurrentCUDAStream(cudaEvent.device_index())); - } - - for (const at::DataPtr& data_ptr : extractDataPtrs(value)) { - if (data_ptr.device().is_cuda()) { - c10::cuda::CUDACachingAllocator::recordStream( - data_ptr, at::cuda::getCurrentCUDAStream(data_ptr.device().index())); - } - } - } - - private: - std::shared_ptr> cudaEvents_; - DataPtrExtractor dataPtrExtractor_; - std::mutex dataPtrExtractorMutex_; - - std::vector> extractDataPtrs( - const at::IValue& value) { - std::unique_lock lock(dataPtrExtractorMutex_); - std::vector> data_ptrs; - if (dataPtrExtractor_ != nullptr) { - // If a Python communication hook is used, dataPtrExtractor_ will be - // set in torch/csrc/jit/python/pybind_utils.h, which allows Python - // dependency to be imported. - data_ptrs = dataPtrExtractor_(value); - } else { - // If a C++ communication hook is used, use the default extractor. - data_ptrs = at::ivalue::Future::defaultDataPtrExtractor(value); - } - return data_ptrs; + // Do nothing because the constructor already stored the events. 
} }; From b5a7e25059e4d2bd679df97d5c992c0d1da787e1 Mon Sep 17 00:00:00 2001 From: Luca Wehrstedt Date: Thu, 10 Dec 2020 03:45:30 -0800 Subject: [PATCH 117/250] Cache the DataPtrs in CUDAFuture (#48788) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48788 CUDAFuture needs to inspect the value it contains in order to first determine what devices its tensors reside on (so that it can record events on those devices), and then to record these tensors with the caching allocator when they are used in other streams. Extracting data ptrs can become somewhat expensive (especially if we resort to using the pickler to do that), hence it's probably a good idea to cache the result the first time we compute it. ghstack-source-id: 118180023 Test Plan: Unit tests Reviewed By: mrshenli Differential Revision: D25303486 fbshipit-source-id: 5c541640f6d19249dfb5489ba5e8fad2502836fb --- aten/src/ATen/cuda/CUDAFuture.h | 13 ++++++++++--- torch/lib/c10d/ProcessGroupNCCL.hpp | 4 +++- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/aten/src/ATen/cuda/CUDAFuture.h b/aten/src/ATen/cuda/CUDAFuture.h index 7db95ba3f734..78499f7fc026 100644 --- a/aten/src/ATen/cuda/CUDAFuture.h +++ b/aten/src/ATen/cuda/CUDAFuture.h @@ -44,8 +44,11 @@ struct TORCH_CUDA_API CUDAFuture : at::ivalue::Future { } void postMarkCompletedHook(const at::IValue& value) override { + // Extract them once and cache them for later uses. + dataPtrs_ = extractDataPtrs(value); + std::vector isCudaDeviceUsed(c10::cuda::device_count(), false); - for (const at::DataPtr& data_ptr : extractDataPtrs(value)) { + for (const at::DataPtr& data_ptr : dataPtrs_) { if (data_ptr.device().is_cuda()) { isCudaDeviceUsed[data_ptr.device().index()] = true; } @@ -88,7 +91,7 @@ struct TORCH_CUDA_API CUDAFuture : at::ivalue::Future { // Do not free the underlying data storage of value_ before its // usage on the stream finishes. - for (const at::DataPtr& data_ptr : extractDataPtrs(constValue())) { + for (const at::DataPtr& data_ptr : dataPtrs_) { if (data_ptr.device().is_cuda()) { c10::cuda::CUDACachingAllocator::recordStream( data_ptr, at::cuda::getCurrentCUDAStream(data_ptr.device().index())); @@ -105,7 +108,7 @@ struct TORCH_CUDA_API CUDAFuture : at::ivalue::Future { at::cuda::getCurrentCUDAStream(cudaEvent.device_index())); } - for (const at::DataPtr& data_ptr : extractDataPtrs(value)) { + for (const at::DataPtr& data_ptr : dataPtrs_) { if (data_ptr.device().is_cuda()) { c10::cuda::CUDACachingAllocator::recordStream( data_ptr, at::cuda::getCurrentCUDAStream(data_ptr.device().index())); @@ -125,6 +128,10 @@ struct TORCH_CUDA_API CUDAFuture : at::ivalue::Future { // distinct device on which the value's tensors reside. std::shared_ptr> cudaEvents_; + // A cached version of the data ptrs extracted from the value when the future + // is first marked completed. 
+ std::vector> dataPtrs_; + private: DataPtrExtractor dataPtrExtractor_; std::mutex dataPtrExtractorMutex_; diff --git a/torch/lib/c10d/ProcessGroupNCCL.hpp b/torch/lib/c10d/ProcessGroupNCCL.hpp index 0bd04645dac2..45c82140c484 100644 --- a/torch/lib/c10d/ProcessGroupNCCL.hpp +++ b/torch/lib/c10d/ProcessGroupNCCL.hpp @@ -224,7 +224,8 @@ class ProcessGroupNCCL : public ProcessGroup { cudaEvents->size() == uniqueDeviceIndices.size(), "Got ", cudaEvents->size(), " events, but only ", uniqueDeviceIndices.size(), " distinct devices"); - for (const at::DataPtr& data_ptr : extractDataPtrs(value)) { + auto dataPtrs = extractDataPtrs(value); + for (const at::DataPtr& data_ptr : dataPtrs) { TORCH_INTERNAL_ASSERT( std::find_if( cudaEvents->begin(), @@ -234,6 +235,7 @@ class ProcessGroupNCCL : public ProcessGroup { }) != cudaEvents->end()); } cudaEvents_ = std::move(cudaEvents); + dataPtrs_ = std::move(dataPtrs); markCompleted(std::move(value)); } From 2255e68da84af950225d4d49cfc330215a036591 Mon Sep 17 00:00:00 2001 From: Luca Wehrstedt Date: Thu, 10 Dec 2020 04:37:41 -0800 Subject: [PATCH 118/250] Revert D25433268: [PyTorch Mobile] Preserve bundled input related methods when calling optimize_for_mobile Test Plan: revert-hammer Differential Revision: D25433268 (https://github.com/pytorch/pytorch/commit/95233870f284565df602471bf09b8aad565540b8) Original commit changeset: 0bf9b4afe64b fbshipit-source-id: bba97e48ce0e72f9d1db5159065bb6495d62666c --- test/test_mobile_optimizer.py | 64 --------------------------------- torch/utils/mobile_optimizer.py | 12 ------- 2 files changed, 76 deletions(-) diff --git a/test/test_mobile_optimizer.py b/test/test_mobile_optimizer.py index 9bfe5465a458..0af74eabdf2b 100644 --- a/test/test_mobile_optimizer.py +++ b/test/test_mobile_optimizer.py @@ -8,7 +8,6 @@ from torch.nn import functional as F from torch._C import MobileOptimizerType from torch.testing._internal.common_quantized import override_quantized_engine -from torch.nn.modules.module import ModuleAttributeError FileCheck = torch._C.FileCheck @@ -269,69 +268,6 @@ def get_lint_count_by_type(lint_type, module_lint_List): bi_module_lint_list = generate_mobile_module_lints(bi_module) self.assertEqual(len(bi_module_lint_list), 0) - def test_preserve_bundled_inputs_methods(self): - class MyBundledInputModule(torch.nn.Module): - def __init__(self): - super(MyBundledInputModule, self).__init__() - - def forward(self, inputs): - return inputs - - class MyIncompleteBundledInputModule(torch.nn.Module): - def __init__(self): - super(MyIncompleteBundledInputModule, self).__init__() - - def forward(self, inputs): - return inputs - - @torch.jit.export - def get_all_bundled_inputs(self): - pass - - bi_module = torch.jit.script(MyBundledInputModule()) - module_optim_bi_not_preserved = optimize_for_mobile(bi_module) - - # Expected to be False since no bundled inputs methods were added - self.assertFalse( - hasattr(module_optim_bi_not_preserved, 'get_all_bundled_inputs') or - hasattr(module_optim_bi_not_preserved, 'get_num_bundled_inputs') or - hasattr(module_optim_bi_not_preserved, 'run_on_bundled_input') - ) - - # We expect an exception here - with self.assertRaises(ModuleAttributeError): - module_optim_bi_not_preserved.run_on_bundled_input(0) - - # Add bundled inputs methods to the module - torch.utils.bundled_inputs.augment_model_with_bundled_inputs( - bi_module, [(torch.tensor([1]),)], []) - # Now they should be preserved - module_optim_bi_preserved = optimize_for_mobile(bi_module) - - # All of the bundled inputs methods were 
preserved - self.assertTrue( - hasattr(module_optim_bi_preserved, 'get_all_bundled_inputs') and - hasattr(module_optim_bi_preserved, 'get_num_bundled_inputs') and - hasattr(module_optim_bi_preserved, 'run_on_bundled_input') - ) - - # We do not expect an exception here - module_optim_bi_preserved.run_on_bundled_input(0) - - bundled_input = module_optim_bi_preserved.get_all_bundled_inputs()[0] - module_optim_bi_preserved(*bundled_input) - - # If not all 3 bundled inputs methods are present in the module, - # we will not try to preserve them unless specified by the user. - incomplete_bi_module = torch.jit.script(MyIncompleteBundledInputModule()) - incomplete_bi_module_optim = optimize_for_mobile(incomplete_bi_module) - self.assertFalse(hasattr(incomplete_bi_module_optim, 'get_all_bundled_inputs')) - - # Specifically preserve get_all_bundled_inputs even if it's the only one - # bundled inputs method available. - incomplete_bi_module_optim = optimize_for_mobile(incomplete_bi_module, preserved_methods=['get_all_bundled_inputs']) - self.assertTrue(hasattr(incomplete_bi_module_optim, 'get_all_bundled_inputs')) - @unittest.skipUnless(torch.backends.xnnpack.enabled, " XNNPACK must be enabled for these tests." " Please build with USE_XNNPACK=1.") diff --git a/torch/utils/mobile_optimizer.py b/torch/utils/mobile_optimizer.py index 8029084a77a7..a9bbbfb9e6ac 100644 --- a/torch/utils/mobile_optimizer.py +++ b/torch/utils/mobile_optimizer.py @@ -39,18 +39,6 @@ def optimize_for_mobile( if preserved_methods is None: preserved_methods = [] - bundled_inputs_methods = ['get_all_bundled_inputs', 'get_num_bundled_inputs', 'run_on_bundled_input'] - if all([hasattr(script_module, method) for method in bundled_inputs_methods]): - preserved_methods = list(set(preserved_methods + bundled_inputs_methods)) - - non_exist_methods = [] - for method in preserved_methods: - if not hasattr(script_module, method): - non_exist_methods.append(method) - if non_exist_methods: - raise AttributeError( - 'The following methods to preserve do not exist in script_module: {}'.format(', '.join(non_exist_methods))) - backend = backend.lower() if backend == 'cpu': optimized_cpp_module = torch._C._jit_pass_optimize_for_mobile(script_module._c, optimization_blocklist, preserved_methods) From c7b8f3e2cd2b4beca7c7d6446364cc6be957a192 Mon Sep 17 00:00:00 2001 From: Martin Yuan Date: Thu, 10 Dec 2020 05:32:49 -0800 Subject: [PATCH 119/250] Decouple direct access to native::scalar_tensor from TensorIndexing.h (#48761) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48761 Targeting one of the items in https://github.com/pytorch/pytorch/issues/48684. For performance purpose we don't use at::scalar_tensor. Since scalar_tensor_static is available for CPU we could use it at least for CPU. One uncertainty is the CUDA performance. But there's no fast path for CUDA under native::scalar_tensor either, I assume the perf on CUDA may not be affected. 
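For readers less familiar with this code path, here is a minimal usage sketch (not part of this patch, illustrative only) of the kind of C++ indexing call that reaches scalarToTensor and, after this change, the scalar_tensor_static fast path for non-complex CPU dtypes:

    #include <torch/torch.h>

    // Illustrative sketch only: assigning a Scalar through the C++ indexing API
    // is the path that calls at::indexing::scalarToTensor, which this patch
    // routes to at::detail::scalar_tensor_static for non-complex CPU dtypes.
    int main() {
      using namespace torch::indexing;
      auto t = torch::zeros({8});        // CPU float tensor
      t.index_put_({Slice(0, 4)}, 1.5);  // Scalar RHS -> scalarToTensor
      return 0;
    }
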
Test Plan: Imported from OSS Reviewed By: ezyang Differential Revision: D25410975 Pulled By: iseeyuan fbshipit-source-id: 160d21ffeefc9a2e8f00a55043144eebcada2aac --- aten/src/ATen/TensorIndexing.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/aten/src/ATen/TensorIndexing.h b/aten/src/ATen/TensorIndexing.h index 4b6f81bc4c21..a4c0a0b31c34 100644 --- a/aten/src/ATen/TensorIndexing.h +++ b/aten/src/ATen/TensorIndexing.h @@ -4,6 +4,7 @@ #include #include #include +#include // TODO: try to remove this // There is some back story, see https://github.com/pytorch/pytorch/issues/48684 @@ -249,10 +250,6 @@ static inline Tensor boolToIndexingTensor(const Tensor& self, bool value, const } } -static inline Tensor scalarToTensorCPUOrCUDA(Scalar v, const TensorOptions& options) { - return at::native::scalar_tensor(v, options); -} - static inline Tensor scalarToTensorNonNativeDeviceType(Scalar v, const TensorOptions& options) { return at::scalar_tensor(v, options); } @@ -320,8 +317,11 @@ static inline int64_t count_specified_dimensions(const ArrayRef& in // The rest of the functions are in `at::indexing::impl` namespace, signifying // that they shouldn't be used from Python indexing implementation. static inline Tensor scalarToTensor(Scalar v, const TensorOptions& options, const at::Device& self_device) { - if (self_device == at::kCPU || self_device == at::kCUDA) { - return impl::scalarToTensorCPUOrCUDA(v, options); + if (self_device == at::kCPU && !v.isComplex() && + options.dtype_opt()->toScalarType() != ScalarType::ComplexDouble && + options.dtype_opt()->toScalarType() != ScalarType::ComplexFloat && + options.dtype_opt()->toScalarType() != ScalarType::ComplexHalf) { + return at::detail::scalar_tensor_static(v, options.dtype_opt()->toScalarType(), self_device); } else { return impl::scalarToTensorNonNativeDeviceType(v, options); } From 33bc7918e82c3d290fd111718a0ce166970603d4 Mon Sep 17 00:00:00 2001 From: Wang Xu Date: Thu, 10 Dec 2020 07:04:02 -0800 Subject: [PATCH 120/250] fix some comments in accelerator_partitioner.py (#49104) Summary: Fix some comments in accelerator_partittioner.py Pull Request resolved: https://github.com/pytorch/pytorch/pull/49104 Reviewed By: gcatron Differential Revision: D25434999 Pulled By: scottxu0730 fbshipit-source-id: ce83b411cf959aabec119532ad42a892a2223286 --- .../experimental/accelerator_partitioner.py | 122 ++++++++++-------- 1 file changed, 69 insertions(+), 53 deletions(-) diff --git a/torch/fx/experimental/accelerator_partitioner.py b/torch/fx/experimental/accelerator_partitioner.py index 43ec348d45e6..a995a58c5774 100644 --- a/torch/fx/experimental/accelerator_partitioner.py +++ b/torch/fx/experimental/accelerator_partitioner.py @@ -10,9 +10,8 @@ PartitionMode class DAGNode(): - """ - DAGNode class maintains useful information for a partition (submodule). - inputs(submodule node) and outputs(submodule node). + """DAGNode class maintains useful information for a partition (submodule), + and its input submodules and output submodules. 
""" def __init__( self, @@ -48,7 +47,7 @@ def create_node( self.nodes.append(node) class PartitionResult(NamedTuple): - """NameTuple used for returning DAG and a new graph module + """NameTuple used for returning DAG and a new fx module """ dag: DAG module_with_submodules: GraphModule @@ -73,7 +72,6 @@ def combine_two_partitions( partitions.append(partition) partitions.remove(partition_0) partitions.remove(partition_1) - # Reorganize partitions reorganize_partitions(partitions) return @@ -92,7 +90,7 @@ def set_parents_and_children(partitions: List[Partition]) -> None: # For each node in the current partition, find its users users = node.users for n in users: - # Find which the partition the user belongs to. + # Find which the partition the user node belongs to. # Note that if the node itself is also belongs to that partition, # that partition is not the child of the current partition for p in partitions: @@ -103,7 +101,7 @@ def set_parents_and_children(partitions: List[Partition]) -> None: def reorganize_partitions(partitions: List[Partition]) -> None: """Given a list of partitions, reorganzie partiton id, - its parents and its children for each partition + its parents and its children for each partition """ # Rearrange partition ids for i, partition in enumerate(partitions): @@ -123,7 +121,7 @@ def get_bfs_level_partition(partitions: List[Partition]) -> None: current_level.add(partition) next_level: Set[Partition] = set() level = 0 - # Start bfs + # bfs while current_level: partition = current_level.pop() partition.bfs_level = level @@ -149,7 +147,7 @@ def get_node_to_partition_mapping(partitions: List[Partition]) -> Dict[Node, int def get_device_to_partitions_mapping(partitions: List[Partition], devices: List[Device]): """Given a list of partitions and a list of devices, - map each partition into a device. + map each partition into a device. """ def calculate_extra_mem_bytes_needed_for(partition: Partition, partitions: List[Partition]): all_nodes: Set[Node] = set() @@ -165,10 +163,10 @@ def calculate_extra_mem_bytes_needed_for(partition: Partition, partitions: List[ def find_device_for(partition: Partition): """Given a partition, find a logical device for the partition - The algorithm is that: - #1. sort all the devices based on left mem size - #2. put the partition on the device that has just enought mem - for that partition + The algorithm is to put the partition on the device + that has just enough mem left for that partition. 
+ device_to_left_mem_bytes is a dictionary between device and its left mem size + sorted by its left mem size """ for d in device_to_left_mem_bytes: extra_size_needed = calculate_extra_mem_bytes_needed_for(partition, device_to_partitions[d]) @@ -188,8 +186,8 @@ def find_device_for(partition: Partition): logical_id_to_device[d.logical_id] = d device_to_partitions[d] = [] device_to_left_mem_bytes[d] = d.available_mem_bytes - # Deal with the partitions that have a device - # Find all no device partitions + # Deal with the partitions that already have a device + # and also collect all partitions without a device (no_device_partitions) no_device_partitions = [] for partition in partitions: if partition.logical_device_ids != []: @@ -199,7 +197,7 @@ def find_device_for(partition: Partition): device_to_left_mem_bytes[device] = d.available_mem_bytes - partition.used_mem_bytes else: no_device_partitions.append(partition) - # Find device for each no device partition + # Find devices for all the partitions without a device found_device = True for partition in no_device_partitions: device_to_left_mem_bytes = { @@ -212,6 +210,9 @@ def find_device_for(partition: Partition): return found_device def check_dependency(partition): + """Given a partition,check if there is a circular dependency on + this partition using bfs + """ visited: Set[Partition] = set([partition]) queue: List[Partition] = [partition] while queue: @@ -226,13 +227,13 @@ def check_dependency(partition): return False class Partitioner: - """A graph module may not fit into one device. - Partitioner class helps cut one graph into subgraphs (partitions), - so that each partition could fit into a different device. - The main function of this class is self.partition_graph. - It will partition the graph based on the scheme specified in partition_config - A DAG structure is returned - along with a new graph module with partitions as submodule nodes. + """A fx module may not fit into one device. + Partitioner class helps partition one fx module into submodules (partitions), + so that the submodules can be executed crossing different accelerators. + The main function of this class is self.partition_graph. + It partitions the fx module based on the scheme specified in partition_config + A DAG structure is returned + along with a new fx module with submodule nodes. 
""" def __init__(self) -> None: self.partitions: List[Partition] = [] @@ -245,37 +246,40 @@ def partition_graph( torch_module: torch.nn.Module, partitioner_config: PartitionerConfig ) -> PartitionResult: - """ - Given the fx module, torch module and partitioner_config, - find the partitions, do the partitions, - and then return a DAG and a new fx module with submodule nodes (partitions) + """Given the fx module, torch module and partitioner_config, + find the partitions, do the partitions, + and then return a DAG and a new fx module with submodule nodes (partitions) """ self.graph_module = fx_module self.torch_module = torch_module self.devices = partitioner_config.devices if len(self.devices) == 0: raise RuntimeError('No devices') - # Check if there are op nodes in the graph + # Check if there are op nodes in the fx module nodes = self.graph_module.graph.nodes if all(node.op in {'placeholder', 'get_attr', 'output'} for node in nodes): raise RuntimeError('No Partition since no operations in the module') - # Calculate total size of the graph + # Calculate total size of the fx module total_size_of_graph = 0 for node in nodes: if node.op == 'output': break total_size_of_graph += node.size_bytes.total_size + # Find the device with the max mem size device_with_max_mem = max(self.devices, key=lambda d: d.available_mem_bytes) + # AOT based partition if partitioner_config.mode == PartitionMode.aot_based: self.aot_based_partition( partitioner_config.node_to_partition_mapping, partitioner_config.partition_to_logical_device_mapping ) + # Single partition if the whole module can be fit into one device elif total_size_of_graph <= device_with_max_mem.available_mem_bytes: self.find_single_partition(total_size_of_graph) elif total_size_of_graph > sum([d.available_mem_bytes for d in self.devices]): raise RuntimeError('Devices have no enough memory for the module') else: + # Sparse nn based partition if partitioner_config.mode == PartitionMode.sparse_nn: available_mem_bytes = self.devices[0].available_mem_bytes if not all(device.available_mem_bytes == available_mem_bytes for device in self.devices): @@ -283,11 +287,13 @@ def partition_graph( # sparse_nn_partition only support same memory size # TODO: add different size support for sparse_nn_partition self.sparse_nn_partition(available_mem_bytes) + # Cost aware partition elif partitioner_config.mode == PartitionMode.cost_aware: self.cost_aware_partition( partitioner_config.transfer_rate_bytes_per_sec, partitioner_config.node_to_latency_mapping ) + # KL based partition elif partitioner_config.mode == PartitionMode.kl_based: self.kl_based_partition( partitioner_config.transfer_rate_bytes_per_sec, @@ -303,7 +309,8 @@ def partition_graph( return ret def find_single_partition(self, total_size_of_graph) -> None: - """Only one partition (one graph on one device).""" + """Fit the whole fx module into one device + """ partition_0 = self.create_partition() for node in self.graph_module.graph.nodes: if node.op == 'output': @@ -316,18 +323,18 @@ def find_single_partition(self, total_size_of_graph) -> None: return def size_based_partition(self) -> None: - """This method is to partition the graph based on memory size. + """This method is to partition the fx module based on memory size. It uses greedy approach. The result may not be the best. The basic idea is: Step 1: - Find a device which has enough memory to fit the first node, create a empty partition + Find a device which has enough memory to fit the current node, create a empty partition with the size of that device. 
Then keep adding the following nodes into the partition until the partition is full. Step 2: Repeat Step 1 until no device left Step 3: If some nodes are left, create a partition for each left node (single node partition). - and then try to map those partitions into logical devices with non single node partitions. + and then try to map those partitions into logical devices with enough mem left. """ def find_device_based_on_size(node) -> Device: """Given a node, this function is to find a logical device @@ -365,16 +372,18 @@ def find_device_based_on_size(node) -> Device: partition.logical_device_ids.append(device.logical_id) else: # The current partition is not the first partition - # Check if the current node can fit into this partition + # Check if the current node can fit into current partition if partition_to_left_mem_bytes[partition] < total_size_of_input_nodes: # Check if no device is left if len(self.partitions) == len(self.devices): - # No device left, all the partitions before are non single node partitions + # No device is left + # Put the previous partitions into a list (non_single_node_partitions) non_single_node_partitions = self.partitions[:] # Create the first single node partition for the current node self.create_single_node_partition(node) continue # Some devices are still left + # Create a new partition with a mem size that is enough for the current node device = find_device_based_on_size(node) partition = self.create_partition() total_size_of_input_nodes = get_extra_size_of(node, partition.nodes) @@ -382,7 +391,7 @@ def find_device_based_on_size(node) -> Device: partition.logical_device_ids.append(device.logical_id) partition.add_node(node) partition_to_left_mem_bytes[partition] -= total_size_of_input_nodes - # No device left, create single node partitions + # Create single node partitions if no device is left else: self.create_single_node_partition(node) reorganize_partitions(self.partitions) @@ -395,7 +404,7 @@ def find_device_based_on_size(node) -> Device: return def do_partition(self) -> GraphModule: - """Return a module with submodules (partitions).""" + """Return a new fx module with submodule nodes (partitions).""" module_with_submodules = split_module( self.graph_module, self.torch_module, @@ -404,6 +413,7 @@ def do_partition(self) -> GraphModule: return module_with_submodules def dump_dag(self, module_with_submodules: GraphModule) -> DAG: + """Return the dag structure and the new fx module with submodules""" dag = DAG() for node in module_with_submodules.graph.nodes: if node.op == 'output': @@ -437,19 +447,21 @@ def create_partition(self) -> Partition: return partition def create_single_node_partition(self, node): - """Create a partition for a single node - """ + """Create a partition for a single node""" partition = self.create_partition() partition.add_node(node) return def sparse_nn_partition(self, available_mem_bytes: int) -> None: """This method partition a sparse nn module. - It first traverse all the nodes and do the partitions based on memory size. + It is size based partition but different from size_based_partition, + it only works when all the devices have same memory size (available_mem_bytes). + In the future, devices with different mem sizes will be supported like size_based_partition. + It first traverse all the nodes and do the partitions based on the same memory size. If the current partition has no enough memory left for a new op node (call_module, call_method, call_function), a new partition is created. 
- Different from size_based_partition, when traversing cross the boundary between - non-embedding nodes and embedding nodes, a new partition is created regardlessly. + When crossing the boundary between non-embedding nodes and embedding nodes, + a new partition is created regardlessly. For example, if the current node is a non-embedding node but the next node is an embedding node, a new partition is created for the next node. After the partition, the partitions are combined as much as possible. @@ -470,7 +482,7 @@ def combine_partitions_based_on_size(partitions: List[Partition], available_mem_ We go from the largest and selection partition_0. Check the bfs level for two partitions, if the level difference is less than 2, it can be combined. - Then repeat step 1. + step 2: repeat step 1 until no partitions can be combined """ find_combination = True while find_combination: @@ -518,6 +530,9 @@ def find_partition_to_combine_based_on_size( return find_combination, partitions def reset_partition_in_sparse_nn(partition, new_partition=True): + """If crossing the boudary between non-embedding nodes and + embedding nodes, create a new partition + """ if in_embedding_region: embedding_partitions.append(partition) else: @@ -604,9 +619,9 @@ def cost_aware_partition( node_to_latency_mapping: Dict[Node, NodeLatency] ) -> None: """This method is to partition the fx module based on the cost. - The cost is the total latency of running the whole graph. + The cost is the total latency of running the whole fx module. In partitioner_utils.py, the cost model is built. - The algorithm is: + The cost aware partition algorithm is: #1. At every begining, each node is a partition. Then we map all the partitions to the devices and calculate the cost @@ -623,7 +638,7 @@ def try_combining_partitions( p1_index, partitions ) -> float: - """Given two partitions and a list of partitions, try to combine these two partitions + """Given two partitions and a list of partitions, combine these two partitions and see what is the cost of the modified partition list """ p0 = partitions[p0_index] @@ -656,10 +671,10 @@ def search_combination( find two partitions to combine so the cost of the partitions can be reduced. The algorithm is : - 1. Going through all the partition pairs and see - if the pair of partitions can be combined. - 2. If they are combined, the cost is calculated. - 3. Select the minimum cost and combine its cooresponding partition pair + 1. Go through all the partition pairs and see + if any pair of partitions can be combined. + 2. Calculate the cost after the combination. + 3. Select the minimum cost and combine its cooresponding partition pair. """ partition_to_latency_mapping = get_partition_to_latency_mapping(self.partitions, node_to_latency_mapping) cost = get_latency_of_partitioned_graph(self.partitions, partition_to_latency_mapping, transfer_rate_bytes_per_sec) @@ -704,7 +719,7 @@ def search_combination( transfer_rate_bytes_per_sec, node_to_latency_mapping ) - # Make sure all partitions are set up correctly. + # Make sure all partitions are set up correctly reorganize_partitions(self.partitions) # Set up node to partition mapping self.node_to_partition = get_node_to_partition_mapping(self.partitions) @@ -725,7 +740,7 @@ def kl_based_partition( Using size_based_partition, n0 and n1 are in Partition p0. n2, n3 and n4 in Partition p1. The current cost is esimated. We first tried using n0 to swap with n2 from the other partiton. 
- Then we found swapping n0 and n2 shows a lower cost + Then we see that swapping n0 and n2 shows a lower cost than the current cost and it is the minimum among other pairs like (n0, None)(This means moving n0 to Partition without swapping other nodes), (n0, n3) and (n0, n4). We swap n0 and n2 and set the new cost @@ -828,7 +843,8 @@ def swap_node_to_partition(node, p0, p1, node_to_latency_mapping, transfer_rate_ node_to_latency_mapping, transfer_rate_bytes_per_sec ) - # Update cost and node pair + # Update the cost + # Track the swapped node pair and their partitions if new_cost < cost: cost = new_cost node_pair = new_node_pair From 21c04b4438a766cd998fddb42247d4eb2e010f9a Mon Sep 17 00:00:00 2001 From: Rong Rong Date: Thu, 10 Dec 2020 07:33:54 -0800 Subject: [PATCH 121/250] make AT_FFTW_ENABLED available to fb internal Summary: follow up on D25375320 (https://github.com/pytorch/pytorch/commit/b89c32849352bf2fbb8f49749aa2fb0305a38c96). Test Plan: buck build Reviewed By: samestep Differential Revision: D25410973 fbshipit-source-id: 6c2627951a98d270d341b33538431644d03bed16 --- BUILD.bazel | 1 + 1 file changed, 1 insertion(+) diff --git a/BUILD.bazel b/BUILD.bazel index 76afe6aec1ea..5da8edc2c34e 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -544,6 +544,7 @@ header_template_rule( substitutions = { "@AT_MKLDNN_ENABLED@": "1", "@AT_MKL_ENABLED@": "0", + "@AT_FFTW_ENABLED@": "0", "@AT_NNPACK_ENABLED@": "0", "@CAFFE2_STATIC_LINK_CUDA_INT@": "0", "@USE_BLAS@": "1", From 3384145418205b8554b3e0a7c4bbb5c15f5e2333 Mon Sep 17 00:00:00 2001 From: Bram Wasti Date: Thu, 10 Dec 2020 09:22:59 -0800 Subject: [PATCH 122/250] [te] Add BitCast to the IR Summary: Adds BitCasting to NNC. This will enable fast approximation algorithms implemented directly in TensorExpressions Test Plan: buck test mode/no-gpu //caffe2/test/cpp/tensorexpr:tensorexpr Reviewed By: bertmaher Differential Revision: D25441716 fbshipit-source-id: c97b871697bc5931d09cda4a9cb0a81bb420f4e2 --- test/cpp/tensorexpr/test_llvm.cpp | 83 ++++++++++++++++ test/cpp/tensorexpr/test_type.cpp | 110 +++++++++++++++++++++ torch/csrc/jit/tensorexpr/eval.h | 57 +++++++++++ torch/csrc/jit/tensorexpr/expr.h | 1 + torch/csrc/jit/tensorexpr/ir.h | 29 ++++++ torch/csrc/jit/tensorexpr/ir_mutator.cpp | 9 ++ torch/csrc/jit/tensorexpr/ir_mutator.h | 2 + torch/csrc/jit/tensorexpr/ir_visitor.cpp | 3 + torch/csrc/jit/tensorexpr/ir_visitor.h | 2 + torch/csrc/jit/tensorexpr/llvm_codegen.cpp | 20 ++++ torch/csrc/jit/tensorexpr/loopnest.cpp | 8 ++ 11 files changed, 324 insertions(+) diff --git a/test/cpp/tensorexpr/test_llvm.cpp b/test/cpp/tensorexpr/test_llvm.cpp index 953c184de1fc..c1d3392fff32 100644 --- a/test/cpp/tensorexpr/test_llvm.cpp +++ b/test/cpp/tensorexpr/test_llvm.cpp @@ -160,6 +160,63 @@ TEST(LLVM, ByteToDoubleCastTest) { ASSERT_EQ(cg.value(), 2); } +TEST(LLVM, BitCast) { + constexpr int16_t ref16 = 1337; + constexpr int32_t ref32 = 1337; + constexpr int64_t ref64 = 1337; + at::Half reff16 = 1337.0f; + constexpr float reff32 = 1337.0f; + constexpr double reff64 = 1337.0f; + + // this is broken + /*{ + KernelScope kernel_scope; + at::Half k_; + at::Half* k = &k_; + *reinterpret_cast(k) = ref16; + auto a = HalfImm::make(k); + auto b = BitCast::make(kShort, a); + LLVMExprEval cg(b); + ASSERT_EQ(cg.value(), ref16); + }*/ + + { + KernelScope kernel_scope; + float k = raw_bitcast(ref32); + auto a = FloatImm::make(k); + auto b = BitCast::make(kInt, a); + LLVMExprEval cg(b); + ASSERT_EQ(cg.value(), ref32); + } + + { + KernelScope kernel_scope; + double k = 
raw_bitcast(ref64); + auto a = DoubleImm::make(k); + auto b = BitCast::make(kLong, a); + LLVMExprEval cg(b); + ASSERT_EQ(cg.value(), ref64); + } + + { + KernelScope kernel_scope; + int64_t k = raw_bitcast(reff64); + auto a = LongImm::make(k); + auto b = BitCast::make(kDouble, a); + LLVMExprEval cg(b); + ASSERT_EQ(cg.value(), reff64); + } + + { + KernelScope kernel_scope; + int32_t k = raw_bitcast(reff32); + auto a = IntImm::make(k); + auto b = BitCast::make(kFloat, a); + LLVMExprEval cg(b); + ASSERT_EQ(cg.value(), reff32); + } +} + TEST(LLVM, LetTest01) { KernelScope kernel_scope; @@ -514,6 +571,32 @@ TEST(LLVM, VectorizerLoadStoreTest) { assertAllEqual(c_vec, 21); } +TEST(LLVM, VectorizeBitCast) { + KernelScope kernel_scope; + Placeholder a(BufHandle("A", {128}, kInt)); + + Tensor* c = Compute("c", {{128, "i"}}, [&](const VarHandle& i) { + return bitcast(a.load(i)); + }); + + Placeholder c_buf(BufHandle(c->buf())); + LoopNest l({c}); + Stmt* s = l.root_stmt(); + l.vectorize(dynamic_cast(s)->front()); + ASSERT_TRUE(dynamic_cast(dynamic_cast(s)->front()) == nullptr); + + LLVMCodeGen cg(s, {a, c_buf}); + + std::vector a_vec(128); + std::vector c_vec(128); + for (auto i = 0; i < 128; ++i) { + a_vec[i] = raw_bitcast(1337.f); + } + std::vector args({a_vec.data(), c_vec.data()}); + ASSERT_EQ(cg.value(args), 0); + assertAllEqual(c_vec, 1337.f); +} + TEST(LLVM, MemcpyTest) { KernelScope kernel_scope; constexpr int N = 32; diff --git a/test/cpp/tensorexpr/test_type.cpp b/test/cpp/tensorexpr/test_type.cpp index 0c771733d935..71ad0f5149ac 100644 --- a/test/cpp/tensorexpr/test_type.cpp +++ b/test/cpp/tensorexpr/test_type.cpp @@ -1,5 +1,6 @@ #include +#include "torch/csrc/jit/tensorexpr/eval.h" #include "torch/csrc/jit/tensorexpr/ir.h" #include "torch/csrc/jit/tensorexpr/tensor.h" @@ -42,6 +43,115 @@ TEST(Type, Test01) { } } +TEST(Type, BitCasting) { + { + KernelScope kernel_scope; + VarHandle x("x", kFloat); + ExprHandle y = bitcast(x); + ASSERT_EQ(y.dtype(), kInt); + } + { + KernelScope kernel_scope; + VarHandle x("x", kInt); + ExprHandle y = bitcast(x); + ASSERT_EQ(y.dtype(), kFloat); + } + { + KernelScope kernel_scope; + VarHandle x("x", kShort); + ExprHandle y = bitcast(x); + ASSERT_EQ(y.dtype(), kHalf); + } + { + KernelScope kernel_scope; + VarHandle x("x", kHalf); + ExprHandle y = bitcast(x); + ASSERT_EQ(y.dtype(), kShort); + } + + constexpr int16_t ref16 = 1337; + constexpr int32_t ref32 = 1337; + constexpr int64_t ref64 = 1337; + at::Half reff16 = 1337.0f; + constexpr float reff32 = 1337.0f; + constexpr double reff64 = 1337.0f; + using SimpleIRExprEval = ExprEval; + // this is broken + /*{ + KernelScope kernel_scope; + at::Half k_; + at::Half* k = &k_; + *reinterpret_cast(k) = ref16; + auto a = HalfImm::make(*k); + auto b = BitCast::make(kShort, a); + SimpleIRExprEval cg(b); + ASSERT_EQ(cg.value(), ref16); + }*/ + + { + KernelScope kernel_scope; + float k = raw_bitcast(ref32); + auto a = FloatImm::make(k); + auto b = BitCast::make(kInt, a); + SimpleIRExprEval cg(b); + ASSERT_EQ(cg.value(), ref32); + } + + { + KernelScope kernel_scope; + double k = raw_bitcast(ref64); + auto a = DoubleImm::make(k); + auto b = BitCast::make(kLong, a); + SimpleIRExprEval cg(b); + ASSERT_EQ(cg.value(), ref64); + } + + { + KernelScope kernel_scope; + int64_t k = raw_bitcast(reff64); + auto a = LongImm::make(k); + auto b = BitCast::make(kDouble, a); + SimpleIRExprEval cg(b); + ASSERT_EQ(cg.value(), reff64); + } + + { + KernelScope kernel_scope; + int32_t k = raw_bitcast(reff32); + auto a = IntImm::make(k); + auto b 
= BitCast::make(kFloat, a); + SimpleIRExprEval cg(b); + ASSERT_EQ(cg.value(), reff32); + } + + // This segfaults :( + /*{ + KernelScope kernel_scope; + VarHandle x("x", kDouble); + ASSERT_ANY_THROW(ExprHandle y = bitcast(x)); + } + { + KernelScope kernel_scope; + VarHandle x("x", kFloat); + ASSERT_ANY_THROW(ExprHandle y = bitcast(x)); + } + { + KernelScope kernel_scope; + VarHandle x("x", kLong); + ASSERT_ANY_THROW(ExprHandle y = bitcast(x)); + } + { + KernelScope kernel_scope; + VarHandle x("x", kShort); + ASSERT_ANY_THROW(ExprHandle y = bitcast(x)); + } + { + KernelScope kernel_scope; + VarHandle x("x", kInt); + ASSERT_ANY_THROW(ExprHandle y = bitcast(x)); + }*/ +} + TEST(Type, Propagation) { // Same types: { diff --git a/torch/csrc/jit/tensorexpr/eval.h b/torch/csrc/jit/tensorexpr/eval.h index 7b8a4c194782..4db71c61f9f9 100644 --- a/torch/csrc/jit/tensorexpr/eval.h +++ b/torch/csrc/jit/tensorexpr/eval.h @@ -124,6 +124,14 @@ inline c10::Half div_value(c10::Half lhs, c10::Half rhs) { return lhs / rhs; } +template +constexpr To raw_bitcast(const From& from) noexcept { + TORCH_CHECK(sizeof(To) == sizeof(From), "Invalid bitcast invocation"); + To ret; + memcpy(&ret, &from, sizeof(From)); + return ret; +} + class SimpleIREvaluator : public CodeGen, public IRVisitor { public: template @@ -573,6 +581,55 @@ class SimpleIREvaluator : public CodeGen, public IRVisitor { } } + template + std::vector bitcastValues(const Dtype& src_dtype, const Value& v) { + const std::vector& src_values = v.as_vec(); + std::vector dst_values(src_values.size()); + for (int i = 0; i < src_dtype.lanes(); ++i) { + dst_values[i] = raw_bitcast(src_values[i]); + } + return dst_values; + } + + template + void doBitCastFromSrc( + const Dtype& src_dtype, + const Dtype& dst_dtype, + const Value& v) { + switch (dst_dtype.scalar_type()) { +#define DST_TYPE_CASE(Type, Name) \ + case ScalarType::Name: \ + this->value_ = Value(bitcastValues(src_dtype, v)); \ + break; + AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, DST_TYPE_CASE); +#undef DST_TYPE_CASE + default: + throw unsupported_dtype(); + } + } + + TORCH_API void visit(const BitCast* v) override { + const Expr* src_value = v->src_value(); + src_value->accept(this); + Dtype dst_dtype = v->dtype(); + Dtype src_dtype = src_value->dtype(); + if (src_dtype.byte_size() != dst_dtype.byte_size()) { + throw malformed_input("lane mismatch in Cast", v); + } + if (src_dtype != dst_dtype) { + switch (src_dtype.scalar_type()) { +#define SRC_TYPE_CASE(Type, Name) \ + case ScalarType::Name: \ + doBitCastFromSrc(src_dtype, dst_dtype, value_); \ + break; + AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, SRC_TYPE_CASE); +#undef SRC_TYPE_CASE + default: + throw unsupported_dtype(); + } + } + } + TORCH_API void visit(const For* v) override { const Expr* var_node = v->var(); v->start()->accept(this); diff --git a/torch/csrc/jit/tensorexpr/expr.h b/torch/csrc/jit/tensorexpr/expr.h index 9b8dd23db0b1..cd05333656c0 100644 --- a/torch/csrc/jit/tensorexpr/expr.h +++ b/torch/csrc/jit/tensorexpr/expr.h @@ -31,6 +31,7 @@ enum IRNodeType { kCompareSelect, kLet, kCast, + kBitCast, kBroadcast, kRamp, kPolynomial, diff --git a/torch/csrc/jit/tensorexpr/ir.h b/torch/csrc/jit/tensorexpr/ir.h index 7eeea564a6a7..6fe4bf0e2ebd 100644 --- a/torch/csrc/jit/tensorexpr/ir.h +++ b/torch/csrc/jit/tensorexpr/ir.h @@ -28,6 +28,7 @@ inline int getPrecedence(IRNodeType ty) { case kPrimitive: return 0; case kCast: + case kBitCast: return 2; case kAdd: case kSub: @@ -81,6 +82,34 @@ ExprHandle cast(const ExprHandle& src_value) { return 
Cast::make(Dtype(ToDtype(), src_value.dtype().lanes()), src_value); } +// This is a bitwise cast, akin to bitcast in LLVM +class BitCast : public ExprNode { + public: + const Expr* src_value() const { + return src_value_; + } + static ExprHandle make(Dtype dtype, const ExprHandle& src_value) { + return ExprHandle(new BitCast(dtype, src_value.node())); + } + BitCast(Dtype dtype, const Expr* src_value) + : ExprNodeBase(dtype, kBitCast), src_value_(src_value) { + TORCH_CHECK(src_value_->dtype().byte_size() == dtype.byte_size()); + } + + bool isConstant() const override { + return src_value_->isConstant(); + } + + private: + const Expr* src_value_; +}; + +template +ExprHandle bitcast(const ExprHandle& src_value) { + return BitCast::make( + Dtype(ToDtype(), src_value.dtype().lanes()), src_value); +} + // Represent the expression node for binary operators. // A CRTP pattern to share common code among the operators. template diff --git a/torch/csrc/jit/tensorexpr/ir_mutator.cpp b/torch/csrc/jit/tensorexpr/ir_mutator.cpp index 5f0889842b1e..ddbe88bb2c8f 100644 --- a/torch/csrc/jit/tensorexpr/ir_mutator.cpp +++ b/torch/csrc/jit/tensorexpr/ir_mutator.cpp @@ -139,6 +139,15 @@ const Expr* IRMutator::mutate(const Cast* v) { return new Cast(v->dtype(), src_value_new); } +const Expr* IRMutator::mutate(const BitCast* v) { + const Expr* src_value = v->src_value(); + const Expr* src_value_new = src_value->accept_mutator(this); + if (src_value_new == v->src_value()) { + return v; + } + return new BitCast(v->dtype(), src_value_new); +} + const Expr* IRMutator::mutate(const Var* v) { return v; } diff --git a/torch/csrc/jit/tensorexpr/ir_mutator.h b/torch/csrc/jit/tensorexpr/ir_mutator.h index 0913da0e972d..773920cb52fa 100644 --- a/torch/csrc/jit/tensorexpr/ir_mutator.h +++ b/torch/csrc/jit/tensorexpr/ir_mutator.h @@ -26,6 +26,7 @@ AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, IMM_DECLARE); #undef IMM_DECLARE class Cast; +class BitCast; class Var; class Buf; class Ramp; @@ -75,6 +76,7 @@ class TORCH_API IRMutator { AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, IMM_MUTATE_DECLARE); #undef IMM_MUTATE_DECLARE virtual const Expr* mutate(const Cast* v); + virtual const Expr* mutate(const BitCast* v); virtual const Expr* mutate(const Var* v); virtual const Expr* mutate(const Buf* v); virtual const Expr* mutate(const Ramp* v); diff --git a/torch/csrc/jit/tensorexpr/ir_visitor.cpp b/torch/csrc/jit/tensorexpr/ir_visitor.cpp index ae97a6200d8b..772a28c77add 100644 --- a/torch/csrc/jit/tensorexpr/ir_visitor.cpp +++ b/torch/csrc/jit/tensorexpr/ir_visitor.cpp @@ -79,6 +79,9 @@ AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, IMM_VISIT); void IRVisitor::visit(const Cast* v) { v->src_value()->accept(this); } +void IRVisitor::visit(const BitCast* v) { + v->src_value()->accept(this); +} void IRVisitor::visit(const Var* v) {} void IRVisitor::visit(const Ramp* v) { diff --git a/torch/csrc/jit/tensorexpr/ir_visitor.h b/torch/csrc/jit/tensorexpr/ir_visitor.h index 3f5f05229c16..8353da680edb 100644 --- a/torch/csrc/jit/tensorexpr/ir_visitor.h +++ b/torch/csrc/jit/tensorexpr/ir_visitor.h @@ -26,6 +26,7 @@ AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, IMM_DECLARE) #undef IMM_DECLARE class Cast; +class BitCast; class Var; class Buf; class Ramp; @@ -74,6 +75,7 @@ class TORCH_API IRVisitor { #undef IMM_PRINT_VISIT virtual void visit(const Cast* v); + virtual void visit(const BitCast* v); virtual void visit(const Var* v); virtual void visit(const Buf* v); virtual void visit(const Ramp* v); diff --git a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp 
b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp index cb14b9ef4c07..d469a39cf69d 100644 --- a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp +++ b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp @@ -164,6 +164,7 @@ class LLVMCodeGenImpl : public IRVisitor { #undef IMM_VISIT_DECLARE void visit(const Cast* v) override; + void visit(const BitCast* v) override; void visit(const Var* v) override; void visit(const Ramp* v) override; void visit(const Load* v) override; @@ -888,6 +889,25 @@ void LLVMCodeGenImpl::visit(const Cast* v) { } } +void LLVMCodeGenImpl::visit(const BitCast* v) { + v->src_value()->accept(this); + + llvm::Type* dstType = dtypeToLLVM(v->dtype()); + if (v->dtype().lanes() > 1) { + dstType = llvm::VectorType::get(dstType, ElementCount(v->dtype().lanes())); + } + llvm::Type* srcType = dtypeToLLVM(v->src_value()->dtype()); + + if (srcType == dstType) { + // do nothing. + return; + } + + TORCH_CHECK(llvm::CastInst::isBitCastable( + srcType->getScalarType(), dstType->getScalarType())); + value_ = irb_.CreateBitOrPointerCast(value_, dstType); +} + void LLVMCodeGenImpl::visit(const Var* v) { if (varToArg_.count(v)) { auto idx = varToArg_.at(v); diff --git a/torch/csrc/jit/tensorexpr/loopnest.cpp b/torch/csrc/jit/tensorexpr/loopnest.cpp index 96df28625bec..a70cb99638e6 100644 --- a/torch/csrc/jit/tensorexpr/loopnest.cpp +++ b/torch/csrc/jit/tensorexpr/loopnest.cpp @@ -154,6 +154,14 @@ class Vectorizer : public IRMutator { }); } + const Expr* mutate(const BitCast* v) override { + std::vector inputs = {v->src_value()}; + return try_vectorize(v, inputs, [&]() { + return BitCast::make( + Dtype(v->dtype().scalar_type(), lanes_), ExprHandle(inputs[0])); + }); + } + const Expr* mutate(const Cast* v) override { std::vector inputs = {v->src_value()}; return try_vectorize(v, inputs, [&]() { From 195b92bfa6b171118f7c22a6d8f1e15e41af450a Mon Sep 17 00:00:00 2001 From: Bram Wasti Date: Thu, 10 Dec 2020 09:27:56 -0800 Subject: [PATCH 123/250] Revert D25441716: [te] Add BitCast to the IR Test Plan: revert-hammer Differential Revision: D25441716 (https://github.com/pytorch/pytorch/commit/3384145418205b8554b3e0a7c4bbb5c15f5e2333) Original commit changeset: c97b871697bc fbshipit-source-id: e6eff02e28e1ae8c826dd2cfed79f869839ed2ba --- test/cpp/tensorexpr/test_llvm.cpp | 83 ---------------- test/cpp/tensorexpr/test_type.cpp | 110 --------------------- torch/csrc/jit/tensorexpr/eval.h | 57 ----------- torch/csrc/jit/tensorexpr/expr.h | 1 - torch/csrc/jit/tensorexpr/ir.h | 29 ------ torch/csrc/jit/tensorexpr/ir_mutator.cpp | 9 -- torch/csrc/jit/tensorexpr/ir_mutator.h | 2 - torch/csrc/jit/tensorexpr/ir_visitor.cpp | 3 - torch/csrc/jit/tensorexpr/ir_visitor.h | 2 - torch/csrc/jit/tensorexpr/llvm_codegen.cpp | 20 ---- torch/csrc/jit/tensorexpr/loopnest.cpp | 8 -- 11 files changed, 324 deletions(-) diff --git a/test/cpp/tensorexpr/test_llvm.cpp b/test/cpp/tensorexpr/test_llvm.cpp index c1d3392fff32..953c184de1fc 100644 --- a/test/cpp/tensorexpr/test_llvm.cpp +++ b/test/cpp/tensorexpr/test_llvm.cpp @@ -160,63 +160,6 @@ TEST(LLVM, ByteToDoubleCastTest) { ASSERT_EQ(cg.value(), 2); } -TEST(LLVM, BitCast) { - constexpr int16_t ref16 = 1337; - constexpr int32_t ref32 = 1337; - constexpr int64_t ref64 = 1337; - at::Half reff16 = 1337.0f; - constexpr float reff32 = 1337.0f; - constexpr double reff64 = 1337.0f; - - // this is broken - /*{ - KernelScope kernel_scope; - at::Half k_; - at::Half* k = &k_; - *reinterpret_cast(k) = ref16; - auto a = HalfImm::make(k); - auto b = BitCast::make(kShort, a); - LLVMExprEval cg(b); - 
ASSERT_EQ(cg.value(), ref16); - }*/ - - { - KernelScope kernel_scope; - float k = raw_bitcast(ref32); - auto a = FloatImm::make(k); - auto b = BitCast::make(kInt, a); - LLVMExprEval cg(b); - ASSERT_EQ(cg.value(), ref32); - } - - { - KernelScope kernel_scope; - double k = raw_bitcast(ref64); - auto a = DoubleImm::make(k); - auto b = BitCast::make(kLong, a); - LLVMExprEval cg(b); - ASSERT_EQ(cg.value(), ref64); - } - - { - KernelScope kernel_scope; - int64_t k = raw_bitcast(reff64); - auto a = LongImm::make(k); - auto b = BitCast::make(kDouble, a); - LLVMExprEval cg(b); - ASSERT_EQ(cg.value(), reff64); - } - - { - KernelScope kernel_scope; - int32_t k = raw_bitcast(reff32); - auto a = IntImm::make(k); - auto b = BitCast::make(kFloat, a); - LLVMExprEval cg(b); - ASSERT_EQ(cg.value(), reff32); - } -} - TEST(LLVM, LetTest01) { KernelScope kernel_scope; @@ -571,32 +514,6 @@ TEST(LLVM, VectorizerLoadStoreTest) { assertAllEqual(c_vec, 21); } -TEST(LLVM, VectorizeBitCast) { - KernelScope kernel_scope; - Placeholder a(BufHandle("A", {128}, kInt)); - - Tensor* c = Compute("c", {{128, "i"}}, [&](const VarHandle& i) { - return bitcast(a.load(i)); - }); - - Placeholder c_buf(BufHandle(c->buf())); - LoopNest l({c}); - Stmt* s = l.root_stmt(); - l.vectorize(dynamic_cast(s)->front()); - ASSERT_TRUE(dynamic_cast(dynamic_cast(s)->front()) == nullptr); - - LLVMCodeGen cg(s, {a, c_buf}); - - std::vector a_vec(128); - std::vector c_vec(128); - for (auto i = 0; i < 128; ++i) { - a_vec[i] = raw_bitcast(1337.f); - } - std::vector args({a_vec.data(), c_vec.data()}); - ASSERT_EQ(cg.value(args), 0); - assertAllEqual(c_vec, 1337.f); -} - TEST(LLVM, MemcpyTest) { KernelScope kernel_scope; constexpr int N = 32; diff --git a/test/cpp/tensorexpr/test_type.cpp b/test/cpp/tensorexpr/test_type.cpp index 71ad0f5149ac..0c771733d935 100644 --- a/test/cpp/tensorexpr/test_type.cpp +++ b/test/cpp/tensorexpr/test_type.cpp @@ -1,6 +1,5 @@ #include -#include "torch/csrc/jit/tensorexpr/eval.h" #include "torch/csrc/jit/tensorexpr/ir.h" #include "torch/csrc/jit/tensorexpr/tensor.h" @@ -43,115 +42,6 @@ TEST(Type, Test01) { } } -TEST(Type, BitCasting) { - { - KernelScope kernel_scope; - VarHandle x("x", kFloat); - ExprHandle y = bitcast(x); - ASSERT_EQ(y.dtype(), kInt); - } - { - KernelScope kernel_scope; - VarHandle x("x", kInt); - ExprHandle y = bitcast(x); - ASSERT_EQ(y.dtype(), kFloat); - } - { - KernelScope kernel_scope; - VarHandle x("x", kShort); - ExprHandle y = bitcast(x); - ASSERT_EQ(y.dtype(), kHalf); - } - { - KernelScope kernel_scope; - VarHandle x("x", kHalf); - ExprHandle y = bitcast(x); - ASSERT_EQ(y.dtype(), kShort); - } - - constexpr int16_t ref16 = 1337; - constexpr int32_t ref32 = 1337; - constexpr int64_t ref64 = 1337; - at::Half reff16 = 1337.0f; - constexpr float reff32 = 1337.0f; - constexpr double reff64 = 1337.0f; - using SimpleIRExprEval = ExprEval; - // this is broken - /*{ - KernelScope kernel_scope; - at::Half k_; - at::Half* k = &k_; - *reinterpret_cast(k) = ref16; - auto a = HalfImm::make(*k); - auto b = BitCast::make(kShort, a); - SimpleIRExprEval cg(b); - ASSERT_EQ(cg.value(), ref16); - }*/ - - { - KernelScope kernel_scope; - float k = raw_bitcast(ref32); - auto a = FloatImm::make(k); - auto b = BitCast::make(kInt, a); - SimpleIRExprEval cg(b); - ASSERT_EQ(cg.value(), ref32); - } - - { - KernelScope kernel_scope; - double k = raw_bitcast(ref64); - auto a = DoubleImm::make(k); - auto b = BitCast::make(kLong, a); - SimpleIRExprEval cg(b); - ASSERT_EQ(cg.value(), ref64); - } - - { - KernelScope kernel_scope; 
- int64_t k = raw_bitcast(reff64); - auto a = LongImm::make(k); - auto b = BitCast::make(kDouble, a); - SimpleIRExprEval cg(b); - ASSERT_EQ(cg.value(), reff64); - } - - { - KernelScope kernel_scope; - int32_t k = raw_bitcast(reff32); - auto a = IntImm::make(k); - auto b = BitCast::make(kFloat, a); - SimpleIRExprEval cg(b); - ASSERT_EQ(cg.value(), reff32); - } - - // This segfaults :( - /*{ - KernelScope kernel_scope; - VarHandle x("x", kDouble); - ASSERT_ANY_THROW(ExprHandle y = bitcast(x)); - } - { - KernelScope kernel_scope; - VarHandle x("x", kFloat); - ASSERT_ANY_THROW(ExprHandle y = bitcast(x)); - } - { - KernelScope kernel_scope; - VarHandle x("x", kLong); - ASSERT_ANY_THROW(ExprHandle y = bitcast(x)); - } - { - KernelScope kernel_scope; - VarHandle x("x", kShort); - ASSERT_ANY_THROW(ExprHandle y = bitcast(x)); - } - { - KernelScope kernel_scope; - VarHandle x("x", kInt); - ASSERT_ANY_THROW(ExprHandle y = bitcast(x)); - }*/ -} - TEST(Type, Propagation) { // Same types: { diff --git a/torch/csrc/jit/tensorexpr/eval.h b/torch/csrc/jit/tensorexpr/eval.h index 4db71c61f9f9..7b8a4c194782 100644 --- a/torch/csrc/jit/tensorexpr/eval.h +++ b/torch/csrc/jit/tensorexpr/eval.h @@ -124,14 +124,6 @@ inline c10::Half div_value(c10::Half lhs, c10::Half rhs) { return lhs / rhs; } -template -constexpr To raw_bitcast(const From& from) noexcept { - TORCH_CHECK(sizeof(To) == sizeof(From), "Invalid bitcast invocation"); - To ret; - memcpy(&ret, &from, sizeof(From)); - return ret; -} - class SimpleIREvaluator : public CodeGen, public IRVisitor { public: template @@ -581,55 +573,6 @@ class SimpleIREvaluator : public CodeGen, public IRVisitor { } } - template - std::vector bitcastValues(const Dtype& src_dtype, const Value& v) { - const std::vector& src_values = v.as_vec(); - std::vector dst_values(src_values.size()); - for (int i = 0; i < src_dtype.lanes(); ++i) { - dst_values[i] = raw_bitcast(src_values[i]); - } - return dst_values; - } - - template - void doBitCastFromSrc( - const Dtype& src_dtype, - const Dtype& dst_dtype, - const Value& v) { - switch (dst_dtype.scalar_type()) { -#define DST_TYPE_CASE(Type, Name) \ - case ScalarType::Name: \ - this->value_ = Value(bitcastValues(src_dtype, v)); \ - break; - AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, DST_TYPE_CASE); -#undef DST_TYPE_CASE - default: - throw unsupported_dtype(); - } - } - - TORCH_API void visit(const BitCast* v) override { - const Expr* src_value = v->src_value(); - src_value->accept(this); - Dtype dst_dtype = v->dtype(); - Dtype src_dtype = src_value->dtype(); - if (src_dtype.byte_size() != dst_dtype.byte_size()) { - throw malformed_input("lane mismatch in Cast", v); - } - if (src_dtype != dst_dtype) { - switch (src_dtype.scalar_type()) { -#define SRC_TYPE_CASE(Type, Name) \ - case ScalarType::Name: \ - doBitCastFromSrc(src_dtype, dst_dtype, value_); \ - break; - AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, SRC_TYPE_CASE); -#undef SRC_TYPE_CASE - default: - throw unsupported_dtype(); - } - } - } - TORCH_API void visit(const For* v) override { const Expr* var_node = v->var(); v->start()->accept(this); diff --git a/torch/csrc/jit/tensorexpr/expr.h b/torch/csrc/jit/tensorexpr/expr.h index cd05333656c0..9b8dd23db0b1 100644 --- a/torch/csrc/jit/tensorexpr/expr.h +++ b/torch/csrc/jit/tensorexpr/expr.h @@ -31,7 +31,6 @@ enum IRNodeType { kCompareSelect, kLet, kCast, - kBitCast, kBroadcast, kRamp, kPolynomial, diff --git a/torch/csrc/jit/tensorexpr/ir.h b/torch/csrc/jit/tensorexpr/ir.h index 6fe4bf0e2ebd..7eeea564a6a7 100644 --- 
a/torch/csrc/jit/tensorexpr/ir.h +++ b/torch/csrc/jit/tensorexpr/ir.h @@ -28,7 +28,6 @@ inline int getPrecedence(IRNodeType ty) { case kPrimitive: return 0; case kCast: - case kBitCast: return 2; case kAdd: case kSub: @@ -82,34 +81,6 @@ ExprHandle cast(const ExprHandle& src_value) { return Cast::make(Dtype(ToDtype(), src_value.dtype().lanes()), src_value); } -// This is a bitwise cast, akin to bitcast in LLVM -class BitCast : public ExprNode { - public: - const Expr* src_value() const { - return src_value_; - } - static ExprHandle make(Dtype dtype, const ExprHandle& src_value) { - return ExprHandle(new BitCast(dtype, src_value.node())); - } - BitCast(Dtype dtype, const Expr* src_value) - : ExprNodeBase(dtype, kBitCast), src_value_(src_value) { - TORCH_CHECK(src_value_->dtype().byte_size() == dtype.byte_size()); - } - - bool isConstant() const override { - return src_value_->isConstant(); - } - - private: - const Expr* src_value_; -}; - -template -ExprHandle bitcast(const ExprHandle& src_value) { - return BitCast::make( - Dtype(ToDtype(), src_value.dtype().lanes()), src_value); -} - // Represent the expression node for binary operators. // A CRTP pattern to share common code among the operators. template diff --git a/torch/csrc/jit/tensorexpr/ir_mutator.cpp b/torch/csrc/jit/tensorexpr/ir_mutator.cpp index ddbe88bb2c8f..5f0889842b1e 100644 --- a/torch/csrc/jit/tensorexpr/ir_mutator.cpp +++ b/torch/csrc/jit/tensorexpr/ir_mutator.cpp @@ -139,15 +139,6 @@ const Expr* IRMutator::mutate(const Cast* v) { return new Cast(v->dtype(), src_value_new); } -const Expr* IRMutator::mutate(const BitCast* v) { - const Expr* src_value = v->src_value(); - const Expr* src_value_new = src_value->accept_mutator(this); - if (src_value_new == v->src_value()) { - return v; - } - return new BitCast(v->dtype(), src_value_new); -} - const Expr* IRMutator::mutate(const Var* v) { return v; } diff --git a/torch/csrc/jit/tensorexpr/ir_mutator.h b/torch/csrc/jit/tensorexpr/ir_mutator.h index 773920cb52fa..0913da0e972d 100644 --- a/torch/csrc/jit/tensorexpr/ir_mutator.h +++ b/torch/csrc/jit/tensorexpr/ir_mutator.h @@ -26,7 +26,6 @@ AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, IMM_DECLARE); #undef IMM_DECLARE class Cast; -class BitCast; class Var; class Buf; class Ramp; @@ -76,7 +75,6 @@ class TORCH_API IRMutator { AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, IMM_MUTATE_DECLARE); #undef IMM_MUTATE_DECLARE virtual const Expr* mutate(const Cast* v); - virtual const Expr* mutate(const BitCast* v); virtual const Expr* mutate(const Var* v); virtual const Expr* mutate(const Buf* v); virtual const Expr* mutate(const Ramp* v); diff --git a/torch/csrc/jit/tensorexpr/ir_visitor.cpp b/torch/csrc/jit/tensorexpr/ir_visitor.cpp index 772a28c77add..ae97a6200d8b 100644 --- a/torch/csrc/jit/tensorexpr/ir_visitor.cpp +++ b/torch/csrc/jit/tensorexpr/ir_visitor.cpp @@ -79,9 +79,6 @@ AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, IMM_VISIT); void IRVisitor::visit(const Cast* v) { v->src_value()->accept(this); } -void IRVisitor::visit(const BitCast* v) { - v->src_value()->accept(this); -} void IRVisitor::visit(const Var* v) {} void IRVisitor::visit(const Ramp* v) { diff --git a/torch/csrc/jit/tensorexpr/ir_visitor.h b/torch/csrc/jit/tensorexpr/ir_visitor.h index 8353da680edb..3f5f05229c16 100644 --- a/torch/csrc/jit/tensorexpr/ir_visitor.h +++ b/torch/csrc/jit/tensorexpr/ir_visitor.h @@ -26,7 +26,6 @@ AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, IMM_DECLARE) #undef IMM_DECLARE class Cast; -class BitCast; class Var; class Buf; class Ramp; @@ -75,7 +74,6 @@ class 
TORCH_API IRVisitor { #undef IMM_PRINT_VISIT virtual void visit(const Cast* v); - virtual void visit(const BitCast* v); virtual void visit(const Var* v); virtual void visit(const Buf* v); virtual void visit(const Ramp* v); diff --git a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp index d469a39cf69d..cb14b9ef4c07 100644 --- a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp +++ b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp @@ -164,7 +164,6 @@ class LLVMCodeGenImpl : public IRVisitor { #undef IMM_VISIT_DECLARE void visit(const Cast* v) override; - void visit(const BitCast* v) override; void visit(const Var* v) override; void visit(const Ramp* v) override; void visit(const Load* v) override; @@ -889,25 +888,6 @@ void LLVMCodeGenImpl::visit(const Cast* v) { } } -void LLVMCodeGenImpl::visit(const BitCast* v) { - v->src_value()->accept(this); - - llvm::Type* dstType = dtypeToLLVM(v->dtype()); - if (v->dtype().lanes() > 1) { - dstType = llvm::VectorType::get(dstType, ElementCount(v->dtype().lanes())); - } - llvm::Type* srcType = dtypeToLLVM(v->src_value()->dtype()); - - if (srcType == dstType) { - // do nothing. - return; - } - - TORCH_CHECK(llvm::CastInst::isBitCastable( - srcType->getScalarType(), dstType->getScalarType())); - value_ = irb_.CreateBitOrPointerCast(value_, dstType); -} - void LLVMCodeGenImpl::visit(const Var* v) { if (varToArg_.count(v)) { auto idx = varToArg_.at(v); diff --git a/torch/csrc/jit/tensorexpr/loopnest.cpp b/torch/csrc/jit/tensorexpr/loopnest.cpp index a70cb99638e6..96df28625bec 100644 --- a/torch/csrc/jit/tensorexpr/loopnest.cpp +++ b/torch/csrc/jit/tensorexpr/loopnest.cpp @@ -154,14 +154,6 @@ class Vectorizer : public IRMutator { }); } - const Expr* mutate(const BitCast* v) override { - std::vector inputs = {v->src_value()}; - return try_vectorize(v, inputs, [&]() { - return BitCast::make( - Dtype(v->dtype().scalar_type(), lanes_), ExprHandle(inputs[0])); - }); - } - const Expr* mutate(const Cast* v) override { std::vector inputs = {v->src_value()}; return try_vectorize(v, inputs, [&]() { From d5c4a80cfdc0b253c43d0a198f26abec9e35cdce Mon Sep 17 00:00:00 2001 From: Jeff Daily Date: Thu, 10 Dec 2020 09:53:27 -0800 Subject: [PATCH 124/250] Allow ROCm CI to use non-default stream. (#48424) Summary: Revert https://github.com/pytorch/pytorch/issues/26394. Fixes https://github.com/pytorch/pytorch/issues/27356. Not all MIOpen handles were setting their stream to the current stream prior to running the op. 
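For context, a minimal sketch of the call pattern this change removes and the one it introduces (the identifiers are taken from the diff below; the actual MIOpen launch is elided and purely illustrative):

    // Before: each call site had to pair the handle lookup with an explicit
    // stream rebind; a missed setMIOpenStreamToCurrent() silently ran the op
    // on the default stream.
    miopenHandle_t handle = at::native::getMiopenHandle();
    at::native::setMIOpenStreamToCurrent();
    // ... launch the MIOpen op with `handle` ...

    // After: getMiopenHandle() reserves a per-thread handle from a
    // DeviceThreadHandlePool and calls miopenSetStream() with the current HIP
    // stream on every lookup, so call sites only need the handle.
    miopenHandle_t handle = at::native::getMiopenHandle();
    // ... launch the MIOpen op with `handle`; it runs on the current stream ...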
Pull Request resolved: https://github.com/pytorch/pytorch/pull/48424 Reviewed By: H-Huang Differential Revision: D25420384 Pulled By: mruberry fbshipit-source-id: 051683ba9e3d264b71162bd344031a0c58bf6a41 --- aten/src/ATen/miopen/Handle.cpp | 62 ++++++++++++------- aten/src/ATen/miopen/Utils.h | 6 -- .../ATen/native/miopen/BatchNorm_miopen.cpp | 2 - aten/src/ATen/native/miopen/Conv_miopen.cpp | 9 --- aten/src/ATen/native/miopen/RNN_miopen.cpp | 4 -- test/test_cuda.py | 1 - torch/testing/_internal/common_utils.py | 2 +- 7 files changed, 39 insertions(+), 47 deletions(-) diff --git a/aten/src/ATen/miopen/Handle.cpp b/aten/src/ATen/miopen/Handle.cpp index 8965ef5a2cce..6b8c7c6421c4 100644 --- a/aten/src/ATen/miopen/Handle.cpp +++ b/aten/src/ATen/miopen/Handle.cpp @@ -1,39 +1,53 @@ -#include - #include - -#include -#include +#include +#include +#include namespace at { namespace native { - namespace { -struct Handle { - miopenHandle_t handle; - Handle() : handle(NULL) { - MIOPEN_CHECK(miopenCreate(&handle)); - } - ~Handle() { - if (handle) { - miopenDestroy(handle); - } - } -}; +void createMIOpenHandle(miopenHandle_t *handle) { + MIOPEN_CHECK(miopenCreate(handle)); +} -std::mutex mutex; -std::unordered_map handles; +void destroyMIOpenHandle(miopenHandle_t handle) { +// this is because of something dumb in the ordering of +// destruction. Sometimes atexit, the cuda context (or something) +// would already be destroyed by the time this gets destroyed. It +// happens in fbcode setting. @colesbury and I decided to not destroy +// the handle as a workaround. +// - @soumith +// +// Further note: this is now disabled globally, because we are seeing +// the same issue as mentioned above in CUDA 11 CI. +// - @zasdfgbnm +// +// #ifdef NO_MIOPEN_DESTROY_HANDLE +// #else +// miopenDestroy(handle); +// #endif +} -} // namespace +using MIOpenPoolType = at::cuda::DeviceThreadHandlePool; +} // namespace -miopenHandle_t getMiopenHandle() -{ +miopenHandle_t getMiopenHandle() { int device; HIP_CHECK(hipGetDevice(&device)); - std::lock_guard guard(mutex); - return handles[device].handle; + // Thread local PoolWindows are lazily-initialized + // to avoid initialization issues that caused hangs on Windows. + // See: https://github.com/pytorch/pytorch/pull/22405 + // This thread local unique_ptrs will be destroyed when the thread terminates, + // releasing its reserved handles back to the pool. + static auto pool = std::make_shared(); + thread_local std::unique_ptr myPoolWindow( + pool->newPoolWindow()); + + auto handle = myPoolWindow->reserve(device); + MIOPEN_CHECK(miopenSetStream(handle, at::hip::getCurrentHIPStream())); + return handle; } }} // namespace at::native diff --git a/aten/src/ATen/miopen/Utils.h b/aten/src/ATen/miopen/Utils.h index 90ee4b7a14ee..5952e4f4c796 100644 --- a/aten/src/ATen/miopen/Utils.h +++ b/aten/src/ATen/miopen/Utils.h @@ -7,12 +7,6 @@ namespace at { namespace native { -inline void setMIOpenStreamToCurrent() { - // NB: Due to in-place HIPify, getCurrentCUDAStream actually means - // getCurrentHIPStream - MIOPEN_CHECK(miopenSetStream(getMiopenHandle(), at::hip::getCurrentHIPStream())); -} - // This function makes tensors which have zero stride contiguous, by // setting the strides to 1. 
inline Tensor contiguousIfZeroInStrides(const Tensor& t) { diff --git a/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp b/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp index 0d4af95c7a76..92473ecc68c8 100644 --- a/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp +++ b/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp @@ -62,7 +62,6 @@ std::tuple miopen_batch_norm( running_mean{ running_mean_t, "running_mean", 4 }, running_var{ running_var_t, "running_var", 5 }; CheckedFrom c = "miopen_batch_norm"; - setMIOpenStreamToCurrent(); checkAllDefined(c, {input, weight, bias}); if (!training) { @@ -151,7 +150,6 @@ std::tuple miopen_batch_norm_backward( save_mean{ save_mean_t, "save_mean", 4 }, save_var{ save_var_t, "save_var", 5 }; CheckedFrom c = "miopen_batch_norm_backward"; - setMIOpenStreamToCurrent(); checkAllDefined(c, {input, grad_output, weight, save_mean, save_var}); checkAllSameGPU(c, {input, grad_output, weight, save_mean, save_var}); diff --git a/aten/src/ATen/native/miopen/Conv_miopen.cpp b/aten/src/ATen/native/miopen/Conv_miopen.cpp index 27e119d377bc..f0b0d6fdd5b7 100644 --- a/aten/src/ATen/native/miopen/Conv_miopen.cpp +++ b/aten/src/ATen/native/miopen/Conv_miopen.cpp @@ -624,7 +624,6 @@ Tensor miopen_convolution( TensorArg input { input_t, "input", 1 }, weight { weight_t, "weight", 2 }, bias { bias_t, "bias", 3 }; - setMIOpenStreamToCurrent(); CheckedFrom c = "miopen_convolution"; auto output_t = miopen_convolution_forward( c, input, weight, padding, stride, dilation, groups, benchmark, deterministic); @@ -699,7 +698,6 @@ Tensor miopen_depthwise_convolution( TensorArg input { input_t, "input", 1 }, weight { weight_t, "weight", 2 }, bias { bias_t, "bias", 3 }; - setMIOpenStreamToCurrent(); CheckedFrom c = "miopen_depthwise_convolution"; auto output_t = miopen_depthwise_convolution_forward( c, input, weight, padding, stride, dilation, groups, benchmark, deterministic); @@ -716,7 +714,6 @@ Tensor miopen_convolution_transpose_backward_input( { TensorArg grad_output { grad_output_t, "grad_output", 1 }, weight { weight_t, "weight", 2 }; - setMIOpenStreamToCurrent(); return miopen_convolution_forward( "miopen_convolution_transpose_backward_input", grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic); @@ -827,7 +824,6 @@ Tensor miopen_convolution_backward_input( { TensorArg grad_output{ grad_output_t, "grad_output", 1 }, weight{ weight_t, "weight", 2 }; - setMIOpenStreamToCurrent(); return miopen_convolution_backward_input( "miopen_convolution_backward_input", input_size, grad_output, weight, @@ -897,7 +893,6 @@ Tensor miopen_depthwise_convolution_backward_input( { TensorArg grad_output{ grad_output_t, "grad_output", 1 }, weight{ weight_t, "weight", 2 }; - setMIOpenStreamToCurrent(); return miopen_depthwise_convolution_backward_input( "miopen_depthwise_convolution_backward_input", input_size, grad_output, weight, @@ -1087,7 +1082,6 @@ Tensor miopen_convolution_backward_weight( { TensorArg grad_output{ grad_output_t, "grad_output", 1 }, input{ input_t, "input", 2 }; - setMIOpenStreamToCurrent(); return miopen_convolution_backward_weight( "miopen_convolution_backward_weight", weight_size, grad_output, input, @@ -1103,7 +1097,6 @@ Tensor miopen_convolution_transpose_backward_weight( { TensorArg grad_output{ grad_output_t, "grad_output", 1 }, input{ input_t, "input", 2 }; - setMIOpenStreamToCurrent(); return miopen_convolution_backward_weight( "miopen_convolution_backward_weight", weight_size, input, grad_output, @@ -1119,7 +1112,6 @@ Tensor 
miopen_depthwise_convolution_backward_weight( { TensorArg grad_output{ grad_output_t, "grad_output", 1 }, input{ input_t, "input", 2 }; - setMIOpenStreamToCurrent(); return miopen_depthwise_convolution_backward_weight( "miopen_depthwise_convolution_backward_weight", weight_size, grad_output, input, @@ -1136,7 +1128,6 @@ Tensor miopen_convolution_backward_bias( const Tensor& grad_output_t) { TensorArg grad_output{ grad_output_t, "grad_output", 1 }; - setMIOpenStreamToCurrent(); auto grad_bias_t = at::empty( { grad_output->size(output_channels_dim) }, grad_output->options()); diff --git a/aten/src/ATen/native/miopen/RNN_miopen.cpp b/aten/src/ATen/native/miopen/RNN_miopen.cpp index 1493cece3212..10b535f890ac 100644 --- a/aten/src/ATen/native/miopen/RNN_miopen.cpp +++ b/aten/src/ATen/native/miopen/RNN_miopen.cpp @@ -509,7 +509,6 @@ std::tuple miopen_rnn( size_t reserver_size; MIOPEN_CHECK(miopenGetRNNTrainingReserveSize(handle, descs.rnn_desc.desc(), fn.tensors.seq_length, x_descs_arr.data(), &reserver_size)); reserve = at::empty(reserver_size, input.options().dtype(kByte)); - setMIOpenStreamToCurrent(); MIOPEN_CHECK(miopenRNNForwardTraining(handle, descs.rnn_desc.desc(), fn.tensors.seq_length, x_descs_arr.data(), x.data_ptr(), descs.hx_desc.desc(), hx.data_ptr(), @@ -521,7 +520,6 @@ std::tuple miopen_rnn( workspace.data_ptr(), workspace_size, reserve.data_ptr(), reserver_size )); } else { //Inference. reserve = at::empty({0}, input.options().dtype(kByte)); - setMIOpenStreamToCurrent(); MIOPEN_CHECK(miopenRNNForwardInference(handle, descs.rnn_desc.desc(), fn.tensors.seq_length, x_descs_arr.data(), x.data_ptr(), descs.hx_desc.desc(), hx.data_ptr(), @@ -630,7 +628,6 @@ std::tuple miopen_rnn_backward_input( )); auto workspace = at::empty(workspace_size, input.options().dtype(kByte)); - setMIOpenStreamToCurrent(); MIOPEN_CHECK(miopenRNNBackwardData( handle, descs.rnn_desc.desc(), @@ -715,7 +712,6 @@ std::vector miopen_rnn_backward_weight( auto x_descs_arr = descs.get_x_descs(); auto y_descs_arr = descs.get_y_descs(); - setMIOpenStreamToCurrent(); MIOPEN_CHECK(miopenRNNBackwardWeights( handle, descs.rnn_desc.desc(), diff --git a/test/test_cuda.py b/test/test_cuda.py index 2a5754523876..6249c250ae2e 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -2421,7 +2421,6 @@ def _worker(t): self.assertEqual(results[t].sum().item(), size * size) @unittest.skipIf(not TEST_CUDNN, 'CUDNN not available') - @skipIfRocm def test_cudnn_multiple_threads_same_device(self): # This function is intended to test the lazy creation and reuse of per-thread # cudnn handles on each device in aten/src/ATen/cudnn/Handles.cpp. diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py index cf997ddb894b..80041a1c69de 100644 --- a/torch/testing/_internal/common_utils.py +++ b/torch/testing/_internal/common_utils.py @@ -831,7 +831,7 @@ def __init__(self, method_name='runTest'): # Wraps the tested method if we should enforce non default CUDA stream. 
self._do_cuda_non_default_stream &= getattr(test_method, '_do_cuda_non_default_stream', True) - if self._do_cuda_non_default_stream and not IS_WINDOWS and not TEST_WITH_ROCM: + if self._do_cuda_non_default_stream and not IS_WINDOWS: self.wrap_with_cuda_policy(method_name, self.enforceNonDefaultStream) def assertLeaksNoCudaTensors(self, name=None): From 45473ffe239349005b125f1f4ec7a55d79093f88 Mon Sep 17 00:00:00 2001 From: "Gao, Xiang" Date: Thu, 10 Dec 2020 10:00:53 -0800 Subject: [PATCH 125/250] Refactor cudnn convolution (#49109) Summary: cuDNN v7 API has been deprecated, so we need to migrate to cuDNN v8 API. The v8 API does not exist on cuDNN 7, so there will be a long time both API should exist. This is step 0 of adding cuDNN v8 API. There is no real code change in this PR. It just copy-pastes existing code. The original `Conv.cpp` is split into `ConvPlaceholders.cpp`, `ConvShared.cpp`, `ConvShared.h`, `Conv_v7.cpp`, `Conv_v8.cpp`. Currently `Conv_v8.cpp` is empty, and will be filled in the future. The `ConvPlaceholders.cpp` contains placeholder implementation of cudnn convolution when cudnn is not enabled. These operators only raise errors and do no real computation. This file also contains deprecated operators. These operators are implemented using current operators. The `ConvShared.cpp` and `ConvShared.h` contains code that will be shared by the v7 and v8 API, these include the definition of struct `ConvolutionParams` and `ConvolutionArgs`. As well as ATen exposed API like `cudnn_convolution` and intermediate `cudnn_convolution_forward`. These exposed functions will call raw API like `raw_cudnn_convolution_forward_out` in `Conv_v7.cpp` or `Conv_v8.cpp` for the real implementation. The `Conv_v7.cpp`, `Conv_v8.cpp` contains the implementation of raw APIs, and are different for v7 and v8. 
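To make the intended layering concrete, here is a rough sketch of the call chain this split sets up (signatures are abbreviated from the diff; the real parameter lists carry padding/stride/dilation/groups plus benchmark, deterministic and allow_tf32):

    // ConvShared.cpp: entry points and shared plumbing. They check arguments,
    // pick the memory format, allocate the output, then defer to the raw API.
    Tensor cudnn_convolution_forward(/* input, weight, conv params */) {
      // ... shape checks, contiguous(layout), at::empty(output_size, ...) ...
      raw_cudnn_convolution_forward_out(output, input_contig, weight_contig
                                        /* , conv params */);
      return output;
    }

    // ConvShared.h: the raw API both backends implement. Conv_v7.cpp provides
    // it today on top of the v7 descriptors; Conv_v8.cpp is expected to provide
    // the same signature once it is filled in.
    void raw_cudnn_convolution_forward_out(
        const Tensor& output, const Tensor& input, const Tensor& weight,
        /* padding, stride, dilation, groups, benchmark, deterministic, allow_tf32 */);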
Pull Request resolved: https://github.com/pytorch/pytorch/pull/49109 Reviewed By: H-Huang Differential Revision: D25463783 Pulled By: ezyang fbshipit-source-id: 1c80de8e5d94d97a61e45687f6193e8ff5481e3e --- .../ATen/native/cudnn/ConvPlaceholders.cpp | 147 +++++ aten/src/ATen/native/cudnn/ConvShared.cpp | 500 ++++++++++++++ aten/src/ATen/native/cudnn/ConvShared.h | 88 +++ .../native/cudnn/{Conv.cpp => Conv_v7.cpp} | 620 +----------------- aten/src/ATen/native/cudnn/Conv_v8.cpp | 5 + 5 files changed, 746 insertions(+), 614 deletions(-) create mode 100644 aten/src/ATen/native/cudnn/ConvPlaceholders.cpp create mode 100644 aten/src/ATen/native/cudnn/ConvShared.cpp create mode 100644 aten/src/ATen/native/cudnn/ConvShared.h rename aten/src/ATen/native/cudnn/{Conv.cpp => Conv_v7.cpp} (54%) create mode 100644 aten/src/ATen/native/cudnn/Conv_v8.cpp diff --git a/aten/src/ATen/native/cudnn/ConvPlaceholders.cpp b/aten/src/ATen/native/cudnn/ConvPlaceholders.cpp new file mode 100644 index 000000000000..bac8df92a5fc --- /dev/null +++ b/aten/src/ATen/native/cudnn/ConvPlaceholders.cpp @@ -0,0 +1,147 @@ +#include // for the definition of AT_CUDNN_ENABLED +#include +#include + +namespace at { namespace native { + +// --------------------------------------------------------------------- +// +// Placeholder operators +// +// --------------------------------------------------------------------- + +#if !AT_CUDNN_ENABLED() + +// See Note [ATen preprocessor philosophy] + +at::Tensor cudnn_convolution( + const at::Tensor& input, const at::Tensor& weight, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, + int64_t groups, bool benchmark, bool deterministic, bool allow_tf32) { + AT_ERROR("cudnn_convolution: ATen not compiled with cuDNN support"); +} + +at::Tensor cudnn_convolution_backward_input( + IntArrayRef input_size, const at::Tensor& grad_output, const at::Tensor& weight, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, + bool benchmark, bool deterministic, bool allow_tf32) { + AT_ERROR("cudnn_convolution_backward_input: ATen not compiled with cuDNN support"); +} + +at::Tensor cudnn_convolution_backward_weight( + IntArrayRef weight_size, const at::Tensor& grad_output, const at::Tensor& input, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, + bool benchmark, bool deterministic, bool allow_tf32) { + AT_ERROR("cudnn_convolution_backward_weight: ATen not compiled with cuDNN support"); +} + +std::tuple cudnn_convolution_backward( + const at::Tensor& input, const at::Tensor& grad_output, const at::Tensor& weight, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, + bool benchmark, bool deterministic, bool allow_tf32, std::array output_mask) { + AT_ERROR("cudnn_convolution_backward: ATen not compiled with cuDNN support"); +} + +at::Tensor cudnn_convolution_transpose( + const at::Tensor& input, const at::Tensor& weight, + IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, + int64_t groups, bool benchmark, bool deterministic, bool allow_tf32) { + AT_ERROR("cudnn_convolution_transpose: ATen not compiled with cuDNN support"); +} + +at::Tensor cudnn_convolution_transpose_backward_input( + const at::Tensor& grad_output, const at::Tensor& weight, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, + int64_t groups, bool benchmark, bool deterministic, bool allow_tf32) { + AT_ERROR("cudnn_convolution_transpose_backward: ATen not compiled with cuDNN support"); +} + 
+at::Tensor cudnn_convolution_transpose_backward_weight( + IntArrayRef weight_size, const at::Tensor& grad_output, const at::Tensor& input, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, + bool benchmark, bool deterministic, bool allow_tf32) { + AT_ERROR("cudnn_convolution_transpose_backward_weight: ATen not compiled with cuDNN support"); +} + +std::tuple cudnn_convolution_transpose_backward( + const at::Tensor& input, const at::Tensor& grad_output, const at::Tensor& weight, + IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, + bool benchmark, bool deterministic, bool allow_tf32, std::array output_mask) { + AT_ERROR("cudnn_convolution_transpose_backward: ATen not compiled with cuDNN support"); +} + +void raw_cudnn_convolution_forward_out( + const Tensor& output, const Tensor& input, const Tensor& weight, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, + bool benchmark, bool deterministic, bool allow_tf32) { + AT_ERROR("raw_cudnn_convolution_forward_out: ATen not compiled with cuDNN support"); +} + +void raw_cudnn_convolution_backward_input_out( + const at::Tensor& grad_input, + const at::Tensor& grad_output, + const at::Tensor& weight, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, + bool benchmark, bool deterministic, bool allow_tf32) { + AT_ERROR("raw_cudnn_convolution_backward_input_out: ATen not compiled with cuDNN support"); +} + +void raw_cudnn_convolution_backward_weight_out( + const Tensor& grad_weight, const Tensor& grad_output, const Tensor& input, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, + bool benchmark, bool deterministic, bool allow_tf32) { + AT_ERROR("raw_cudnn_convolution_backward_weight_out: ATen not compiled with cuDNN support"); +} + +#endif // AT_CUDNN_ENABLED + +// --------------------------------------------------------------------- +// +// Deprecated operators +// +// --------------------------------------------------------------------- + +// TODO (@zasdfgbnm): this is here only for compatibility, remove this in the future +Tensor cudnn_convolution_deprecated( + const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias /* optional */, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, + int64_t groups, bool benchmark, bool deterministic) { + auto output = at::cudnn_convolution(input, weight, padding, stride, dilation, groups, benchmark, deterministic); + if (bias.defined()) { + output = output + reshape_bias(input.dim(), bias); + } + return output; +} + +// TODO (@zasdfgbnm): this is here only for compatibility, remove this in the future +Tensor cudnn_convolution_deprecated2( + const Tensor& input_t, const Tensor& weight_t, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, + int64_t groups, bool benchmark, bool deterministic) +{ + return at::cudnn_convolution(input_t, weight_t, padding, stride, dilation, groups, benchmark, deterministic, at::globalContext().allowTF32CuDNN()); +} + +// TODO (@zasdfgbnm): this is here only for compatibility, remove this in the future +Tensor cudnn_convolution_transpose_deprecated( + const Tensor& input, const Tensor& weight, const Tensor& bias /* optional */, + IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, + int64_t groups, bool benchmark, bool deterministic) +{ + auto output = at::cudnn_convolution_transpose(input, weight, padding, output_padding, 
stride, dilation, groups, benchmark, deterministic); + if (bias.defined()) { + output = output + reshape_bias(input.dim(), bias); + } + return output; +} + +// TODO (@zasdfgbnm): this is here only for compatibility, remove this in the future +Tensor cudnn_convolution_transpose_deprecated2( + const Tensor& input_t, const Tensor& weight_t, + IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, + int64_t groups, bool benchmark, bool deterministic) +{ + return at::cudnn_convolution_transpose(input_t, weight_t, padding, output_padding, stride, dilation, groups, benchmark, deterministic, at::globalContext().allowTF32CuDNN()); +} + +}} diff --git a/aten/src/ATen/native/cudnn/ConvShared.cpp b/aten/src/ATen/native/cudnn/ConvShared.cpp new file mode 100644 index 000000000000..e360008e2707 --- /dev/null +++ b/aten/src/ATen/native/cudnn/ConvShared.cpp @@ -0,0 +1,500 @@ +#include // for the definition of AT_CUDNN_ENABLED + +#if AT_CUDNN_ENABLED() + +#include + +// NOTE [cuDNN API version] +// +// ConvPlaceholders.cpp contains placeholder implementation of cudnn +// convolution when cudnn is not enabled. These operators only raises +// errors, and do no real computation. This file also contains deprecated +// operators. These operators are implemented using currnet operators. +// +// cuDNN v7 and v8 have different API. ConvShared.{cpp, h} contains +// code shared by v7 and v8. Conv_v7.cpp contains implementation of +// convolution using cuDNN v7 API. Conv_v8.cpp contains implementation +// with v8 API. +// +// NOTE [ Convolution design ] +// +// cuDNN convolutions does not handle bias. Bias is handled outside. +// +// The general strategy: +// +// - cudnn_convolution (Tensor) +// Entry points for clients +// +// - cudnn_convolution_forward (TensorArg) +// Entry point, which may be reused between regular +// convolution and transposed convolution. +// +// - raw_cudnn_convolution_forward_out (Tensor) +// Function that has different implementation on Conv_v7.cpp +// and Conv_v8.cpp +// +// The raw API directly invokes CuDNN and are implemeted differently +// on cuDNN v7 and cuDNN v8 +// +// There are a few reasons this should never be directly exposed +// via ATen: +// +// - It takes output as a parameter (this should be computed!) +// - It doesn't do input checking +// - It doesn't resize output (it is assumed to be correctly sized) +// +// Where does argument checking happen? Here's the division of +// responsibility: +// - Things that happen in at::Tensor +// - TensorArg allocation +// - Things that happen in TensorArg +// - Check arguments (type, GPU, shape) + +namespace at { namespace native { + +// --------------------------------------------------------------------- +// +// ConvolutionParams and ConvolutionArgs +// +// --------------------------------------------------------------------- + +std::ostream& operator<<(std::ostream & out, const ConvolutionParams& params) { + out << "ConvolutionParams \n" + << " data_type = " << cudnnTypeToString(params.dataType) << "\n" + << " padding = " << ArrayRef{params.padding} << "\n" + << " stride = " << ArrayRef{params.stride} << "\n" + << " dilation = " << ArrayRef{params.dilation} << "\n" + << " groups = " << params.groups << "\n" + << " deterministic = " << (params.deterministic ? "true" : "false") << "\n" + << " allow_tf32 = " << (params.allow_tf32 ? "true" : "false") << "\n"; + + return out; +} + +// NB: This can't be a constructor, because then ConvolutionParams +// would not be a POD anymore. 
+// TODO: Use TensorGeometry here instead of the entire Tensor, which we +// don't actually need. (OTOH: We can always pass in +// grad_input/grad_output, so this is not very pressing) +void setConvolutionParams( + ConvolutionParams* params, + const at::Tensor& input, const at::Tensor& weight, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, + int64_t groups, bool deterministic, bool allow_tf32) { + + cudnnDataType_t dataType = getCudnnDataType(input); + memset(params, 0, sizeof(ConvolutionParams)); + params->dataType = dataType; + // ASSERT(weight.dim() == input.dim()) + for (int i = 0; i != input.dim(); ++i) { + params->input_size[i] = (int) input.size(i); + params->input_stride[i] = (int) input.stride(i); + params->weight_size[i] = (int) weight.size(i); + } + // ASSERT(padding.size() == stride.size()) + // ASSERT(padding.size() == dilation.size()) + for (size_t i = 0; i != padding.size(); ++i) { + params->padding[i] = padding[i]; + params->stride[i] = stride[i]; + params->dilation[i] = dilation[i]; + } + // In principle, we shouldn't parametrize by groups for legacy + // CuDNN, but it doesn't seem worth the effort to actually do this. + params->groups = groups; + params->deterministic = deterministic; + params->allow_tf32 = allow_tf32; +} + +std::string repro_from_args(const ConvolutionArgs& args) { + auto pybool = [](bool b) -> const char* { return b ? "True" : "False"; }; + std::string partial_dtype; + switch (args.params.dataType) { + case CUDNN_DATA_FLOAT: partial_dtype = "float"; break; + case CUDNN_DATA_DOUBLE: partial_dtype = "double"; break; + case CUDNN_DATA_HALF: partial_dtype = "half"; break; + default: partial_dtype = "unsupported"; + } + const std::string full_dtype = "torch." + partial_dtype; + const int out_channels = args.weight.sizes()[0]; + const int in_channels = args.weight.sizes()[1] * args.params.groups; + const size_t dim = args.input.sizes().size(); + const std::string channels_last_xd = dim == 4 ? "channels_last" : "channels_last_3d"; + const std::string to_channels_last = args.input.suggest_memory_format() == at::MemoryFormat::ChannelsLast \ + ? ".to(memory_format=torch." + channels_last_xd + ")" : ""; + + std::ostringstream ss; + ss << "You can try to repro this exception using the following code snippet. "; + ss << "If that doesn't trigger the error, please include your original repro script when reporting this issue.\n\n"; + ss << "import torch\n"; + ss << "torch.backends.cuda.matmul.allow_tf32 = " << pybool(at::globalContext().allowTF32CuBLAS()) << "\n"; + ss << "torch.backends.cudnn.benchmark = " << pybool(at::globalContext().benchmarkCuDNN()) << "\n"; + ss << "torch.backends.cudnn.deterministic = " << pybool(args.params.deterministic) << "\n"; + ss << "torch.backends.cudnn.allow_tf32 = " << pybool(args.params.allow_tf32) << "\n"; + ss << "data = torch.randn(" << args.input.sizes() << ", dtype=" << full_dtype << ", "; + ss << "device='cuda', requires_grad=True)" << to_channels_last << "\n"; + ss << "net = torch.nn.Conv" << dim-2 << "d(" << in_channels << ", " << out_channels << ", "; + ss << "kernel_size=" << args.weight.sizes().slice(2) << ", "; + ss << "padding=" << ArrayRef(args.params.padding, dim-2) << ", "; + ss << "stride=" << ArrayRef(args.params.stride, dim-2) << ", "; + ss << "dilation=" << ArrayRef(args.params.dilation, dim-2) << ", "; + ss << "groups=" << args.params.groups << ")\n"; + ss << "net = net.cuda()." 
<< partial_dtype << "()" << to_channels_last << "\n"; + ss << "out = net(data)\n"; + ss << "out.backward(torch.randn_like(out))\n"; + ss << "torch.cuda.synchronize()\n\n"; + + return ss.str(); +} + +std::ostream& operator<<(std::ostream & out, const ConvolutionArgs& args) { + out << repro_from_args(args) // already has a trailing newline + << args.params // already has a trailing newline + << "input: " << args.idesc // already has a trailing newline + << "output: " << args.odesc // already has a trailing newline + << "weight: " << args.wdesc // already has a trailing newline + << "Pointer addresses: " << "\n" + << " input: " << args.input.data_ptr() << "\n" + << " output: " << args.output.data_ptr() << "\n" + << " weight: " << args.weight.data_ptr() << "\n"; + + return out; +} + +// --------------------------------------------------------------------- +// +// Checking +// +// --------------------------------------------------------------------- + +// Used on pad, stride and dilation +static void check_args(CheckedFrom c, IntArrayRef args, size_t expected_size, const char* arg_name) +{ + TORCH_CHECK(args.size() <= expected_size, + "Too many ", arg_name, " values (", args.size(), ") supplied, expecting ", + expected_size, " (while checking arguments for ", c, ")"); + TORCH_CHECK(args.size() >= expected_size, + "Not enough ", arg_name, " values (", args.size(), ") supplied, expecting ", + expected_size, " (while checking arguments for ", c, ")"); + + auto num_negative_values = std::count_if(args.begin(), args.end(), [](int x){return x < 0;}); + if (num_negative_values > 0){ + std::stringstream ss; + ss << arg_name << " should be greater than zero but got ("; + std::copy(args.begin(), args.end() - 1, std::ostream_iterator(ss,", ")); + ss << args.back() << ")" << " (while checking arguments for " << c << ")"; + AT_ERROR(ss.str()); + } +} + + +// NOTE [ Convolution checks ] +// +// NB: For many call sites, it is not strictly necessary to check all of +// these relationships (for example, for forward convolution, we compute +// the size of output ourselves, so we don't actually need to check +// output. However, writing a single function that does everything +// means we get to reuse it for both forwards and all backwards +// variants, even when the set of "real" inputs varies. The magic of +// relational computing! +// +// (There is one downside, which is that it is slightly harder to write +// error messages which are able to distinguish between real inputs +// (which the user can change) and computed inputs (which the user can +// only indirectly affect). It would be an interesting exercise to +// come up with a general framework to handle such situations.) 
+static void convolution_shape_check( + CheckedFrom c, + const TensorGeometryArg& input, const TensorGeometryArg& weight, const TensorGeometryArg& output, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups) +{ + check_args(c, padding, input->dim() - 2, "padding"); + check_args(c, stride, padding.size(), "stride"); + check_args(c, dilation, padding.size(), "dilation"); + + // Input + checkDimRange(c, input, 3, 6 /* exclusive */); + checkSize(c, input, input_channels_dim, weight->size(1) * groups); + + // Weight + checkSameDim(c, input, weight); + + // TODO: check that output->size() matches output_sizes + // TODO: check that weight matches output->sizes() + checkSameDim(c, input, output); +} + +// --------------------------------------------------------------------- +// +// Convolution forward / Transposed convolution backward +// +// --------------------------------------------------------------------- + +Tensor cudnn_convolution_forward( + CheckedFrom c, + const TensorArg& input, const TensorArg& weight, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, + bool benchmark, bool deterministic, bool allow_tf32) +{ + checkAllSameType(c, {input, weight}); + checkAllSameGPU(c, {input, weight}); + + auto layout = cudnn_conv_use_channels_last(*input, *weight) ? + at::MemoryFormat::ChannelsLast : at::MemoryFormat::Contiguous; + auto output_t = at::empty( + conv_output_size(input->sizes(), weight->sizes(), + padding, stride, dilation), + input->options(), + layout); + + if (output_t.numel() == 0) { + return output_t; + } + + // Avoid ambiguity of "output" when this is being used as backwards + TensorArg output{ output_t, "result", 0 }; + convolution_shape_check(c, input, weight, output, padding, stride, dilation, groups); + + // See #4500 + Tensor weight_contig = weight->contiguous(layout); + // Make sure that NC11 strides follow formula + weight_contig.resize_(weight_contig.sizes(), layout); + Tensor input_contig = input->contiguous(layout); + input_contig.resize_(input_contig.sizes(), layout); + + raw_cudnn_convolution_forward_out( + *output, input_contig, weight_contig, + padding, stride, dilation, groups, benchmark, deterministic, allow_tf32); + + return *output; +} + +Tensor cudnn_convolution( + const Tensor& input_t, const Tensor& weight_t, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, + int64_t groups, bool benchmark, bool deterministic, bool allow_tf32) +{ + TensorArg input { input_t, "input", 1 }, + weight { weight_t, "weight", 2 }; + CheckedFrom c = "cudnn_convolution"; + auto output_t = cudnn_convolution_forward( + c, input, weight, padding, stride, dilation, groups, benchmark, deterministic, allow_tf32); + return output_t; +} + +// NB: output_padding not needed here, as there is no ambiguity to +// resolve +Tensor cudnn_convolution_transpose_backward_input( + const Tensor& grad_output_t, const Tensor& weight_t, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, + int64_t groups, bool benchmark, bool deterministic, bool allow_tf32) +{ + TensorArg grad_output { grad_output_t, "grad_output", 1 }, + weight { weight_t, "weight", 2 }; + return cudnn_convolution_forward( + "cudnn_convolution_transpose_backward_input", + grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic, allow_tf32); +} + +std::tuple cudnn_convolution_transpose_backward( + const at::Tensor& input, const at::Tensor& grad_output_t, const at::Tensor& weight, + IntArrayRef padding, IntArrayRef output_padding, 
IntArrayRef stride, IntArrayRef dilation, int64_t groups, + bool benchmark, bool deterministic, bool allow_tf32, std::array output_mask) { + + Tensor grad_output = grad_output_t.contiguous(input.suggest_memory_format()); + + Tensor grad_input, grad_weight; + if (output_mask[0]) { + grad_input = at::cudnn_convolution_transpose_backward_input(grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic, allow_tf32); + } + if (output_mask[1]) { + grad_weight = at::cudnn_convolution_transpose_backward_weight(weight.sizes(), grad_output, input, padding, stride, dilation, groups, benchmark, deterministic, allow_tf32); + } + + return std::tuple{grad_input, grad_weight}; +} + +// --------------------------------------------------------------------- +// +// Convolution backward / Transposed convolution forward +// +// --------------------------------------------------------------------- + +// NOTE [ Backward vs transpose convolutions ] +// +// Backward and transpose are algorithmically equivalent, but they +// compute their geometry differently. In a backwards, you knew what +// the original size of the input tensor was, so you can cache that +// geometry and fill it directly. In transposed convolution, it is +// more conventional to not explicitly specify the output (previously +// input) size, and compute it. This, however, leaves a degree of +// freedom; this degree of freedom is resolved using the +// output_padding parameter. Both of these interfaces are equivalent, +// but they are differently convenient depending on the use case. + +Tensor cudnn_convolution_backward_input( + CheckedFrom c, + IntArrayRef input_size, const TensorArg& grad_output, const TensorArg& weight, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, + bool benchmark, bool deterministic, bool allow_tf32) +{ + checkAllSameType(c, {grad_output, weight}); + checkAllSameGPU(c, {grad_output, weight}); + + auto layout = cudnn_conv_use_channels_last(*grad_output, *weight) ? 
+ at::MemoryFormat::ChannelsLast : at::MemoryFormat::Contiguous; + auto grad_input_t = at::empty(input_size, grad_output->options(), layout); + + // Avoid "grad_input" when this is being used as transposed convolution + TensorArg grad_input{ grad_input_t, "result", 0 }; + convolution_shape_check(c, grad_input, weight, grad_output, padding, stride, dilation, groups); + + // See #4500 + Tensor weight_contig = weight->contiguous(layout); + // Make sure that NC11 strides follow formula + weight_contig.resize_(weight_contig.sizes(), layout); + + Tensor grad_output_contig = grad_output->contiguous(layout); + grad_output_contig.resize_(grad_output_contig.sizes(), layout); + + raw_cudnn_convolution_backward_input_out( + *grad_input, grad_output_contig, weight_contig, + padding, stride, dilation, groups, benchmark, deterministic, allow_tf32); + + return *grad_input; +} + +Tensor cudnn_convolution_transpose_forward( + CheckedFrom c, + const TensorArg& grad_output, const TensorArg& weight, + IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, + bool benchmark, bool deterministic, bool allow_tf32) +{ + auto input_size = conv_input_size(grad_output->sizes(), weight->sizes(), + padding, output_padding, stride, dilation, groups); + return cudnn_convolution_backward_input(c, input_size, grad_output, weight, + padding, stride, dilation, groups, benchmark, deterministic, allow_tf32); +} + +Tensor cudnn_convolution_backward_input( + IntArrayRef input_size, const Tensor& grad_output_t, const Tensor& weight_t, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, + bool benchmark, bool deterministic, bool allow_tf32) +{ + TensorArg grad_output{ grad_output_t, "grad_output", 1 }, + weight{ weight_t, "weight", 2 }; + return cudnn_convolution_backward_input( + "cudnn_convolution_backward_input", + input_size, grad_output, weight, + padding, stride, dilation, groups, benchmark, deterministic, allow_tf32); +} + +std::tuple cudnn_convolution_backward( + const at::Tensor& input, const at::Tensor& grad_output_t, const at::Tensor& weight, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, + bool benchmark, bool deterministic, bool allow_tf32, std::array output_mask) { + + Tensor grad_output = grad_output_t.contiguous(input.suggest_memory_format()); + + Tensor grad_input, grad_weight; + if (input.numel() == 0) { + if (output_mask[0]) { + grad_input = at::empty_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + } + if (output_mask[1]) { + grad_weight = at::zeros_like(weight, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + } + } else { + if (output_mask[0]) { + grad_input = at::cudnn_convolution_backward_input(input.sizes(), grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic, allow_tf32); + } + if (output_mask[1]) { + grad_weight = at::cudnn_convolution_backward_weight(weight.sizes(), grad_output, input, padding, stride, dilation, groups, benchmark, deterministic, allow_tf32); + } + } + + return std::tuple{grad_input, grad_weight}; +} + +Tensor cudnn_convolution_transpose( + const Tensor& input_t, const Tensor& weight_t, + IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, + int64_t groups, bool benchmark, bool deterministic, bool allow_tf32) +{ + TensorArg input { input_t, "input", 1 }, + weight { weight_t, "weight", 2 }; + CheckedFrom c = "cudnn_convolution_transpose"; + auto output_t = cudnn_convolution_transpose_forward( + c, input, weight, padding, 
output_padding, stride, dilation, groups, benchmark, deterministic, allow_tf32); + return output_t; +} + +// --------------------------------------------------------------------- +// +// Convolution backward (weight) +// +// --------------------------------------------------------------------- + +Tensor cudnn_convolution_backward_weight( + CheckedFrom c, + IntArrayRef weight_size, const Tensor& grad_output_t, const Tensor& input_t, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, + bool benchmark, bool deterministic, bool allow_tf32) +{ + auto layout = cudnn_conv_use_channels_last(input_t, grad_output_t) ? + at::MemoryFormat::ChannelsLast : at::MemoryFormat::Contiguous; + + Tensor grad_output_contig_t = grad_output_t.contiguous(layout); + // Make sure that NC11 strides follow formula + grad_output_contig_t.resize_(grad_output_contig_t.sizes(), layout); + TensorArg grad_output_contig{ grad_output_contig_t, "grad_output", 1 }; + + Tensor input_contig_t = input_t.contiguous(layout); + input_contig_t.resize_(input_contig_t.sizes(), layout); + TensorArg input{ input_contig_t, "input", 2}; + + checkAllSameType(c, {grad_output_contig, input}); + checkAllSameGPU(c, {grad_output_contig, input}); + + auto grad_weight_t = at::empty(weight_size, grad_output_contig->options(), layout); + + // For uniformity with everything else, although it seems grad_weight + // would be unambiguous too. + TensorArg grad_weight{ grad_weight_t, "result", 0 }; + convolution_shape_check(c, input, grad_weight, grad_output_contig, padding, stride, dilation, groups); + + raw_cudnn_convolution_backward_weight_out( + *grad_weight, *grad_output_contig, *input, + padding, stride, dilation, groups, benchmark, deterministic, allow_tf32); + + return grad_weight_t; +} + +Tensor cudnn_convolution_backward_weight( + IntArrayRef weight_size, + const Tensor& grad_output_t, + const Tensor& input_t, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, + bool benchmark, bool deterministic, bool allow_tf32) +{ + return cudnn_convolution_backward_weight( + "cudnn_convolution_backward_weight", + weight_size, grad_output_t, input_t, + padding, stride, dilation, groups, benchmark, deterministic, allow_tf32); +} + +Tensor cudnn_convolution_transpose_backward_weight( + IntArrayRef weight_size, + const Tensor& grad_output_t, + const Tensor& input_t, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, + bool benchmark, bool deterministic, bool allow_tf32) +{ + return cudnn_convolution_backward_weight( + "cudnn_convolution_backward_weight", + weight_size, input_t, grad_output_t, + padding, stride, dilation, groups, benchmark, deterministic, allow_tf32); +} + +}} + +#endif // AT_CUDNN_ENABLED diff --git a/aten/src/ATen/native/cudnn/ConvShared.h b/aten/src/ATen/native/cudnn/ConvShared.h new file mode 100644 index 000000000000..e30b5c7be581 --- /dev/null +++ b/aten/src/ATen/native/cudnn/ConvShared.h @@ -0,0 +1,88 @@ +#include + +#include +#include +#include +#include + +namespace at { namespace native { + +// --------------------------------------------------------------------- +// +// Helper classes +// +// --------------------------------------------------------------------- + +// This POD struct is used to let us easily compute hashes of the +// parameters +struct ConvolutionParams +{ + cudnnDataType_t dataType; + int input_size[2 + max_dim]; + int input_stride[2 + max_dim]; + int weight_size[2 + max_dim]; + int padding[max_dim]; + int stride[max_dim]; + int 
dilation[max_dim]; + int64_t groups; + bool deterministic; + bool allow_tf32; + // NB: transposed purposely omitted: transposed just swaps + // forward and backward, so you can reuse the benchmark entry, +}; + +// Convenience struct for passing around descriptors and data +// pointers +struct ConvolutionArgs { + cudnnHandle_t handle; + ConvolutionParams params; + TensorDescriptor idesc, odesc; + FilterDescriptor wdesc; + const Tensor& input, output, weight; + ConvolutionDescriptor cdesc; + + ConvolutionArgs(const Tensor& input, const Tensor& output, const Tensor& weight) : input(input), output(output), weight(weight) { + } +}; + +std::ostream& operator<<(std::ostream & out, const ConvolutionParams& params); + +// NB: This can't be a constructor, because then ConvolutionParams +// would not be a POD anymore. +// TODO: Use TensorGeometry here instead of the entire Tensor, which we +// don't actually need. (OTOH: We can always pass in +// grad_input/grad_output, so this is not very pressing) +void setConvolutionParams( + ConvolutionParams* params, + const at::Tensor& input, const at::Tensor& weight, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, + int64_t groups, bool deterministic, bool allow_tf32); + +std::string repro_from_args(const ConvolutionArgs& args); + +std::ostream& operator<<(std::ostream & out, const ConvolutionArgs& args); + +// --------------------------------------------------------------------- +// +// Raw functions +// +// --------------------------------------------------------------------- + +void raw_cudnn_convolution_forward_out( + const Tensor& output, const Tensor& input, const Tensor& weight, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, + bool benchmark, bool deterministic, bool allow_tf32); + +void raw_cudnn_convolution_backward_input_out( + const at::Tensor& grad_input, + const at::Tensor& grad_output, + const at::Tensor& weight, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, + bool benchmark, bool deterministic, bool allow_tf32); + +void raw_cudnn_convolution_backward_weight_out( + const Tensor& grad_weight, const Tensor& grad_output, const Tensor& input, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, + bool benchmark, bool deterministic, bool allow_tf32); + +}} diff --git a/aten/src/ATen/native/cudnn/Conv.cpp b/aten/src/ATen/native/cudnn/Conv_v7.cpp similarity index 54% rename from aten/src/ATen/native/cudnn/Conv.cpp rename to aten/src/ATen/native/cudnn/Conv_v7.cpp index 4524af2fe244..5e1f124f1185 100644 --- a/aten/src/ATen/native/cudnn/Conv.cpp +++ b/aten/src/ATen/native/cudnn/Conv_v7.cpp @@ -1,3 +1,7 @@ +#include // for the definition of AT_CUDNN_ENABLED + +#if AT_CUDNN_ENABLED() + #include #include #include @@ -5,80 +9,10 @@ #include #include #include -#include #include -#include - -#if !AT_CUDNN_ENABLED() - -namespace at { namespace native { - -// See Note [ATen preprocessor philosophy] - -at::Tensor cudnn_convolution( - const at::Tensor& input, const at::Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, - int64_t groups, bool benchmark, bool deterministic, bool allow_tf32) { - AT_ERROR("cudnn_convolution: ATen not compiled with cuDNN support"); -} - -at::Tensor cudnn_convolution_backward_input( - IntArrayRef input_size, const at::Tensor& grad_output, const at::Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32) 
{ - AT_ERROR("cudnn_convolution_backward_input: ATen not compiled with cuDNN support"); -} - -at::Tensor cudnn_convolution_backward_weight( - IntArrayRef weight_size, const at::Tensor& grad_output, const at::Tensor& input, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32) { - AT_ERROR("cudnn_convolution_backward_weight: ATen not compiled with cuDNN support"); -} - -std::tuple cudnn_convolution_backward( - const at::Tensor& input, const at::Tensor& grad_output, const at::Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32, std::array output_mask) { - AT_ERROR("cudnn_convolution_backward: ATen not compiled with cuDNN support"); -} - -at::Tensor cudnn_convolution_transpose( - const at::Tensor& input, const at::Tensor& weight, - IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, - int64_t groups, bool benchmark, bool deterministic, bool allow_tf32) { - AT_ERROR("cudnn_convolution_transpose: ATen not compiled with cuDNN support"); -} - -at::Tensor cudnn_convolution_transpose_backward_input( - const at::Tensor& grad_output, const at::Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, - int64_t groups, bool benchmark, bool deterministic, bool allow_tf32) { - AT_ERROR("cudnn_convolution_transpose_backward: ATen not compiled with cuDNN support"); -} - -at::Tensor cudnn_convolution_transpose_backward_weight( - IntArrayRef weight_size, const at::Tensor& grad_output, const at::Tensor& input, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32) { - AT_ERROR("cudnn_convolution_transpose_backward_weight: ATen not compiled with cuDNN support"); -} - -std::tuple cudnn_convolution_transpose_backward( - const at::Tensor& input, const at::Tensor& grad_output, const at::Tensor& weight, - IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32, std::array output_mask) { - AT_ERROR("cudnn_convolution_transpose_backward: ATen not compiled with cuDNN support"); -} - -}} - -#else // AT_CUDNN_ENABLED +#include #include - -#include -#include #include #include #include @@ -130,217 +64,6 @@ namespace at { namespace native { // TODO: Go through all the checking code again and make sure // we haven't missed anything. -// TODO: Move this into the standard library, with a better name? -Tensor narrowGroup(const Tensor& t, int dim, int group_idx, int64_t groups) { - auto group_size = t.size(dim) / groups; - return t.narrow(dim, group_idx * group_size, group_size); -} - -// --------------------------------------------------------------------- -// -// Checking -// -// --------------------------------------------------------------------- - -// Note [Legacy CuDNN grouped convolution support] -// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -// CuDNN earlier than CuDNN 7 does not directly support group -// convolution, so we provide support for it by sequentially -// running a convolution per group with appropriately -// adjusted sizes. https://blog.yani.io/filter-group-tutorial/ -// has a fairly good diagram explaining how it works. 
- -// Used on pad, stride and dilation -static void check_args(CheckedFrom c, IntArrayRef args, size_t expected_size, const char* arg_name) -{ - TORCH_CHECK(args.size() <= expected_size, - "Too many ", arg_name, " values (", args.size(), ") supplied, expecting ", - expected_size, " (while checking arguments for ", c, ")"); - TORCH_CHECK(args.size() >= expected_size, - "Not enough ", arg_name, " values (", args.size(), ") supplied, expecting ", - expected_size, " (while checking arguments for ", c, ")"); - - auto num_negative_values = std::count_if(args.begin(), args.end(), [](int x){return x < 0;}); - if (num_negative_values > 0){ - std::stringstream ss; - ss << arg_name << " should be greater than zero but got ("; - std::copy(args.begin(), args.end() - 1, std::ostream_iterator(ss,", ")); - ss << args.back() << ")" << " (while checking arguments for " << c << ")"; - AT_ERROR(ss.str()); - } -} - - -// NOTE [ Convolution checks ] -// -// NB: For many call sites, it is not strictly necessary to check all of -// these relationships (for example, for forward convolution, we compute -// the size of output ourselves, so we don't actually need to check -// output. However, writing a single function that does everything -// means we get to reuse it for both forwards and all backwards -// variants, even when the set of "real" inputs varies. The magic of -// relational computing! -// -// (There is one downside, which is that it is slightly harder to write -// error messages which are able to distinguish between real inputs -// (which the user can change) and computed inputs (which the user can -// only indirectly affect). It would be an interesting exercise to -// come up with a general framework to handle such situations.) -static void convolution_shape_check( - CheckedFrom c, - const TensorGeometryArg& input, const TensorGeometryArg& weight, const TensorGeometryArg& output, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups) -{ - check_args(c, padding, input->dim() - 2, "padding"); - check_args(c, stride, padding.size(), "stride"); - check_args(c, dilation, padding.size(), "dilation"); - - // Input - checkDimRange(c, input, 3, 6 /* exclusive */); - checkSize(c, input, input_channels_dim, weight->size(1) * groups); - - // Weight - checkSameDim(c, input, weight); - - // TODO: check that output->size() matches output_sizes - // TODO: check that weight matches output->sizes() - checkSameDim(c, input, output); -} - -// This POD struct is used to let us easily compute hashes of the -// parameters -struct ConvolutionParams -{ - cudnnDataType_t dataType; - int input_size[2 + max_dim]; - int input_stride[2 + max_dim]; - int weight_size[2 + max_dim]; - int padding[max_dim]; - int stride[max_dim]; - int dilation[max_dim]; - int64_t groups; - bool deterministic; - bool allow_tf32; - // NB: transposed purposely omitted: transposed just swaps - // forward and backward, so you can reuse the benchmark entry, -}; - -std::ostream& operator<<(std::ostream & out, const ConvolutionParams& params) { - out << "ConvolutionParams \n" - << " data_type = " << cudnnTypeToString(params.dataType) << "\n" - << " padding = " << ArrayRef{params.padding} << "\n" - << " stride = " << ArrayRef{params.stride} << "\n" - << " dilation = " << ArrayRef{params.dilation} << "\n" - << " groups = " << params.groups << "\n" - << " deterministic = " << (params.deterministic ? "true" : "false") << "\n" - << " allow_tf32 = " << (params.allow_tf32 ? 
"true" : "false") << "\n"; - - return out; -} - -// NB: This can't be a constructor, because then ConvolutionParams -// would not be a POD anymore. -// TODO: Use TensorGeometry here instead of the entire Tensor, which we -// don't actually need. (OTOH: We can always pass in -// grad_input/grad_output, so this is not very pressing) -void setConvolutionParams( - ConvolutionParams* params, - const at::Tensor& input, const at::Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, - int64_t groups, bool deterministic, bool allow_tf32) { - - cudnnDataType_t dataType = getCudnnDataType(input); - memset(params, 0, sizeof(ConvolutionParams)); - params->dataType = dataType; - // ASSERT(weight.dim() == input.dim()) - for (int i = 0; i != input.dim(); ++i) { - params->input_size[i] = (int) input.size(i); - params->input_stride[i] = (int) input.stride(i); - params->weight_size[i] = (int) weight.size(i); - } - // ASSERT(padding.size() == stride.size()) - // ASSERT(padding.size() == dilation.size()) - for (size_t i = 0; i != padding.size(); ++i) { - params->padding[i] = padding[i]; - params->stride[i] = stride[i]; - params->dilation[i] = dilation[i]; - } - // In principle, we shouldn't parametrize by groups for legacy - // CuDNN, but it doesn't seem worth the effort to actually do this. - params->groups = groups; - params->deterministic = deterministic; - params->allow_tf32 = allow_tf32; -} - -// Convenience struct for passing around descriptors and data -// pointers -struct ConvolutionArgs { - cudnnHandle_t handle; - ConvolutionParams params; - TensorDescriptor idesc, odesc; - FilterDescriptor wdesc; - const Tensor& input, output, weight; - ConvolutionDescriptor cdesc; - - ConvolutionArgs(const Tensor& input, const Tensor& output, const Tensor& weight) : input(input), output(output), weight(weight) { - } -}; - -std::string repro_from_args(const ConvolutionArgs& args) { - auto pybool = [](bool b) -> const char* { return b ? "True" : "False"; }; - std::string partial_dtype; - switch (args.params.dataType) { - case CUDNN_DATA_FLOAT: partial_dtype = "float"; break; - case CUDNN_DATA_DOUBLE: partial_dtype = "double"; break; - case CUDNN_DATA_HALF: partial_dtype = "half"; break; - default: partial_dtype = "unsupported"; - } - const std::string full_dtype = "torch." + partial_dtype; - const int out_channels = args.weight.sizes()[0]; - const int in_channels = args.weight.sizes()[1] * args.params.groups; - const size_t dim = args.input.sizes().size(); - const std::string channels_last_xd = dim == 4 ? "channels_last" : "channels_last_3d"; - const std::string to_channels_last = args.input.suggest_memory_format() == at::MemoryFormat::ChannelsLast \ - ? ".to(memory_format=torch." + channels_last_xd + ")" : ""; - - std::ostringstream ss; - ss << "You can try to repro this exception using the following code snippet. 
"; - ss << "If that doesn't trigger the error, please include your original repro script when reporting this issue.\n\n"; - ss << "import torch\n"; - ss << "torch.backends.cuda.matmul.allow_tf32 = " << pybool(at::globalContext().allowTF32CuBLAS()) << "\n"; - ss << "torch.backends.cudnn.benchmark = " << pybool(at::globalContext().benchmarkCuDNN()) << "\n"; - ss << "torch.backends.cudnn.deterministic = " << pybool(args.params.deterministic) << "\n"; - ss << "torch.backends.cudnn.allow_tf32 = " << pybool(args.params.allow_tf32) << "\n"; - ss << "data = torch.randn(" << args.input.sizes() << ", dtype=" << full_dtype << ", "; - ss << "device='cuda', requires_grad=True)" << to_channels_last << "\n"; - ss << "net = torch.nn.Conv" << dim-2 << "d(" << in_channels << ", " << out_channels << ", "; - ss << "kernel_size=" << args.weight.sizes().slice(2) << ", "; - ss << "padding=" << ArrayRef(args.params.padding, dim-2) << ", "; - ss << "stride=" << ArrayRef(args.params.stride, dim-2) << ", "; - ss << "dilation=" << ArrayRef(args.params.dilation, dim-2) << ", "; - ss << "groups=" << args.params.groups << ")\n"; - ss << "net = net.cuda()." << partial_dtype << "()" << to_channels_last << "\n"; - ss << "out = net(data)\n"; - ss << "out.backward(torch.randn_like(out))\n"; - ss << "torch.cuda.synchronize()\n\n"; - - return ss.str(); -} - -std::ostream& operator<<(std::ostream & out, const ConvolutionArgs& args) { - out << repro_from_args(args) // already has a trailing newline - << args.params // already has a trailing newline - << "input: " << args.idesc // already has a trailing newline - << "output: " << args.odesc // already has a trailing newline - << "weight: " << args.wdesc // already has a trailing newline - << "Pointer addresses: " << "\n" - << " input: " << args.input.data_ptr() << "\n" - << " output: " << args.output.data_ptr() << "\n" - << " weight: " << args.weight.data_ptr() << "\n"; - - return out; -} - // --------------------------------------------------------------------- // // Benchmarking @@ -781,18 +504,7 @@ inline Tensor allocate_workspace(size_t size, const Tensor &other) { return at::empty({static_cast(size)}, other.options().dtype(kByte)); } -// NOTE [ Convolution design ] -// -// cuDNN convolutions does not handle bias. Bias is handled outside. -// -// The general strategy: -// -// - cudnn_convolution (Tensor) -// Entry points for clients -// -// - cudnn_convolution_forward (TensorArg) -// Entry point, which may be reused between regular -// convolution and transposed convolution. +// NOTE [ raw_cudnn_convolution_forward_out ] // // - raw_cudnn_convolution_forward_out (Tensor) // Functiont that handles tensors that are too large to use 32bit indexing. @@ -802,14 +514,6 @@ inline Tensor allocate_workspace(size_t size, const Tensor &other) { // Low level function which invokes CuDNN, and takes an output // tensor which is directly written to (thus _out). // -// Where does argument checking happen? Here's the division of -// responsibility: -// - Things that happen in at::Tensor -// - TensorArg allocation -// - Things that happen in TensorArg -// - Check arguments (type, GPU, shape) -// -// TODO: Consider renaming zero-indexed arguments to "self" // --------------------------------------------------------------------- @@ -885,16 +589,6 @@ if (args.params.dataType == CUDNN_DATA_FLOAT) { // // --------------------------------------------------------------------- -// The raw API directly invokes CuDNN and does not emulate support -// for group convolution on old versions of CuDNN. 
-// -// There are a few reasons this should never be directly exposed -// via ATen: -// -// - It takes output as a parameter (this should be computed!) -// - It doesn't do input checking -// - It doesn't resize output (it is assumed to be correctly sized) -// void raw_cudnn_convolution_forward_out_32bit( const Tensor& output, const Tensor& input, const Tensor& weight, IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, @@ -946,90 +640,6 @@ void raw_cudnn_convolution_forward_out( split_batch_dim_to_32bit_out(output, input, weight, padding, stride, dilation, groups, benchmark, deterministic, allow_tf32, 1024 * 1024 * 256, raw_cudnn_convolution_forward_out_32bit); } -Tensor cudnn_convolution_forward( - CheckedFrom c, - const TensorArg& input, const TensorArg& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32) -{ - checkAllSameType(c, {input, weight}); - checkAllSameGPU(c, {input, weight}); - - auto layout = cudnn_conv_use_channels_last(*input, *weight) ? - at::MemoryFormat::ChannelsLast : at::MemoryFormat::Contiguous; - auto output_t = at::empty( - conv_output_size(input->sizes(), weight->sizes(), - padding, stride, dilation), - input->options(), - layout); - - if (output_t.numel() == 0) { - return output_t; - } - - // Avoid ambiguity of "output" when this is being used as backwards - TensorArg output{ output_t, "result", 0 }; - convolution_shape_check(c, input, weight, output, padding, stride, dilation, groups); - - // See #4500 - Tensor weight_contig = weight->contiguous(layout); - // Make sure that NC11 strides follow formula - weight_contig.resize_(weight_contig.sizes(), layout); - Tensor input_contig = input->contiguous(layout); - input_contig.resize_(input_contig.sizes(), layout); - - raw_cudnn_convolution_forward_out( - *output, input_contig, weight_contig, - padding, stride, dilation, groups, benchmark, deterministic, allow_tf32); - - return *output; -} - -Tensor cudnn_convolution( - const Tensor& input_t, const Tensor& weight_t, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, - int64_t groups, bool benchmark, bool deterministic, bool allow_tf32) -{ - TensorArg input { input_t, "input", 1 }, - weight { weight_t, "weight", 2 }; - CheckedFrom c = "cudnn_convolution"; - auto output_t = cudnn_convolution_forward( - c, input, weight, padding, stride, dilation, groups, benchmark, deterministic, allow_tf32); - return output_t; -} - -// NB: output_padding not needed here, as there is no ambiguity to -// resolve -Tensor cudnn_convolution_transpose_backward_input( - const Tensor& grad_output_t, const Tensor& weight_t, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, - int64_t groups, bool benchmark, bool deterministic, bool allow_tf32) -{ - TensorArg grad_output { grad_output_t, "grad_output", 1 }, - weight { weight_t, "weight", 2 }; - return cudnn_convolution_forward( - "cudnn_convolution_transpose_backward_input", - grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic, allow_tf32); -} - -std::tuple cudnn_convolution_transpose_backward( - const at::Tensor& input, const at::Tensor& grad_output_t, const at::Tensor& weight, - IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32, std::array output_mask) { - - Tensor grad_output = grad_output_t.contiguous(input.suggest_memory_format()); - - Tensor grad_input, 
grad_weight; - if (output_mask[0]) { - grad_input = at::cudnn_convolution_transpose_backward_input(grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic, allow_tf32); - } - if (output_mask[1]) { - grad_weight = at::cudnn_convolution_transpose_backward_weight(weight.sizes(), grad_output, input, padding, stride, dilation, groups, benchmark, deterministic, allow_tf32); - } - - return std::tuple{grad_input, grad_weight}; -} - // --------------------------------------------------------------------- // // Convolution backward / Transposed convolution forward @@ -1089,115 +699,6 @@ void raw_cudnn_convolution_backward_input_out( split_batch_dim_to_32bit_out(grad_input, grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic, allow_tf32, 1024 * 1024 * 128, raw_cudnn_convolution_backward_input_out_32bit); } -// NOTE [ Backward vs transpose convolutions ] -// -// Backward and transpose are algorithmically equivalent, but they -// compute their geometry differently. In a backwards, you knew what -// the original size of the input tensor was, so you can cache that -// geometry and fill it directly. In transposed convolution, it is -// more conventional to not explicitly specify the output (previously -// input) size, and compute it. This, however, leaves a degree of -// freedom; this degree of freedom is resolved using the -// output_padding parameter. Both of these interfaces are equivalent, -// but they are differently convenient depending on the use case. - -Tensor cudnn_convolution_backward_input( - CheckedFrom c, - IntArrayRef input_size, const TensorArg& grad_output, const TensorArg& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32) -{ - checkAllSameType(c, {grad_output, weight}); - checkAllSameGPU(c, {grad_output, weight}); - - auto layout = cudnn_conv_use_channels_last(*grad_output, *weight) ? 
- at::MemoryFormat::ChannelsLast : at::MemoryFormat::Contiguous; - auto grad_input_t = at::empty(input_size, grad_output->options(), layout); - - // Avoid "grad_input" when this is being used as transposed convolution - TensorArg grad_input{ grad_input_t, "result", 0 }; - convolution_shape_check(c, grad_input, weight, grad_output, padding, stride, dilation, groups); - - // See #4500 - Tensor weight_contig = weight->contiguous(layout); - // Make sure that NC11 strides follow formula - weight_contig.resize_(weight_contig.sizes(), layout); - - Tensor grad_output_contig = grad_output->contiguous(layout); - grad_output_contig.resize_(grad_output_contig.sizes(), layout); - - raw_cudnn_convolution_backward_input_out( - *grad_input, grad_output_contig, weight_contig, - padding, stride, dilation, groups, benchmark, deterministic, allow_tf32); - - return *grad_input; -} - -Tensor cudnn_convolution_transpose_forward( - CheckedFrom c, - const TensorArg& grad_output, const TensorArg& weight, - IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32) -{ - auto input_size = conv_input_size(grad_output->sizes(), weight->sizes(), - padding, output_padding, stride, dilation, groups); - return cudnn_convolution_backward_input(c, input_size, grad_output, weight, - padding, stride, dilation, groups, benchmark, deterministic, allow_tf32); -} - -Tensor cudnn_convolution_backward_input( - IntArrayRef input_size, const Tensor& grad_output_t, const Tensor& weight_t, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32) -{ - TensorArg grad_output{ grad_output_t, "grad_output", 1 }, - weight{ weight_t, "weight", 2 }; - return cudnn_convolution_backward_input( - "cudnn_convolution_backward_input", - input_size, grad_output, weight, - padding, stride, dilation, groups, benchmark, deterministic, allow_tf32); -} - -std::tuple cudnn_convolution_backward( - const at::Tensor& input, const at::Tensor& grad_output_t, const at::Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32, std::array output_mask) { - - Tensor grad_output = grad_output_t.contiguous(input.suggest_memory_format()); - - Tensor grad_input, grad_weight; - if (input.numel() == 0) { - if (output_mask[0]) { - grad_input = at::empty_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); - } - if (output_mask[1]) { - grad_weight = at::zeros_like(weight, LEGACY_CONTIGUOUS_MEMORY_FORMAT); - } - } else { - if (output_mask[0]) { - grad_input = at::cudnn_convolution_backward_input(input.sizes(), grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic, allow_tf32); - } - if (output_mask[1]) { - grad_weight = at::cudnn_convolution_backward_weight(weight.sizes(), grad_output, input, padding, stride, dilation, groups, benchmark, deterministic, allow_tf32); - } - } - - return std::tuple{grad_input, grad_weight}; -} - -Tensor cudnn_convolution_transpose( - const Tensor& input_t, const Tensor& weight_t, - IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, - int64_t groups, bool benchmark, bool deterministic, bool allow_tf32) -{ - TensorArg input { input_t, "input", 1 }, - weight { weight_t, "weight", 2 }; - CheckedFrom c = "cudnn_convolution_transpose"; - auto output_t = cudnn_convolution_transpose_forward( - c, input, weight, padding, 
output_padding, stride, dilation, groups, benchmark, deterministic, allow_tf32); - return output_t; -} - // --------------------------------------------------------------------- // // Convolution backward (weight) @@ -1295,115 +796,6 @@ void raw_cudnn_convolution_backward_weight_out( TORCH_INTERNAL_ASSERT(false, "This case should not be dispatched to cuDNN."); } -Tensor cudnn_convolution_backward_weight( - CheckedFrom c, - IntArrayRef weight_size, const Tensor& grad_output_t, const Tensor& input_t, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32) -{ - auto layout = cudnn_conv_use_channels_last(input_t, grad_output_t) ? - at::MemoryFormat::ChannelsLast : at::MemoryFormat::Contiguous; - - Tensor grad_output_contig_t = grad_output_t.contiguous(layout); - // Make sure that NC11 strides follow formula - grad_output_contig_t.resize_(grad_output_contig_t.sizes(), layout); - TensorArg grad_output_contig{ grad_output_contig_t, "grad_output", 1 }; - - Tensor input_contig_t = input_t.contiguous(layout); - input_contig_t.resize_(input_contig_t.sizes(), layout); - TensorArg input{ input_contig_t, "input", 2}; - - checkAllSameType(c, {grad_output_contig, input}); - checkAllSameGPU(c, {grad_output_contig, input}); - - auto grad_weight_t = at::empty(weight_size, grad_output_contig->options(), layout); - - // For uniformity with everything else, although it seems grad_weight - // would be unambiguous too. - TensorArg grad_weight{ grad_weight_t, "result", 0 }; - convolution_shape_check(c, input, grad_weight, grad_output_contig, padding, stride, dilation, groups); - - raw_cudnn_convolution_backward_weight_out( - *grad_weight, *grad_output_contig, *input, - padding, stride, dilation, groups, benchmark, deterministic, allow_tf32); - - return grad_weight_t; -} - -Tensor cudnn_convolution_backward_weight( - IntArrayRef weight_size, - const Tensor& grad_output_t, - const Tensor& input_t, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32) -{ - return cudnn_convolution_backward_weight( - "cudnn_convolution_backward_weight", - weight_size, grad_output_t, input_t, - padding, stride, dilation, groups, benchmark, deterministic, allow_tf32); -} - -Tensor cudnn_convolution_transpose_backward_weight( - IntArrayRef weight_size, - const Tensor& grad_output_t, - const Tensor& input_t, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32) -{ - return cudnn_convolution_backward_weight( - "cudnn_convolution_backward_weight", - weight_size, input_t, grad_output_t, - padding, stride, dilation, groups, benchmark, deterministic, allow_tf32); -} - }} // namespace at::native #endif - - -namespace at { namespace native { - -// TODO (@zasdfgbnm): this is here only for compatibility, remove this in the future -Tensor cudnn_convolution_deprecated( - const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias /* optional */, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, - int64_t groups, bool benchmark, bool deterministic) { - auto output = at::cudnn_convolution(input, weight, padding, stride, dilation, groups, benchmark, deterministic); - if (bias.defined()) { - output = output + reshape_bias(input.dim(), bias); - } - return output; -} - -// TODO (@zasdfgbnm): this is here only for compatibility, remove this in the future -Tensor 
cudnn_convolution_deprecated2( - const Tensor& input_t, const Tensor& weight_t, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, - int64_t groups, bool benchmark, bool deterministic) -{ - return at::cudnn_convolution(input_t, weight_t, padding, stride, dilation, groups, benchmark, deterministic, at::globalContext().allowTF32CuDNN()); -} - -// TODO (@zasdfgbnm): this is here only for compatibility, remove this in the future -Tensor cudnn_convolution_transpose_deprecated( - const Tensor& input, const Tensor& weight, const Tensor& bias /* optional */, - IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, - int64_t groups, bool benchmark, bool deterministic) -{ - auto output = at::cudnn_convolution_transpose(input, weight, padding, output_padding, stride, dilation, groups, benchmark, deterministic); - if (bias.defined()) { - output = output + reshape_bias(input.dim(), bias); - } - return output; -} - -// TODO (@zasdfgbnm): this is here only for compatibility, remove this in the future -Tensor cudnn_convolution_transpose_deprecated2( - const Tensor& input_t, const Tensor& weight_t, - IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, - int64_t groups, bool benchmark, bool deterministic) -{ - return at::cudnn_convolution_transpose(input_t, weight_t, padding, output_padding, stride, dilation, groups, benchmark, deterministic, at::globalContext().allowTF32CuDNN()); -} - -}} // namespace at::native diff --git a/aten/src/ATen/native/cudnn/Conv_v8.cpp b/aten/src/ATen/native/cudnn/Conv_v8.cpp new file mode 100644 index 000000000000..53f8c37f5e64 --- /dev/null +++ b/aten/src/ATen/native/cudnn/Conv_v8.cpp @@ -0,0 +1,5 @@ +#include // for the definition of AT_CUDNN_ENABLED + +#if AT_CUDNN_ENABLED() && defined(CUDNN_VERSION) && CUDNN_VERSION >= 8000 +// Coming soon +#endif // AT_CUDNN_ENABLED and CUDNN_VERSION From 25a8397bf3760d60bd754517d590f15cbd041e25 Mon Sep 17 00:00:00 2001 From: Xiong Wei Date: Thu, 10 Dec 2020 10:02:18 -0800 Subject: [PATCH 126/250] add additional interpolation modes for torch.quantile (#48711) Summary: Fix https://github.com/pytorch/pytorch/issues/48523 Related https://github.com/pytorch/pytorch/issues/38349 **BC-breaking Note:** This PR updates PyTorch's quantile function to add additional interpolation methods `lower`, `higher`, `nearest`, and `midpoint`, and these interpolation methods are currently supported by NumPy. New parameter `interpolation` is added to the signature for both `torch.quantile` and `torch.nanquantile` functions. - `quantile(input, q, dim=None, interpolation='linear', keepdim=False, *, out=None) -> Tensor` - `nanquantile(input, q, dim=None, interpolation='linear', keepdim=False, *, out=None) -> Tensor` Function signatures followed the NumPy-like style for the moment, keeping `out` at the end to be consistent with PyTorch. 
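
As a quick usage sketch (mirroring the doctest this patch adds to `torch/_torch_docs.py`), the `interpolation` argument controls how a quantile that falls between two data points is resolved; for `q=0.6` on four elements the fractional index is `0.6 * 3 = 1.8`, i.e. between `i = 1.` and `j = 2.`:

>>> import torch
>>> a = torch.arange(4.)
>>> torch.quantile(a, 0.6, interpolation='linear')    # i + 0.8 * (j - i)
tensor(1.8000)
>>> torch.quantile(a, 0.6, interpolation='lower')     # i
tensor(1.)
>>> torch.quantile(a, 0.6, interpolation='higher')    # j
tensor(2.)
>>> torch.quantile(a, 0.6, interpolation='midpoint')  # (i + j) / 2
tensor(1.5000)
>>> torch.quantile(a, 0.6, interpolation='nearest')   # whichever of i, j is nearest
tensor(2.)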
Pull Request resolved: https://github.com/pytorch/pytorch/pull/48711 Reviewed By: H-Huang Differential Revision: D25428587 Pulled By: heitorschueroff fbshipit-source-id: e98d24f6a651d302eb94f4ff4da18e38bdbf0124 --- aten/src/ATen/native/Sorting.cpp | 73 ++++++++++++++----- aten/src/ATen/native/native_functions.yaml | 16 ++-- .../check_backward_compatibility.py | 2 + test/test_reductions.py | 13 +++- torch/_torch_docs.py | 38 ++++++++-- .../_internal/common_methods_invocations.py | 8 +- 6 files changed, 112 insertions(+), 38 deletions(-) diff --git a/aten/src/ATen/native/Sorting.cpp b/aten/src/ATen/native/Sorting.cpp index e365d48fdffe..58bad4ab3bdd 100644 --- a/aten/src/ATen/native/Sorting.cpp +++ b/aten/src/ATen/native/Sorting.cpp @@ -91,11 +91,25 @@ void quick_select_template( } while (true); } +void copy_quantile_result(Tensor &out, + const Tensor &result, + const Tensor &q) { + if (q.dim() == 0) { + // If q is scalar, remove last dim to match out shape + result.squeeze_(-1); + } else { + // Move quantiles to first dim to match out shape + result.unsqueeze_(0).transpose_(0, -1).squeeze_(-1); + } + out.copy_(result); +} + void quantile_impl( Tensor& out, const Tensor& self, const Tensor& q, optional _dim, + const std::string interpolation, bool keepdim, bool ignore_nan) { int64_t dim = at::maybe_wrap_dim(_dim.value_or(0), self.dim(), true); @@ -117,6 +131,12 @@ void quantile_impl( TORCH_CHECK( self.device() == out.device(), "quantile() out tensor must be on the same device as the input tensor"); + std::vector interpolations{ + "linear", "lower", "higher", "midpoint", "nearest"}; + TORCH_CHECK( + std::find(interpolations.begin(), interpolations.end(), interpolation) != interpolations.end(), + "quantile() interpolation should only be ", + c10::Join(", ", interpolations), "."); // Compute output shape: q_size + reduced_size std::vector out_shape; @@ -186,23 +206,32 @@ void quantile_impl( at::broadcast_tensors({q * last_index, sorted.isnan().any(-1, true)}); ranks = at::masked_fill(tl[0], tl[1], last_index); } - Tensor ranks_below = ranks.toType(kLong); - Tensor weights = ranks - ranks_below; - Tensor ranks_above = ranks.ceil_().toType(kLong); + // adjust ranks based on the interpolation mode + if (interpolation == "lower") { + ranks.floor_(); + } else if (interpolation == "higher") { + ranks.ceil_(); + } else if (interpolation == "nearest") { + ranks.round_(); + } + + Tensor ranks_below = ranks.toType(kLong); Tensor values_below = sorted.gather(-1, ranks_below); + if (interpolation != "linear" && interpolation != "midpoint") { + copy_quantile_result(out, values_below, q); + return; + } + + // calculate weights for linear and midpoint + Tensor weights = interpolation == "midpoint" ? 
at::full_like(ranks, 0.5) : ranks - ranks_below; + + Tensor ranks_above = ranks.ceil_().toType(kLong); Tensor values_above = sorted.gather(-1, ranks_above); // Interpolate to compute quantiles and copy to out tensor values_below.lerp_(values_above, weights); - if (q.dim() == 0) { - // If q is scalar, remove last dim to match out shape - values_below.squeeze_(-1); - } else { - // Move quantiles to first dim to match out shape - values_below.unsqueeze_(0).transpose_(0, -1).squeeze_(-1); - } - out.copy_(values_below); + copy_quantile_result(out, values_below, q); } std::tuple kthvalue_out_impl_cpu( @@ -413,8 +442,9 @@ Tensor& quantile_out( const Tensor& self, const Tensor& q, optional _dim, + const std::string interpolation, bool keepdim) { - quantile_impl(out, self, q, std::move(_dim), keepdim, /*ignore_nan=*/false); + quantile_impl(out, self, q, std::move(_dim), interpolation, keepdim, /*ignore_nan=*/false); return out; } @@ -423,6 +453,7 @@ Tensor& quantile_out( const Tensor& self, double q, optional _dim, + const std::string interpolation, bool keepdim) { TORCH_CHECK( q >= 0 && q <= 1, "quantile() q must be in the range [0, 1] but got ", q); @@ -431,6 +462,7 @@ Tensor& quantile_out( self, at::scalar_tensor(q, self.options()), std::move(_dim), + interpolation, keepdim); } @@ -438,9 +470,10 @@ Tensor quantile( const Tensor& self, const Tensor& q, optional _dim, + const std::string interpolation, bool keepdim) { Tensor out = at::empty({0}, self.options()); - quantile_impl(out, self, q, std::move(_dim), keepdim, /*ignore_nan=*/false); + quantile_impl(out, self, q, std::move(_dim), interpolation, keepdim, /*ignore_nan=*/false); return out; } @@ -448,11 +481,12 @@ Tensor quantile( const Tensor& self, double q, optional _dim, + const std::string interpolation, bool keepdim) { TORCH_CHECK( q >= 0 && q <= 1, "quantile() q must be in the range [0, 1] but got ", q); return at::quantile( - self, at::scalar_tensor(q, self.options()), std::move(_dim), keepdim); + self, at::scalar_tensor(q, self.options()), std::move(_dim), interpolation, keepdim); } Tensor& nanquantile_out( @@ -460,8 +494,9 @@ Tensor& nanquantile_out( const Tensor& self, const Tensor& q, optional _dim, + const std::string interpolation, bool keepdim) { - quantile_impl(out, self, q, std::move(_dim), keepdim, /*ignore_nan=*/true); + quantile_impl(out, self, q, std::move(_dim), interpolation, keepdim, /*ignore_nan=*/true); return out; } @@ -470,6 +505,7 @@ Tensor& nanquantile_out( const Tensor& self, double q, optional _dim, + const std::string interpolation, bool keepdim) { TORCH_CHECK( q >= 0 && q <= 1, "quantile() q must be in the range [0, 1] but got ", q); @@ -478,6 +514,7 @@ Tensor& nanquantile_out( self, at::scalar_tensor(q, self.options()), std::move(_dim), + interpolation, keepdim); } @@ -485,9 +522,10 @@ Tensor nanquantile( const Tensor& self, const Tensor& q, optional _dim, + const std::string interpolation, bool keepdim) { Tensor out = at::empty({0}, self.options()); - quantile_impl(out, self, q, std::move(_dim), keepdim, /*ignore_nan=*/true); + quantile_impl(out, self, q, std::move(_dim), interpolation, keepdim, /*ignore_nan=*/true); return out; } @@ -495,11 +533,12 @@ Tensor nanquantile( const Tensor& self, double q, optional _dim, + const std::string interpolation, bool keepdim) { TORCH_CHECK( q >= 0 && q <= 1, "quantile() q must be in the range [0, 1] but got ", q); return at::nanquantile( - self, at::scalar_tensor(q, self.options()), std::move(_dim), keepdim); + self, at::scalar_tensor(q, self.options()), std::move(_dim), 
interpolation, keepdim); } std::tuple kthvalue_out_cpu( diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index cc6b0e30258e..dc6e815fb438 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -6751,27 +6751,27 @@ use_c10_dispatcher: full variants: method, function -- func: quantile.scalar_out(Tensor self, float q, int? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) +- func: quantile.scalar_out(Tensor self, float q, int? dim=None, str interpolation='linear', bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) -- func: quantile.scalar(Tensor self, float q, int? dim=None, bool keepdim=False) -> Tensor +- func: quantile.scalar(Tensor self, float q, int? dim=None, str interpolation='linear', bool keepdim=False) -> Tensor use_c10_dispatcher: full variants: method, function -- func: quantile.out(Tensor self, Tensor q, int? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) +- func: quantile.out(Tensor self, Tensor q, int? dim=None, str interpolation='linear', bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) -- func: quantile(Tensor self, Tensor q, int? dim=None, bool keepdim=False) -> Tensor +- func: quantile(Tensor self, Tensor q, int? dim=None, str interpolation='linear', bool keepdim=False) -> Tensor use_c10_dispatcher: full variants: method, function -- func: nanquantile.scalar_out(Tensor self, float q, int? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) +- func: nanquantile.scalar_out(Tensor self, float q, int? dim=None, str interpolation='linear', bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) -- func: nanquantile.scalar(Tensor self, float q, int? dim=None, bool keepdim=False) -> Tensor +- func: nanquantile.scalar(Tensor self, float q, int? dim=None, str interpolation='linear', bool keepdim=False) -> Tensor use_c10_dispatcher: full variants: method, function -- func: nanquantile.out(Tensor self, Tensor q, int? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) +- func: nanquantile.out(Tensor self, Tensor q, int? dim=None, str interpolation='linear', bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) -- func: nanquantile(Tensor self, Tensor q, int? dim=None, bool keepdim=False) -> Tensor +- func: nanquantile(Tensor self, Tensor q, int? 
dim=None, str interpolation='linear', bool keepdim=False) -> Tensor use_c10_dispatcher: full variants: method, function diff --git a/test/backward_compatibility/check_backward_compatibility.py b/test/backward_compatibility/check_backward_compatibility.py index ccb4a6457537..e155537d7b99 100644 --- a/test/backward_compatibility/check_backward_compatibility.py +++ b/test/backward_compatibility/check_backward_compatibility.py @@ -187,6 +187,8 @@ ("aten::ifft", datetime.date(2021, 1, 31)), ("aten::irfft", datetime.date(2021, 1, 31)), ("aten::rfft", datetime.date(2021, 1, 31)), + ("aten::quantile", datetime.date(2021, 1, 31)), + ("aten::nanquantile", datetime.date(2021, 1, 31)), ] def allow_listed(schema, allow_list): diff --git a/test/test_reductions.py b/test/test_reductions.py index 7c877d822142..eadd96652d0b 100644 --- a/test/test_reductions.py +++ b/test/test_reductions.py @@ -1787,14 +1787,17 @@ def test_quantile(self, device, dtype): numpy_op = getattr(np, op) # Compute quantile along every dimension and flattened tensor - for dim in [None] + list(range(a.ndim)): - result = torch_op(a, q, dim, keepdim) - expected = numpy_op(a.cpu().numpy(), q.cpu().numpy(), dim, keepdims=keepdim) + interpolations = ('linear', 'lower', 'higher', 'midpoint', 'nearest') + for interpolation, dim in product(interpolations, + [None] + list(range(a.ndim))): + result = torch_op(a, q, dim, interpolation, keepdim) + expected = numpy_op(a.cpu().numpy(), q.cpu().numpy(), dim, + interpolation=interpolation, keepdims=keepdim) self.assertEqual(result.cpu(), torch.from_numpy(np.array(expected)).type(result.type())) # Test out variation out = torch.empty_like(result) - torch_op(a, q, dim, keepdim, out=out) + torch_op(a, q, dim, interpolation, keepdim, out=out) self.assertEqual(out.cpu(), result.cpu()) def test_quantile_backward(self, device): @@ -1828,6 +1831,8 @@ def check(a, q, args, kwargs, message): check([1.], 1.1, [], {}, r'q must be in the range \[0, 1\] but got 1.1') check([1.], 0.5, [], {'out': torch.empty([], dtype=torch.float64, device=device)}, r'out tensor must be same dtype as the input tensor') + check([1.], [1.], [], {'interpolation': 'random_mode'}, + r"interpolation should only be linear, lower, higher, midpoint, nearest.") if self.device_type == "cpu": check([1.], [0.5, 1.1, -1], [], {}, r'q values must be in the range \[0, 1\]') diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index d9f7e8018264..6afd68ab2404 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -5182,13 +5182,24 @@ def merge_dicts(*dicts): >>> torch.quantile(a, q) tensor([-0.5446, 0.0700, 0.9214]) -.. function:: quantile(input, q, dim=None, keepdim=False, *, out=None) -> Tensor +.. function:: quantile(input, q, dim=None, interpolation='linear', keepdim=False, *, out=None) -> Tensor -Returns the q-th quantiles of each row of the :attr:`input` tensor along the dimension -:attr:`dim`, doing a linear interpolation when the q-th quantile lies between two -data points. By default, :attr:`dim` is ``None`` resulting in the :attr:`input` tensor +Returns the q-th quantiles of each row of the :attr:`input` tensor +along the dimension :attr:`dim` based on :attr:`interpolation`. +When the desired quantile lies between two data points ``i < j``, +the result is computed based on the :attr:`interpolation` value as described below. +By default, :attr:`interpolation` is ``linear`` and :attr:`dim` is ``None`` resulting in the :attr:`input` tensor being flattened before computation. 
+When the quantile value lies between two data points ``i < j``, +the result is computed according to the given :attr:`interpolation` method as follows: + +- ``linear``: ``i + (j - i) * fraction``, where ``fraction`` is the fractional part of the index surrounded by ``i`` and ``j``. +- ``lower``: ``i``. +- ``higher``: ``j``. +- ``nearest``: ``i`` or ``j``, whichever is nearest. +- ``midpoint``: ``(i + j) / 2``. + If :attr:`keepdim` is ``True``, the output dimensions are of the same size as :attr:`input` except in the dimensions being reduced (:attr:`dim` or all if :attr:`dim` is ``None``) where they have size 1. Otherwise, the dimensions being reduced are squeezed (see :func:`torch.squeeze`). @@ -5199,6 +5210,8 @@ def merge_dicts(*dicts): {input} q (float or Tensor): a scalar or 1D tensor of quantile values in the range [0, 1] {dim} + interpolation (string): interpolation method to use when the desired quantile lies between two data points, + can be ``linear``, ``lower``, ``higher``, ``midpoint`` and ``nearest``. Default is ``linear``. {keepdim} Keyword arguments: @@ -5222,11 +5235,24 @@ def merge_dicts(*dicts): [ 0.9206]]]) >>> torch.quantile(a, q, dim=1, keepdim=True).shape torch.Size([3, 2, 1]) + >>> a = torch.arange(4.) + >>> a + tensor([0., 1., 2., 3.]) + >>> torch.quantile(a, 0.6, interpolation='linear') + tensor(1.8000) + >>> torch.quantile(a, 0.6, interpolation='lower') + tensor(1.) + >>> torch.quantile(a, 0.6, interpolation='higher') + tensor(2.) + >>> torch.quantile(a, 0.6, interpolation='midpoint') + tensor(1.5000) + >>> torch.quantile(a, 0.6, interpolation='nearest') + tensor(2.) """.format(**single_dim_common)) add_docstr(torch.nanquantile, r""" -nanquantile(input, q, dim=None, keepdim=False, *, out=None) -> Tensor +nanquantile(input, q, dim=None, interpolation='linear', keepdim=False, *, out=None) -> Tensor This is a variant of :func:`torch.quantile` that "ignores" ``NaN`` values, computing the quantiles :attr:`q` as if ``NaN`` values in :attr:`input` did @@ -5237,6 +5263,8 @@ def merge_dicts(*dicts): {input} q (float or Tensor): a scalar or 1D tensor of quantile values in the range [0, 1] {dim} + interpolation (string): interpolation method to use when the desired quantile lies between two data points, + can be ``linear``, ``lower``, ``higher``, ``midpoint`` and ``nearest``. Default is ``linear``. 
{keepdim} Keyword arguments: diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index b88dcaaccb33..621ff2a12e7e 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -1272,13 +1272,13 @@ def method_tests(): ('kthvalue', (), (1, 0, True), 'scalar_keepdim_dim', (), [1]), ('quantile', (S, S, S), (0.5,)), ('quantile', (S, S, S), (0.5, 0), 'dim', (), [1]), - ('quantile', (S, S, S), (0.5, None, True), 'keepdim'), - ('quantile', (S, S, S), (0.5, 0, True), 'keepdim_dim', (), [1]), + ('quantile', (S, S, S), (0.5, None, 'linear', True), 'keepdim'), + ('quantile', (S, S, S), (0.5, 0, 'linear', True), 'keepdim_dim', (), [1]), ('quantile', (), (0.5,), 'scalar'), ('nanquantile', (S, S, S), (0.5,)), ('nanquantile', (S, S, S), (0.5, 0), 'dim', (), [1]), - ('nanquantile', (S, S, S), (0.5, None, True), 'keepdim'), - ('nanquantile', (S, S, S), (0.5, 0, True), 'keepdim_dim', (), [1]), + ('nanquantile', (S, S, S), (0.5, None, 'linear', True), 'keepdim'), + ('nanquantile', (S, S, S), (0.5, 0, 'linear', True), 'keepdim_dim', (), [1]), ('nanquantile', (), (0.5,), 'scalar'), ('median', (S, S, S), NO_ARGS), ('median', (S, S, S), (1,), 'dim', (), [0]), From 54f0556ee453d1c4a9c6756bfcdce1e0d62867a8 Mon Sep 17 00:00:00 2001 From: Kurt Mohler Date: Thu, 10 Dec 2020 10:21:39 -0800 Subject: [PATCH 127/250] Add missing complex support for torch.norm and torch.linalg.norm (#48284) Summary: **BC-breaking note:** Previously, when given a complex input, `torch.linalg.norm` and `torch.norm` would return a complex output. `torch.linalg.cond` would sometimes return a complex output and sometimes return a real output when given a complex input, depending on its `p` argument. This PR changes this behavior to match `numpy.linalg.norm` and `numpy.linalg.cond`, so that a complex input will result in the downgraded real number type, consistent with NumPy. 
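
A minimal sketch of the new dtype behavior (random inputs, so only the result dtypes are shown; the numeric values are irrelevant here):

>>> import torch
>>> x = torch.randn(3, dtype=torch.cfloat)
>>> torch.linalg.norm(x).dtype          # vector 2-norm of a complex input now yields a real result
torch.float32
>>> A = torch.randn(4, 4, dtype=torch.cdouble)
>>> torch.linalg.norm(A, 'fro').dtype   # Frobenius norm, one of the newly supported complex cases listed below
torch.float64
>>> torch.norm(x).dtype                 # torch.norm follows the same convention
torch.float32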
**PR Summary:** The following cases were previously unsupported for complex inputs, and this commit adds support: - Frobenius norm - Norm order 2 (vector and matrix) - CUDA vector norm Part of https://github.com/pytorch/pytorch/issues/47833 Pull Request resolved: https://github.com/pytorch/pytorch/pull/48284 Reviewed By: H-Huang Differential Revision: D25420880 Pulled By: mruberry fbshipit-source-id: 11f6a2f3cad57d66476d30921c3f6ab8f3cd4017 --- aten/src/ATen/native/LinearAlgebra.cpp | 33 ++- aten/src/ATen/native/ReduceOps.cpp | 17 +- aten/src/ATen/native/SharedReduceOps.h | 108 +++++--- aten/src/ATen/native/cpu/ReduceOpsKernel.cpp | 54 ++-- aten/src/ATen/native/cuda/ReduceNormKernel.cu | 41 +-- test/test_linalg.py | 255 ++++++++++-------- torch/csrc/autograd/FunctionsManual.cpp | 6 +- torch/functional.py | 8 +- torch/linalg/__init__.py | 9 +- 9 files changed, 318 insertions(+), 213 deletions(-) diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index bbc8d29dfab7..1c3b9ca60c1c 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -1463,14 +1463,13 @@ Tensor matrix_power(const Tensor& a, int64_t n) { } Tensor frobenius_norm(const Tensor& self) { - TORCH_CHECK(!self.is_complex(), "frobenius norm not supported for complex tensors"); return at::norm(self); } Tensor frobenius_norm(const Tensor& self, IntArrayRef dim, bool keepdim) { // NOTE: As frobenius_norm_out is currently implemented, it will always produce a // strided tensor result, even if the input is sparse. - auto options = self.options().layout(c10::Layout::Strided); + auto options = self.options().layout(c10::Layout::Strided).dtype(toValueType(self.scalar_type())); Tensor result = at::empty({0}, options); return at::native::frobenius_norm_out(result, self, dim, keepdim); } @@ -1480,7 +1479,6 @@ Tensor &frobenius_norm_out( const Tensor& self, IntArrayRef dim, bool keepdim) { - TORCH_CHECK(!self.is_complex(), "frobenius norm not supported for complex tensors"); TORCH_CHECK( dim.size() <= 2, "Expected at most 2 dimensions, but got ", @@ -1524,7 +1522,7 @@ Tensor &nuclear_norm_out(Tensor& result, const Tensor& self, bool keepdim) { } Tensor nuclear_norm(const Tensor& self, IntArrayRef dim, bool keepdim) { - Tensor result = at::empty({0}, self.options()); + Tensor result = at::empty({0}, self.options().dtype(toValueType(self.scalar_type()))); return at::native::nuclear_norm_out(result, self, dim, keepdim); } @@ -1679,7 +1677,7 @@ static Tensor& _linalg_norm_vector_out(Tensor& result, const Tensor& self, optio // when the input contains extreme values (like nan or +/-inf) or if the input // size is degenerate (like size(0), size(0, N), etc) case_was_overridden = true; - self_ = self.abs(); + self_ = self_.abs(); result_ = _norm_min_max(self_, ord, dim[0], keepdim); } else if ((self_.numel() == 0) && (ord < 0)) { // For negative orders with degenerate input sizes, at::norm's result does not @@ -1698,7 +1696,7 @@ static Tensor& _linalg_norm_vector_out(Tensor& result, const Tensor& self, optio } if (!case_was_overridden) { if (opt_dtype.has_value()) { - result_ = at::norm(self, opt_ord, dim, keepdim, opt_dtype.value()); + result_ = at::norm(self.to(opt_dtype.value()), opt_ord, dim, keepdim); } else { result_ = at::norm(self, opt_ord, dim, keepdim); } @@ -1749,14 +1747,14 @@ static Tensor& linalg_norm_out_impl(Tensor& result, const Tensor& self, optional // Numerical or None norms Tensor linalg_norm(const Tensor& self, optional opt_ord, optional opt_dim, 
bool keepdim, optional opt_dtype) { - auto options = TensorOptions().dtype(opt_dtype.has_value() ? opt_dtype.value() : self.scalar_type()).device(self.device()); + auto options = TensorOptions().dtype(opt_dtype.has_value() ? opt_dtype.value() : toValueType(self.scalar_type())).device(self.device()); Tensor result = at::empty({0}, options); return at::native::linalg_norm_out(result, self, opt_ord, opt_dim, keepdim, opt_dtype); } // Frobenius and nuclear norms Tensor linalg_norm(const Tensor& self, std::string ord, optional opt_dim, bool keepdim, optional opt_dtype) { - auto options = TensorOptions().dtype(opt_dtype.has_value() ? opt_dtype.value() : self.scalar_type()).device(self.device()); + auto options = TensorOptions().dtype(opt_dtype.has_value() ? opt_dtype.value() : toValueType(self.scalar_type())).device(self.device()); Tensor result = at::empty({0}, options); return at::native::linalg_norm_out(result, self, ord, opt_dim, keepdim, opt_dtype); } @@ -1781,7 +1779,8 @@ Tensor _linalg_cond_exception_helper(const Tensor& self) { "linalg_cond does not support yet this case."); } auto result_shape = IntArrayRef(self.sizes().cbegin(), self.sizes().cend()-2); - Tensor result = at::full(result_shape, INFINITY, self.options()); + TensorOptions options = self.options().dtype(toValueType(self.scalar_type())); + Tensor result = at::full(result_shape, INFINITY, options); return result; } @@ -1816,7 +1815,8 @@ Tensor _linalg_cond_helper(const Tensor& self, c10::variant // Return zero for each matrix in the batch Tensor _linalg_cond_empty_matrix(const Tensor& self, c10::ScalarType dtype) { auto result_shape = IntArrayRef(self.sizes().cbegin(), self.sizes().cend()-2); - return at::zeros(result_shape, self.options().dtype(dtype)); + TensorOptions options = self.options().dtype(toValueType(self.scalar_type())); + return at::zeros(result_shape, options); } void _linalg_cond_check_ord(c10::variant ord_variant) { @@ -1849,8 +1849,7 @@ Tensor linalg_cond(const Tensor& self, optional opt_ord) { // NumPy doesn't define the condition number for 0x0 matrices, we return 0.0 for such input if (self.numel() == 0) { auto real_dtype = toValueType(typeMetaToScalarType(self.dtype())); - auto expected_dtype = std::abs(ord.toDouble()) == 2.0 ? real_dtype : self.scalar_type(); - return _linalg_cond_empty_matrix(self, expected_dtype); + return _linalg_cond_empty_matrix(self, real_dtype); } // If ord == None or ord == ±2 @@ -1883,10 +1882,9 @@ Tensor& linalg_cond_out(Tensor& result, const Tensor& self, optional opt // the result is always real-valued, for other cases it is complex-valued for the complex-valued input. ScalarType real_dtype = toValueType(typeMetaToScalarType(self.dtype())); Scalar ord = opt_ord.has_value() ? opt_ord.value() : 2; - auto expected_dtype = std::abs(ord.toDouble()) == 2.0 ? 
real_dtype : self.scalar_type(); - TORCH_CHECK(result.scalar_type() == expected_dtype, - "result dtype ", result.scalar_type(), " does not match the expected dtype ", expected_dtype); + TORCH_CHECK(result.scalar_type() == real_dtype, + "result dtype ", result.scalar_type(), " does not match the expected dtype ", real_dtype); Tensor result_tmp = at::linalg_cond(self, opt_ord); at::native::resize_output(result, result_tmp.sizes()); @@ -1916,8 +1914,9 @@ Tensor linalg_cond(const Tensor& self, std::string ord) { // TODO: implement _out variant avoiding copy and using already allocated storage directly Tensor& linalg_cond_out(Tensor& result, const Tensor& self, std::string ord) { - TORCH_CHECK(result.scalar_type() == self.scalar_type(), - "result dtype ", result.scalar_type(), " does not match the expected dtype ", self.scalar_type()); + ScalarType real_type = toValueType(self.scalar_type()); + TORCH_CHECK(result.scalar_type() == real_type, + "result dtype ", result.scalar_type(), " does not match the expected dtype ", real_type); Tensor result_tmp = at::linalg_cond(self, ord); at::native::resize_output(result, result_tmp.sizes()); diff --git a/aten/src/ATen/native/ReduceOps.cpp b/aten/src/ATen/native/ReduceOps.cpp index 364c112572b5..e4b0a1cb19b7 100644 --- a/aten/src/ATen/native/ReduceOps.cpp +++ b/aten/src/ATen/native/ReduceOps.cpp @@ -660,22 +660,23 @@ Tensor& logsumexp_out(Tensor& result, const Tensor& self, DimnameList dims, bool static Tensor& norm_out(Tensor &result, const Tensor &self, optional opt_p, IntArrayRef dim, bool keepdim, optional opt_dtype) { - auto p = opt_p.value_or(2.0); - TORCH_CHECK(!(p.toDouble() == 2 && self.is_complex()), "norm with p=2 not supported for complex tensors"); + auto p = opt_p.value_or(2.0).to(); TORCH_CHECK(self.device().type() == DeviceType::CPU || self.device().type() == DeviceType::CUDA, "norm only supports CPU AND CUDA device type, got: ", self.device().type()); TORCH_CHECK(self.layout() == Layout::Strided, "norm only supports strided layout, got: ", self.layout()); - ScalarType scalarType = opt_dtype.has_value() ? opt_dtype.value() : self.scalar_type(); + ScalarType in_dtype = opt_dtype.has_value() ? opt_dtype.value() : self.scalar_type(); TORCH_CHECK( - at::isFloatingType(scalarType) || at::isComplexType(scalarType), - "Can only calculate the mean of floating types. Got ", - toString(scalarType), + at::isFloatingType(in_dtype) || at::isComplexType(in_dtype), + "Can only calculate the norm of floating point and complex dtypes. Got ", + toString(in_dtype), " instead."); - ScalarType dtype = get_dtype(result, self, opt_dtype, true); - auto iter = make_reduction("norm", result, self, dim, keepdim, dtype); + ScalarType out_dtype = result.defined() ? result.scalar_type() : (opt_dtype.has_value() ? opt_dtype.value() : toValueType(self.scalar_type())); + + auto iter = make_reduction("norm", result, self, dim, keepdim, in_dtype, out_dtype); + if (iter.numel() == 0) { result.zero_(); } else { diff --git a/aten/src/ATen/native/SharedReduceOps.h b/aten/src/ATen/native/SharedReduceOps.h index 437a39bf2b92..4106a90c0729 100644 --- a/aten/src/ATen/native/SharedReduceOps.h +++ b/aten/src/ATen/native/SharedReduceOps.h @@ -2,6 +2,8 @@ // Please note that this file is // used across both CPU and GPU. +#include +#include #include #include #include @@ -157,11 +159,15 @@ struct MeanOps { } }; -template +// This accumulator template is used to calculate the minimum absolute value of +// a set of numbers. 
+// `scalar_t` is the type of the input and `acc_t` is the type of the accumulated +// value. These types differ for complex number input support. +template struct AbsMinOps { - inline C10_DEVICE acc_t reduce(acc_t acc, acc_t data, int64_t /*idx*/) const { - return MIN(acc, acc_t(std::abs(data))); + inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const { + return MIN(acc, static_cast(std::abs(data))); } inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const { @@ -177,17 +183,21 @@ struct AbsMinOps { } #if defined(__CUDACC__) || defined(__HIPCC__) - inline C10_DEVICE acc_t warp_shfl_down(acc_t data, int offset) const { - return WARP_SHFL_DOWN(data, offset); + inline C10_DEVICE acc_t warp_shfl_down(acc_t acc, int offset) const { + return WARP_SHFL_DOWN(acc, offset); } #endif }; -template +// This accumulator template is used to calculate the maximum absolute value of +// a set of numbers. +// `scalar_t` is the type of the input and `acc_t` is the type of the accumulated +// value. These types differ for complex number input support. +template struct AbsMaxOps { - inline C10_DEVICE acc_t reduce(acc_t acc, acc_t data, int64_t /*idx*/) const { - return MAX(acc, acc_t(std::abs(data))); + inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const { + return MAX(acc, static_cast(std::abs(data))); } inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const { @@ -203,18 +213,22 @@ struct AbsMaxOps { } #if defined(__CUDACC__) || defined(__HIPCC__) - inline C10_DEVICE acc_t warp_shfl_down(acc_t data, int offset) const { - return WARP_SHFL_DOWN(data, offset); + inline C10_DEVICE acc_t warp_shfl_down(acc_t acc, int offset) const { + return WARP_SHFL_DOWN(acc, offset); } #endif }; -template +// This accumulator template is used to calculate the norm of the absolute value +// of a set of numbers. +// `scalar_t` is the type of the input and `acc_t` is the type of the accumulated +// value. These types differ for complex number input support. +template struct NormOps { acc_t norm_; - inline C10_DEVICE acc_t reduce(acc_t acc, acc_t data, int64_t /*idx*/) const { - return acc + compat_pow(std::abs(data), norm_); + inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const { + return acc + compat_pow(static_cast(std::abs(data)), norm_); } inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const { @@ -222,7 +236,7 @@ struct NormOps { } inline C10_DEVICE acc_t project(acc_t a) const { - return compat_pow(a, acc_t(1.0)/norm_); + return compat_pow(a, static_cast(1.0) / norm_); } static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) { @@ -230,8 +244,8 @@ struct NormOps { } #if defined(__CUDACC__) || defined(__HIPCC__) - inline C10_DEVICE acc_t warp_shfl_down(acc_t data, int offset) const { - return WARP_SHFL_DOWN(data, offset); + inline C10_DEVICE acc_t warp_shfl_down(acc_t acc, int offset) const { + return WARP_SHFL_DOWN(acc, offset); } #endif @@ -239,10 +253,14 @@ struct NormOps { } }; -template +// This accumulator template is used to calculate the order zero norm of the +// absolute value of a set of numbers. +// `scalar_t` is the type of the input and `acc_t` is the type of the accumulated +// value. These types differ for complex number input support. +template struct NormZeroOps { - inline C10_DEVICE acc_t reduce(acc_t acc, acc_t data, int64_t /*idx*/) const { - return acc + (data==acc_t(0) ? 
acc_t(0) : acc_t(1)); + inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const { + return acc + (data == static_cast(0) ? static_cast(0) : static_cast(1)); } inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const { @@ -259,16 +277,20 @@ struct NormZeroOps { #if defined(__CUDACC__) || defined(__HIPCC__) - inline C10_DEVICE acc_t warp_shfl_down(acc_t data, int offset) const { - return WARP_SHFL_DOWN(data, offset); + inline C10_DEVICE acc_t warp_shfl_down(acc_t acc, int offset) const { + return WARP_SHFL_DOWN(acc, offset); } #endif }; -template +// This accumulator template is used to calculate the order one norm of the +// absolute value of a set of numbers. +// `scalar_t` is the type of the input and `acc_t` is the type of the accumulated +// value. These types differ for complex number input support. +template struct NormOneOps { - inline C10_DEVICE acc_t reduce(acc_t acc, acc_t data, int64_t /*idx*/) const { - return acc + std::abs(data); + inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const { + return acc + static_cast(std::abs(data)); } inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const { @@ -284,16 +306,40 @@ struct NormOneOps { } #if defined(__CUDACC__) || defined(__HIPCC__) - inline C10_DEVICE acc_t warp_shfl_down(acc_t data, int offset) const { - return WARP_SHFL_DOWN(data, offset); + inline C10_DEVICE acc_t warp_shfl_down(acc_t acc, int offset) const { + return WARP_SHFL_DOWN(acc, offset); } #endif }; -template + +template +struct AbsSwitch {}; + +template +inline C10_DEVICE acc_t abs_if_complex(scalar_t data, AbsSwitch s) { + return static_cast(data); +} + +template +inline C10_DEVICE acc_t abs_if_complex(std::complex data, AbsSwitch s) { + return static_cast(std::abs(data)); +} + +template +inline C10_DEVICE acc_t abs_if_complex(c10::complex data, AbsSwitch s) { + return static_cast(std::abs(data)); +} + +// This accumulator template is used to calculate the order two norm of the +// absolute value of a set of numbers. +// `scalar_t` is the type of the input and `acc_t` is the type of the accumulated +// value. These types differ for complex number input support. +template struct NormTwoOps { - inline C10_DEVICE acc_t reduce(acc_t acc, acc_t data, int64_t /*idx*/) const { - return acc + data * data; + inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const { + acc_t data_ = abs_if_complex(data, AbsSwitch()); + return acc + data_ * data_; } inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const { @@ -309,8 +355,8 @@ struct NormTwoOps { } #if defined(__CUDACC__) || defined(__HIPCC__) - inline C10_DEVICE acc_t warp_shfl_down(acc_t data, int offset) const { - return WARP_SHFL_DOWN(data, offset); + inline C10_DEVICE acc_t warp_shfl_down(acc_t acc, int offset) const { + return WARP_SHFL_DOWN(acc, offset); } #endif }; diff --git a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp index 6ed9c798be23..10437f51d4b4 100644 --- a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp @@ -174,61 +174,75 @@ static void norm_kernel_tensor_iterator_impl( if (p.isIntegral(false)) { val = p.to(); } else if (p.isFloatingPoint()) { - val = p.to(); + val = p.to(); } else { AT_ERROR("norm_kernel_tensor_iterator_impl expects norm to be integer or float"); } - + // In the dispatch code blocks below, reduction kernels accumulate results as + // the type `acc_t`. When `scalar_t` is complex, `acc_t` is the downgraded + // real number type. 
Otherwise, `acc_t` and `scalar_t` are the same type. if (val == 0) { - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, iter.dtype(), "norm_cpu", [&] { + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, iter.input_dtype(), "norm_cpu", [&] { + using acc_t = typename scalar_value_type::type; binary_kernel_reduce( iter, - NormZeroOps(), - scalar_t(0) + NormZeroOps(), + acc_t(0) ); }); } else if (val == 1) { - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, iter.dtype(), "norm_cpu", [&] { + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, iter.input_dtype(), "norm_cpu", [&] { + using acc_t = typename scalar_value_type::type; binary_kernel_reduce( iter, - NormOneOps(), - scalar_t(0) + NormOneOps(), + acc_t(0) ); }); } else if (val == 2) { - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, iter.dtype(), "norm_cpu", [&] { + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, iter.input_dtype(), "norm_cpu", [&] { + using acc_t = typename scalar_value_type::type; binary_kernel_reduce( iter, - NormTwoOps(), - scalar_t(0) + NormTwoOps(), + acc_t(0) ); }); } else if (val == INFINITY) { - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, iter.dtype(), "norm_cpu", [&] { + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, iter.input_dtype(), "norm_cpu", [&] { + using acc_t = typename scalar_value_type::type; binary_kernel_reduce( iter, - AbsMaxOps(), - scalar_t(std::numeric_limits::min()) + AbsMaxOps(), + std::numeric_limits::min() ); }); } else if (val == -INFINITY) { - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, iter.dtype(), "norm_cpu", [&] { + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, iter.input_dtype(), "norm_cpu", [&] { + using acc_t = typename scalar_value_type::type; binary_kernel_reduce( iter, - AbsMinOps(), - scalar_t(std::numeric_limits::max()) + AbsMinOps(), + std::numeric_limits::max() ); }); } else { - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, iter.dtype(), "norm_cpu", [&] { + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, iter.input_dtype(), "norm_cpu", [&] { + using acc_t = typename scalar_value_type::type; binary_kernel_reduce( iter, - NormOps { scalar_t(val) }, - scalar_t(0) + NormOps { acc_t(val) }, + acc_t(0) ); }); } + + // For complex outputs, the above kernels do not touch the imaginary values, + // so we must zero them out + if (isComplexType(iter.output().scalar_type())) { + at::imag(iter.output()).zero_(); + } } static void and_kernel_impl(TensorIterator& iter) { diff --git a/aten/src/ATen/native/cuda/ReduceNormKernel.cu b/aten/src/ATen/native/cuda/ReduceNormKernel.cu index a857dbc52b8a..3953f16b69c9 100644 --- a/aten/src/ATen/native/cuda/ReduceNormKernel.cu +++ b/aten/src/ATen/native/cuda/ReduceNormKernel.cu @@ -7,46 +7,49 @@ namespace at { namespace native { -template +// This reduction accumulates results as the type `acc_t`. By default, when +// `scalar_t` is complex, `acc_t` is the downgraded real number type. +// Otherwise, `acc_t` and `scalar_t` are the same type. 
+template ::type, typename out_t=typename scalar_value_type::type> void norm_kernel_cuda_impl(TensorIterator& iter, Scalar val) { - float p; + double p; if (val.isIntegral(false)) { p = val.to(); } else if (val.isFloatingPoint()) { - p = val.to(); + p = val.to(); } else { AT_ERROR("norm_kernel_cuda_impl expects norm to be integer or float"); } - if (p == static_cast(0)) { - gpu_reduce_kernel(iter, NormZeroOps(), 0); - } else if (p == static_cast(1)) { - gpu_reduce_kernel(iter, NormOneOps(), 0); - } else if (p == static_cast(2)) { - gpu_reduce_kernel(iter, NormTwoOps(), 0); - } else if (p == static_cast(INFINITY)) { - gpu_reduce_kernel(iter, AbsMaxOps(), std::numeric_limits::min()); - } else if (p == static_cast(-INFINITY)) { - gpu_reduce_kernel(iter, AbsMinOps(), std::numeric_limits::max()); + if (p == static_cast(0)) { + gpu_reduce_kernel(iter, NormZeroOps(), 0); + } else if (p == static_cast(1)) { + gpu_reduce_kernel(iter, NormOneOps(), 0); + } else if (p == static_cast(2)) { + gpu_reduce_kernel(iter, NormTwoOps(), 0); + } else if (p == static_cast(INFINITY)) { + gpu_reduce_kernel(iter, AbsMaxOps(), std::numeric_limits::min()); + } else if (p == static_cast(-INFINITY)) { + gpu_reduce_kernel(iter, AbsMinOps(), std::numeric_limits::max()); } else { - gpu_reduce_kernel(iter, NormOps{ acc_t(p) }, 0); + gpu_reduce_kernel(iter, NormOps{ acc_t(p) }, 0); } } static void norm_kernel_cuda(TensorIterator& iter, Scalar p) { - if (iter.dtype() == kHalf) { + if (iter.input_dtype() == kHalf) { return norm_kernel_cuda_impl(iter, p); - } else if (iter.dtype(1) == kHalf && iter.dtype() == kFloat) { + } else if (iter.dtype(1) == kHalf && iter.input_dtype() == kFloat) { // type promotion that does cast and reduction in a single kernel return norm_kernel_cuda_impl(iter, p); } - else if(iter.dtype() == kBFloat16) { + else if(iter.input_dtype() == kBFloat16) { return norm_kernel_cuda_impl(iter, p); - } else if (iter.dtype(1) == kBFloat16 && iter.dtype() == kFloat) { + } else if (iter.dtype(1) == kBFloat16 && iter.input_dtype() == kFloat) { // type promotion that does cast and reduction in a single kernel return norm_kernel_cuda_impl(iter, p); } - AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "norm_cuda", [&]() { + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(iter.input_dtype(), "norm_cuda", [&] { norm_kernel_cuda_impl(iter, p); }); } diff --git a/test/test_linalg.py b/test/test_linalg.py index 1b5e2f0ee712..5e7e0c273dcf 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -867,23 +867,61 @@ def test_kron_errors_and_warnings(self, device, dtype): # as expected, according to the function's documentation @skipCUDAIfNoMagma def test_norm_dtype(self, device): - def run_test_case(input_size, ord, keepdim, from_dtype, to_dtype, compare_dtype): + def run_test_case(input_size, ord, keepdim, from_dtype, to_dtype): + # Determine the best dtype to use for comparisons between tensors + # of two different types + def get_compare_dtype(type0, type1): + types_32bit_based = [torch.float, torch.cfloat] + is_complex = type0.is_complex or type1.is_complex + + if type0 in types_32bit_based or type1 in types_32bit_based: + return torch.cfloat if is_complex else torch.float + else: + return torch.cdouble if is_complex else torch.double + + compare_dtype = get_compare_dtype(from_dtype, to_dtype) + + def get_value_type(dtype): + if dtype == torch.cfloat: + return torch.float + elif dtype == torch.cdouble: + return torch.double + elif dtype == torch.complex32: + return torch.float16 + else: + return dtype + msg = ( 
f'input_size={input_size}, ord={ord}, keepdim={keepdim}, ' f'from_dtype={from_dtype}, to_dtype={to_dtype}') input = torch.randn(*input_size, dtype=from_dtype, device=device) - result = torch.linalg.norm(input, ord, keepdim=keepdim, dtype=from_dtype) - self.assertEqual(result.dtype, from_dtype, msg=msg) - result_converted = torch.linalg.norm(input, ord, keepdim=keepdim, dtype=to_dtype) - self.assertEqual(result_converted.dtype, to_dtype, msg=msg) - self.assertEqual(result.to(compare_dtype), result_converted.to(compare_dtype), msg=msg) + result = torch.linalg.norm(input, ord, keepdim=keepdim) + if from_dtype.is_complex: + # By default, norm downgrades a complex input to the corresponding real number type + self.assertEqual(result.dtype, get_value_type(from_dtype), msg=msg) + else: + self.assertEqual(result.dtype, from_dtype, msg=msg) - result_out_converted = torch.empty_like(result_converted) - torch.linalg.norm(input, ord, keepdim=keepdim, dtype=to_dtype, out=result_out_converted) - self.assertEqual(result_out_converted.dtype, to_dtype, msg=msg) - self.assertEqual(result_converted, result_out_converted, msg=msg) + result_out = torch.empty((), dtype=to_dtype, device=device) + torch.linalg.norm(input, ord, keepdim=keepdim, out=result_out) + self.assertEqual(result_out.dtype, to_dtype, msg=msg) + self.assertEqual(result.to(compare_dtype), result_out.to(compare_dtype), msg=msg) - ord_vector = [0, 1, -1, 2, -2, 3, -3, 4.5, -4.5, inf, -inf, None] + result_with_dtype = torch.linalg.norm(input, ord, keepdim=keepdim, dtype=to_dtype) + self.assertEqual(result_with_dtype.dtype, to_dtype, msg=msg) + + if from_dtype.is_complex: + result_convert_first = torch.linalg.norm(input.to(to_dtype), ord, keepdim=keepdim) + self.assertEqual(result_with_dtype.to(compare_dtype), result_convert_first.to(compare_dtype), msg=msg) + else: + self.assertEqual(result.to(compare_dtype), result_with_dtype.to(compare_dtype), msg=msg) + + result_out_with_dtype = torch.empty_like(result_with_dtype) + torch.linalg.norm(input, ord, keepdim=keepdim, dtype=to_dtype, out=result_out_with_dtype) + self.assertEqual(result_out_with_dtype.dtype, to_dtype, msg=msg) + self.assertEqual(result_with_dtype, result_out_with_dtype, msg=msg) + + ord_vector = [0, 0.1, -0.1, 1, -1, 2, -2, 3, -3, 4.5, -4.5, inf, -inf, None] ord_matrix = ['fro', 'nuc', 1, -1, 2, -2, inf, -inf, None] S = 10 test_cases = [ @@ -893,15 +931,16 @@ def run_test_case(input_size, ord, keepdim, from_dtype, to_dtype, compare_dtype) for keepdim in [True, False]: for input_size, ord_settings in test_cases: for ord in ord_settings: - # float to double - run_test_case(input_size, ord, keepdim, torch.float, torch.double, torch.float) - # double to float - run_test_case(input_size, ord, keepdim, torch.double, torch.double, torch.float) + dtypes = [torch.float, torch.double, torch.cfloat, torch.cdouble] + for from_dtype, to_dtype in itertools.product(dtypes, dtypes): + run_test_case(input_size, ord, keepdim, from_dtype, to_dtype) # Make sure that setting dtype != out.dtype raises an error dtype_pairs = [ (torch.float, torch.double), (torch.double, torch.float), + (torch.cfloat, torch.cdouble), + (torch.cdouble, torch.cfloat), ] for keepdim in [True, False]: for input_size, ord_settings in test_cases: @@ -1008,11 +1047,6 @@ def run_test_case(input, p): for input_size in input_sizes: input = torch.randn(*input_size, dtype=dtype, device=device) for p in norm_types: - # frobenius norm not supported for complex tensors - if dtype.is_complex and p == 'fro': - with 
self.assertRaisesRegex(RuntimeError, "frobenius norm not supported for complex tensors"): - torch.linalg.cond(input, p) - continue run_test_case(input, p) # test empty batch sizes @@ -1040,7 +1074,7 @@ def run_test_case(input, p): for input_size in input_sizes: input = torch.randn(*input_size, dtype=dtype, device=device) for p in ['fro', 2]: - expected_dtype = a.real.dtype if dtype.is_complex and p == 2 else dtype + expected_dtype = a.real.dtype if dtype.is_complex else dtype expected = torch.zeros(input_size[:-2], dtype=expected_dtype, device=device) actual = torch.linalg.cond(input, p) self.assertEqual(actual, expected) @@ -1068,7 +1102,7 @@ def test_cond_errors_and_warnings(self, device, dtype): # if non-empty out tensor with wrong shape is passed a warning is given a = torch.ones((2, 2), dtype=dtype, device=device) for p in ['fro', 2]: - real_dtype = a.real.dtype if dtype.is_complex and p == 2 else dtype + real_dtype = a.real.dtype if dtype.is_complex else dtype out = torch.empty(a.shape, dtype=real_dtype, device=device) with warnings.catch_warnings(record=True) as w: # Trigger warning @@ -1231,8 +1265,7 @@ def run_error_test_case(input, ord, dim, keepdim, error_type, error_regex): for ord in ord_settings: run_error_test_case(input, ord, dim, keepdim, error_type, error_regex) - # Test complex number inputs for linalg.norm. Some cases are not supported yet, so - # this test also verifies that those cases raise an error. + # Test complex number inputs for linalg.norm @skipCUDAIfNoMagma @skipCPUIfNoLapack @dtypes(torch.cfloat, torch.cdouble) @@ -1241,72 +1274,95 @@ def gen_error_message(input_size, ord, keepdim, dim=None): return "complex norm failed for input size %s, ord=%s, keepdim=%s, dim=%s" % ( input_size, ord, keepdim, dim) - if self.device_type == 'cpu': - supported_vector_ords = [0, 1, 3, inf, -1, -2, -3, -inf] - supported_matrix_ords = ['nuc', 1, 2, inf, -1, -2, -inf] - unsupported_vector_ords = [ - (2, r'norm with p=2 not supported for complex tensors'), - (None, r'norm with p=2 not supported for complex tensors'), - ] - unsupported_matrix_ords = [ - ('fro', r'frobenius norm not supported for complex tensors'), - (None, r'norm with p=2 not supported for complex tensors'), - ] - - elif self.device_type == 'cuda': - supported_vector_ords = [inf, -inf] - supported_matrix_ords = [1, inf, -1, -inf] - unsupported_vector_ords = [ - (0, r'norm_cuda" not implemented for \'Complex'), - (1, r'norm_cuda" not implemented for \'Complex'), - (2, r'norm with p=2 not supported for complex tensors'), - (-1, r'norm_cuda" not implemented for \'Complex'), - (-2, r'norm_cuda" not implemented for \'Complex'), - (None, r'norm with p=2 not supported for complex tensors'), - ] - unsupported_matrix_ords = [ - (None, r'norm with p=2 not supported for complex tensors'), - ('fro', r'frobenius norm not supported for complex tensors'), - ] + vector_ords = [None, 0, 1, 2, 3, inf, -1, -2, -3, -inf] + matrix_ords = [None, 'fro', 'nuc', 1, 2, inf, -1, -2, -inf] # Test supported ords for keepdim in [False, True]: # vector norm x = torch.randn(25, device=device, dtype=dtype) xn = x.cpu().numpy() - for ord in supported_vector_ords: + for ord in vector_ords: res = torch.linalg.norm(x, ord, keepdim=keepdim).cpu() expected = np.linalg.norm(xn, ord, keepdims=keepdim) msg = gen_error_message(x.size(), ord, keepdim) self.assertEqual(res.shape, expected.shape, msg=msg) self.assertEqual(res, expected, msg=msg) + res_out = torch.Tensor().to(device) + torch.linalg.norm(x, ord, keepdim=keepdim, out=res_out) + 
self.assertEqual(res_out.shape, expected.shape, msg=msg) + self.assertEqual(res_out.cpu(), expected, msg=msg) + # matrix norm x = torch.randn(25, 25, device=device, dtype=dtype) xn = x.cpu().numpy() - for ord in supported_matrix_ords: - # TODO: Need to fix abort when nuclear norm is given cdouble input: - # "double free or corruption (!prev) Aborted (core dumped)" - if ord == 'nuc' and dtype == torch.cdouble: - continue + for ord in matrix_ords: res = torch.linalg.norm(x, ord, keepdim=keepdim).cpu() expected = np.linalg.norm(xn, ord, keepdims=keepdim) msg = gen_error_message(x.size(), ord, keepdim) self.assertEqual(res.shape, expected.shape, msg=msg) self.assertEqual(res, expected, msg=msg) - # Test unsupported ords - # vector norm - x = torch.randn(25, device=device, dtype=dtype) - for ord, error_msg in unsupported_vector_ords: - with self.assertRaisesRegex(RuntimeError, error_msg): - torch.linalg.norm(x, ord) + res_out = torch.Tensor().to(device) + torch.linalg.norm(x, ord, keepdim=keepdim, out=res_out) + self.assertEqual(res_out.shape, expected.shape, msg=msg) + self.assertEqual(res_out.cpu(), expected, msg=msg) + + # Test complex number inputs for linalg.norm + @skipCUDAIfNoMagma + @skipCPUIfNoLapack + @dtypes(torch.cfloat, torch.cdouble) + def test_norm_complex_autograd(self, device, dtype): + def gen_error_message(input_size, ord, keepdim, dim=None): + return "complex norm autograd failed for input size %s, ord=%s, keepdim=%s, dim=%s" % ( + input_size, ord, keepdim, dim) + + if dtype == torch.cfloat: + dtype_real = torch.float + elif dtype == torch.cdouble: + dtype_real = torch.double + else: + raise RuntimeError(f'dtype not supported in this test: {dtype}') + + vector_ords = [None, 0, 1, 2, 3, inf, -1, -2, -3, -inf] + matrix_ords = [None, 'fro', 1, inf, -1, -inf] + + # TODO: Fix autograd for matrix orders 'nuc', 2, and -2 by adding complex + # support to svd's backward method. 
Once this is done, these ords + # should be added to `matrix_ords` above + matrix_ords_unsupported = ['nuc', 2, -2] + + def run_test_case(x, ord, keepdim): + res = torch.linalg.norm(x, ord, keepdim=keepdim) + res.backward() + + x_real = x.clone().detach().abs().requires_grad_(True) + res_real = torch.linalg.norm(x_real, ord, keepdim=keepdim) + res_real.backward() + + msg = gen_error_message(x.size(), ord, keepdim) - # matrix norm - x = torch.randn(25, 25, device=device, dtype=dtype) - for ord, error_msg in unsupported_matrix_ords: - with self.assertRaisesRegex(RuntimeError, error_msg): - torch.linalg.norm(x, ord) + self.assertEqual(res.shape, res_real.shape, msg=msg) + self.assertEqual(res, res_real, msg=msg) + self.assertEqual(x.grad.abs(), x_real.grad, msg=msg) + + # Test supported ords + for keepdim in [False, True]: + for ord in vector_ords: + x = torch.randn(25, dtype=dtype, device=device, requires_grad=True) + run_test_case(x, ord, keepdim) + + for ord in matrix_ords: + x = torch.randn(25, 25, dtype=dtype, device=device, requires_grad=True) + run_test_case(x, ord, keepdim) + + for ord in matrix_ords_unsupported: + x = torch.randn(25, 25, dtype=dtype, device=device, requires_grad=True) + with self.assertRaisesRegex( + RuntimeError, + r'svd does not support automatic differentiation for outputs with complex dtype'): + res = torch.linalg.norm(x, ord, keepdim=keepdim) # Test that linal.norm gives the same result as numpy when inputs # contain extreme values (inf, -inf, nan) @@ -1370,12 +1426,6 @@ def run_test_case(input, ord, dim, keepdim, should_error): with self.assertRaises(RuntimeError): torch.linalg.norm(input, ord, dim, keepdim) else: - if dtype in [torch.cfloat, torch.cdouble] and ord in [2, None]: - # TODO: Once these ord values have support for complex numbers, - # remove this error test case - with self.assertRaises(RuntimeError): - torch.linalg.norm(input, ord, dim, keepdim) - return result_numpy = np.linalg.norm(input_numpy, ord, dim, keepdim) result = torch.linalg.norm(input, ord, dim, keepdim) self.assertEqual(result, result_numpy, msg=msg) @@ -1402,12 +1452,6 @@ def run_test_case(input, ord, dim, keepdim, should_error): @dtypes(torch.float, torch.double, torch.cfloat, torch.cdouble) def test_norm_matrix_degenerate_shapes(self, device, dtype): def run_test_case(input, ord, dim, keepdim, should_error): - if dtype in [torch.cfloat, torch.cdouble] and ord in ['fro', None]: - # TODO: Once these ord values have support for complex numbers, - # remove this error test case - with self.assertRaises(RuntimeError): - torch.linalg.norm(input, ord, dim, keepdim) - return msg = f'input.size()={input.size()}, ord={ord}, dim={dim}, keepdim={keepdim}, dtype={dtype}' input_numpy = input.cpu().numpy() if should_error: @@ -1668,39 +1712,26 @@ def gen_error_message(input_size, p, keepdim, dim=None): return "complex norm failed for input size %s, p=%s, keepdim=%s, dim=%s" % ( input_size, p, keepdim, dim) - if device == 'cpu': - for keepdim in [False, True]: - # vector norm - x = torch.randn(25, device=device) + 1j * torch.randn(25, device=device) - xn = x.cpu().numpy() - for p in [0, 1, 3, inf, -1, -2, -3, -inf]: - res = x.norm(p, keepdim=keepdim).cpu() - expected = np.linalg.norm(xn, p, keepdims=keepdim) - msg = gen_error_message(x.size(), p, keepdim) - self.assertEqual(res.shape, expected.shape, msg=msg) - self.assertEqual(res, expected, msg=msg) - - # matrix norm - x = torch.randn(25, 25, device=device) + 1j * torch.randn(25, 25, device=device) - xn = x.cpu().numpy() - for p in ['nuc']: - 
res = x.norm(p, keepdim=keepdim).cpu() - expected = np.linalg.norm(xn, p, keepdims=keepdim) - msg = gen_error_message(x.size(), p, keepdim) - self.assertEqual(res.shape, expected.shape, msg=msg) - self.assertEqual(res, expected, msg=msg) - - # TODO: remove error test and add functionality test above when 2-norm support is added - with self.assertRaisesRegex(RuntimeError, r'norm with p=2 not supported for complex tensors'): - x = torch.randn(2, device=device, dtype=torch.complex64).norm(p=2) - - # TODO: remove error test and add functionality test above when frobenius support is added - with self.assertRaisesRegex(RuntimeError, r'frobenius norm not supported for complex tensors'): - x = torch.randn(2, 2, device=device, dtype=torch.complex64).norm(p='fro') + for keepdim in [False, True]: + # vector norm + x = torch.randn(25, device=device) + 1j * torch.randn(25, device=device) + xn = x.cpu().numpy() + for p in [0, 1, 2, 3, inf, -1, -2, -3, -inf]: + res = x.norm(p, keepdim=keepdim).cpu() + expected = np.linalg.norm(xn, p, keepdims=keepdim) + msg = gen_error_message(x.size(), p, keepdim) + self.assertEqual(res.shape, expected.shape, msg=msg) + self.assertEqual(res, expected, msg=msg) - elif device == 'cuda': - with self.assertRaisesRegex(RuntimeError, r'"norm_cuda" not implemented for \'ComplexFloat\''): - (1j * torch.randn(25)).norm() + # matrix norm + x = torch.randn(25, 25, device=device) + 1j * torch.randn(25, 25, device=device) + xn = x.cpu().numpy() + for p in ['nuc', 'fro']: + res = x.norm(p, keepdim=keepdim).cpu() + expected = np.linalg.norm(xn, p, keepdims=keepdim) + msg = gen_error_message(x.size(), p, keepdim) + self.assertEqual(res.shape, expected.shape, msg=msg) + self.assertEqual(res, expected, msg=msg) # Ensure torch.norm with p='fro' and p=2 give the same results for mutually supported input combinations @dtypes(torch.float) diff --git a/torch/csrc/autograd/FunctionsManual.cpp b/torch/csrc/autograd/FunctionsManual.cpp index e46c08cfecc7..4d71d6759e0c 100644 --- a/torch/csrc/autograd/FunctionsManual.cpp +++ b/torch/csrc/autograd/FunctionsManual.cpp @@ -153,15 +153,15 @@ Tensor norm_backward(const Tensor & grad, const Tensor & self, const optional Date: Thu, 10 Dec 2020 10:27:21 -0800 Subject: [PATCH 128/250] Use new FFT operators in stft (#47601) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/47601 Fixes https://github.com/pytorch/pytorch/issues/42175#issuecomment-719933913 Test Plan: Imported from OSS Reviewed By: ngimel Differential Revision: D25457217 Pulled By: mruberry fbshipit-source-id: 455d216edd0b962eb7967ecb47cccc8d6865975b --- aten/src/ATen/native/SpectralOps.cpp | 41 ++++++++++++++++++++++------ torch/functional.py | 4 +++ 2 files changed, 37 insertions(+), 8 deletions(-) diff --git a/aten/src/ATen/native/SpectralOps.cpp b/aten/src/ATen/native/SpectralOps.cpp index 23e2caa71509..65d67629fa9f 100644 --- a/aten/src/ATen/native/SpectralOps.cpp +++ b/aten/src/ATen/native/SpectralOps.cpp @@ -710,12 +710,13 @@ Tensor stft(const Tensor& self, const int64_t n_fft, const optional hop const bool complex_fft = input.is_complex(); const auto onesided = onesidedOpt.value_or(!complex_fft); + const fft_norm_mode norm = normalized ? 
fft_norm_mode::by_root_n : fft_norm_mode::none; Tensor out; if (complex_fft) { TORCH_CHECK(!onesided, "Cannot have onesided output if window or input is complex"); - out = at::native::fft(at::view_as_real(input), 1, normalized); + out = at::_fft_c2c(input, input.dim() - 1, static_cast(norm), /*forward=*/true); } else { - out = at::native::rfft(input, 1, normalized, onesided); + out = at::_fft_r2c(input, input.dim() - 1, static_cast(norm), onesided); } out.transpose_(1, 2); @@ -724,12 +725,28 @@ Tensor stft(const Tensor& self, const int64_t n_fft, const optional hop } if (return_complex) { - return at::view_as_complex(out); - } else { return out; + } else { + return at::view_as_real(out); } } +// Create complex tensor from the old style of real tensor with size=(..., 2) +// This is to support istft in the transition to requiring complex input. +// NOTE: This may return a view of the input tensor, or might clone if necessary +static Tensor as_complex(const Tensor& self) { + const bool can_view_as_complex = [&]{ + auto strides = self.strides(); + for (int64_t i = 0; i + 1 < strides.size(); ++i) { + if (strides[i] % 2 != 0) { + return false; + } + } + return strides.back() == 1 && self.storage_offset() % 2 == 0; + }(); + return at::view_as_complex(can_view_as_complex ? self : self.clone(MemoryFormat::Contiguous)); +} + /* Inverse Short-time Fourier Transform * * This is modeled after librosa but with support for complex time-domain @@ -756,6 +773,11 @@ Tensor istft(const Tensor& self, const int64_t n_fft, const optional ho const auto hop_length = hop_lengthOpt.value_or(n_fft >> 2); const auto win_length = win_lengthOpt.value_or(n_fft); + if (!self.is_complex()) { + TORCH_WARN_ONCE( + "istft will require a complex-valued input tensor in a future PyTorch release. " + "Matching the output from stft with return_complex=True. "); + } Tensor input = self.is_complex() ? at::view_as_real(self) : self; const auto input_dim = input.dim(); const auto n_frames = input.size(-2); @@ -826,16 +848,19 @@ Tensor istft(const Tensor& self, const int64_t n_fft, const optional ho input = input.unsqueeze(0); } - input = input.transpose(1, 2); // size: (channel, n_frames, fft_size, 2) + input = as_complex(input.transpose(1, 2)); // size: (channel, n_frames, fft_size, 2) + const fft_norm_mode norm = normalized ? 
fft_norm_mode::by_root_n : fft_norm_mode::by_n; if (return_complex) { TORCH_CHECK(!onesided, "Cannot have onesided output if window or input is complex"); - input = at::native::ifft(input, 1, normalized); // size: (channel, n_frames, n_fft) - input = at::view_as_complex(input); + input = at::_fft_c2c(input, input.dim() - 1, static_cast(norm), /*forward=*/false); // size: (channel, n_frames, n_fft) } else { TORCH_CHECK(!window.defined() || !window.is_complex(), "Complex windows are incompatible with return_complex=False"); - input = at::native::irfft(input, 1, normalized, onesided, {n_fft,}); // size: (channel, n_frames, n_fft) + if (!onesided) { + input = input.slice(-1, 0, n_fft / 2 + 1); + } + input = at::_fft_c2r(input, input.dim() - 1, static_cast(norm), n_fft); // size: (channel, n_frames, n_fft) } TORCH_INTERNAL_ASSERT(input.size(2) == n_fft); diff --git a/torch/functional.py b/torch/functional.py index f856617e5a8f..f21fcda4566b 100644 --- a/torch/functional.py +++ b/torch/functional.py @@ -587,6 +587,10 @@ def istft(input: Tensor, n_fft: int, hop_length: Optional[int] = None, can either be complex (``channel``, ``fft_size``, ``n_frame``), or real (``channel``, ``fft_size``, ``n_frame``, 2) where the ``channel`` dimension is optional. + + .. deprecated:: 1.8.0 + Real input is deprecated, use complex inputs as returned by + ``stft(..., return_complex=True)`` instead. n_fft (int): Size of Fourier transform hop_length (Optional[int]): The distance between neighboring sliding window frames. (Default: ``n_fft // 4``) From 840e71f4e6a2c54ba48f782c007b45b1dfe08519 Mon Sep 17 00:00:00 2001 From: Yixin Bao Date: Thu, 10 Dec 2020 10:41:00 -0800 Subject: [PATCH 129/250] Check CUDA kernel launches (/fbcode/caffe2/) (#49145) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49145 Pull Request resolved: https://github.com/pytorch/pytorch/pull/49105 (1) Add a safety check `C10_CUDA_KERNEL_LAUNCH_CHECK()` after each kernel launch. This diff only changes the files inside the directory /fbsource/fbcode/caffe2/modules/, /fbsource/fbcode/caffe2/fb/, /fbsource/fbcode/caffe2/test/. (2) Get rid of old check `AT_CUDA_CHECK(cudaGetLastError())` when necessary. Test Plan: Test build: ``` buck build mode/dev-nosan //caffe2/modules/detectron: buck test mode/dev-nosan //caffe2/modules/detectron: buck build mode/dev-nosan //caffe2/torch/fb/: buck test mode/dev-nosan //caffe2/torch/fb/: ``` To check for launches without checks: ``` python3 caffe2/torch/testing/check_kernel_launches.py ``` Make sure none of the updated files are in the returned list. 
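As a rough illustration of what the launch-without-check scan described in the Test Plan looks for, here is a minimal, self-contained sketch in Python. It is an illustrative assumption only, not the actual `caffe2/torch/testing/check_kernel_launches.py` script: it simply flags `<<<...>>>` kernel launches in a `.cu` source string that are not followed by `C10_CUDA_KERNEL_LAUNCH_CHECK()` within the next few lines.
```
import re

# Hypothetical, simplified stand-in for the real checker: report kernel
# launches that are not followed by C10_CUDA_KERNEL_LAUNCH_CHECK().
def find_unchecked_launches(cu_source: str, window: int = 5):
    lines = cu_source.splitlines()
    unchecked = []
    for i, line in enumerate(lines):
        if re.search(r"<<<.*>>>", line):  # a single-line CUDA kernel launch
            following = "\n".join(lines[i + 1 : i + 1 + window])
            if "C10_CUDA_KERNEL_LAUNCH_CHECK()" not in following:
                unchecked.append((i + 1, line.strip()))
    return unchecked

example = """
my_kernel<<<blocks, threads, 0, stream>>>(data, n);
C10_CUDA_KERNEL_LAUNCH_CHECK();
other_kernel<<<blocks, threads>>>(data, n);
"""
print(find_unchecked_launches(example))  # flags only the other_kernel launch
```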
Reviewed By: r-barnes Differential Revision: D25452852 fbshipit-source-id: d6657edab612c9e0fa99b29c68460be8b1a20064 --- modules/detectron/group_spatial_softmax_op.cu | 3 +++ modules/detectron/ps_roi_pool_op.cu | 2 ++ modules/detectron/roi_pool_f_op.cu | 2 ++ modules/detectron/select_smooth_l1_loss_op.cu | 2 ++ modules/detectron/sigmoid_cross_entropy_loss_op.cu | 5 +++++ modules/detectron/sigmoid_focal_loss_op.cu | 2 ++ modules/detectron/smooth_l1_loss_op.cu | 3 +++ modules/detectron/softmax_focal_loss_op.cu | 5 +++++ modules/detectron/spatial_narrow_as_op.cu | 2 ++ modules/detectron/upsample_nearest_op.cu | 3 +++ test/cpp_extensions/cuda_extension.cu | 2 ++ test/cpp_extensions/cuda_extension_kernel.cu | 2 ++ test/cpp_extensions/cuda_extension_kernel2.cu | 2 ++ torch/lib/c10d/test/CUDATest.cu | 1 + 14 files changed, 36 insertions(+) diff --git a/modules/detectron/group_spatial_softmax_op.cu b/modules/detectron/group_spatial_softmax_op.cu index 92e89ae5acc2..a37a3fba55a7 100644 --- a/modules/detectron/group_spatial_softmax_op.cu +++ b/modules/detectron/group_spatial_softmax_op.cu @@ -112,6 +112,7 @@ bool GroupSpatialSoftmaxOp::RunOnDevice() { GroupSpatialSoftmaxKernel<<>>( N, A, W, H, Xdata, Pdata, num_classes_); + C10_CUDA_KERNEL_LAUNCH_CHECK(); return true; } @@ -158,11 +159,13 @@ bool GroupSpatialSoftmaxGradientOp::RunOnDevice() { SumProbsKernel<<>>( N, A, W, H, Ydata, dYdata, sum_probs_data, num_classes_); + C10_CUDA_KERNEL_LAUNCH_CHECK(); // Step 2: dX[i] = dX[i] - s SubSumKernel<<>>( N, A, W, H, sum_probs_.data(), dXdata, num_classes_); + C10_CUDA_KERNEL_LAUNCH_CHECK(); // Step 3: dX[i] = Y[i] * dX[i] math::Mul(Y.size(), dXdata, Ydata, dXdata, &context_); diff --git a/modules/detectron/ps_roi_pool_op.cu b/modules/detectron/ps_roi_pool_op.cu index 1ba418be5c99..68e4ec377d62 100644 --- a/modules/detectron/ps_roi_pool_op.cu +++ b/modules/detectron/ps_roi_pool_op.cu @@ -253,6 +253,7 @@ bool PSRoIPoolOp::RunOnDevice() { output_size, X.data(), spatial_scale_, X.dim32(1), X.dim32(2), X.dim32(3), pooled_height_, pooled_width_, R.data(), output_dim_, group_size_, Y->mutable_data(), A->mutable_data()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); return true; } @@ -276,6 +277,7 @@ bool PSRoIPoolGradientOp::RunOnDevice() { dY.size(), dY.data(), A.data(), R.dim32(0), spatial_scale_, X.dim32(1), X.dim32(2), X.dim32(3), pooled_height_, pooled_width_, output_dim_, dX->mutable_data(), R.data()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); return true; } diff --git a/modules/detectron/roi_pool_f_op.cu b/modules/detectron/roi_pool_f_op.cu index 62948f7eacbe..b261911b95a1 100644 --- a/modules/detectron/roi_pool_f_op.cu +++ b/modules/detectron/roi_pool_f_op.cu @@ -149,6 +149,7 @@ bool RoIPoolFOp::RunOnDevice() { output_size, X.data(), spatial_scale_, X.dim32(1), X.dim32(2), X.dim32(3), pooled_height_, pooled_width_, R.data(), Y->mutable_data(), A->mutable_data()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); return true; } @@ -173,6 +174,7 @@ bool RoIPoolFGradientOp::RunOnDevice() { dY.size(), dY.data(), A.data(), R.dim32(0), spatial_scale_, X.dim32(1), X.dim32(2), X.dim32(3), pooled_height_, pooled_width_, dX->mutable_data(), R.data()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } return true; } diff --git a/modules/detectron/select_smooth_l1_loss_op.cu b/modules/detectron/select_smooth_l1_loss_op.cu index 9065bfc7afbe..ce68fcff634d 100644 --- a/modules/detectron/select_smooth_l1_loss_op.cu +++ b/modules/detectron/select_smooth_l1_loss_op.cu @@ -129,6 +129,7 @@ bool SelectSmoothL1LossOp::RunOnDevice() { M, Y_hat.data(), Y.data(), L.data(), 
buff_.mutable_data(), S.data(), beta_); + C10_CUDA_KERNEL_LAUNCH_CHECK(); // Sum of all losses // al := sum_i l_i @@ -175,6 +176,7 @@ bool SelectSmoothL1LossGradientOp::RunOnDevice() { D, H, W, M, Y_hat.data(), Y.data(), L.data(), d_Y_hat->mutable_data(), d_avg_loss.data(), scale_, S.data(), beta_); + C10_CUDA_KERNEL_LAUNCH_CHECK(); return true; } diff --git a/modules/detectron/sigmoid_cross_entropy_loss_op.cu b/modules/detectron/sigmoid_cross_entropy_loss_op.cu index d69a7b41dc33..bb86560fcb01 100644 --- a/modules/detectron/sigmoid_cross_entropy_loss_op.cu +++ b/modules/detectron/sigmoid_cross_entropy_loss_op.cu @@ -93,6 +93,8 @@ bool SigmoidCrossEntropyLossOp::RunOnDevice() { T.data(), losses_.mutable_data(), counts_.mutable_data()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + float* avg_loss_data = avg_loss->mutable_data(); math::Sum( losses_.size(), losses_.data(), avg_loss_data, &context_); @@ -106,6 +108,7 @@ bool SigmoidCrossEntropyLossOp::RunOnDevice() { CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>(normalizer_.size(), normalizer_data, 1e-5); + C10_CUDA_KERNEL_LAUNCH_CHECK(); math::Div( 1, avg_loss_data, normalizer_data, avg_loss_data, &context_); } @@ -135,6 +138,7 @@ bool SigmoidCrossEntropyLossGradientOp::RunOnDevice() { T.data(), dX->mutable_data(), counts_.mutable_data()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); if (normalize_) { float* normalizer_data = normalizer_.mutable_data(); math::Sum( @@ -145,6 +149,7 @@ bool SigmoidCrossEntropyLossGradientOp::RunOnDevice() { CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>(normalizer_.size(), normalizer_data, 1e-5); + C10_CUDA_KERNEL_LAUNCH_CHECK(); math::Div( 1, d_avg_loss.data(), diff --git a/modules/detectron/sigmoid_focal_loss_op.cu b/modules/detectron/sigmoid_focal_loss_op.cu index 5b130c8dfc1f..e6f2dea21b5d 100644 --- a/modules/detectron/sigmoid_focal_loss_op.cu +++ b/modules/detectron/sigmoid_focal_loss_op.cu @@ -134,6 +134,7 @@ bool SigmoidFocalLossOp::RunOnDevice() { N, D, H, W, X.data(), T.data(), wp.data(), gamma_, alpha_, num_classes_, losses_.mutable_data()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); math::Sum( losses_.size(), losses_.data(), avg_loss_data, &context_); @@ -165,6 +166,7 @@ bool SigmoidFocalLossGradientOp::RunOnDevice() { N, D, H, W, X.data(), T.data(), dX->mutable_data(), wp.data(), gamma_, alpha_, num_classes_, d_avg_loss.data()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); math::Scale( dX->size(), scale_, diff --git a/modules/detectron/smooth_l1_loss_op.cu b/modules/detectron/smooth_l1_loss_op.cu index 1a3e8b78b53f..ea835a4bc2b9 100644 --- a/modules/detectron/smooth_l1_loss_op.cu +++ b/modules/detectron/smooth_l1_loss_op.cu @@ -102,6 +102,7 @@ bool SmoothL1LossOp::RunOnDevice() { context_.cuda_stream()>>>( buff_.size(), buff_.data(), buff_.mutable_data(), beta_); + C10_CUDA_KERNEL_LAUNCH_CHECK(); // Element-wise weighted smooth l1 loss (can be used to specify a per-element // loss weight) @@ -164,6 +165,8 @@ bool SmoothL1LossGradientOp::RunOnDevice() { context_.cuda_stream()>>>( buff_.size(), buff_.data(), d_Y_hat->mutable_data(), d_avg_loss.data(), scale_ / N, beta_); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + // Element-wise scale by alpha_in and alpha_out math::Mul( d_Y_hat->size(), d_Y_hat->data(), alpha_in.data(), diff --git a/modules/detectron/softmax_focal_loss_op.cu b/modules/detectron/softmax_focal_loss_op.cu index 93635269f176..b7f8d2423ebc 100644 --- a/modules/detectron/softmax_focal_loss_op.cu +++ b/modules/detectron/softmax_focal_loss_op.cu @@ -176,6 +176,7 @@ bool SoftmaxFocalLossOp::RunOnDevice() { <<>>( N, A, H, W, 
Xdata, P->mutable_data(), num_classes_); + C10_CUDA_KERNEL_LAUNCH_CHECK(); // Compute loss for each x,y location const int* Tdata = T.data(); @@ -184,6 +185,7 @@ bool SoftmaxFocalLossOp::RunOnDevice() { 0, context_.cuda_stream()>>>( N, A, H, W, P->data(), Tdata, losses_.mutable_data(), Wdata, gamma_, alpha_, num_classes_); + C10_CUDA_KERNEL_LAUNCH_CHECK(); // sum the losses float* avg_loss_data = avg_loss->mutable_data(); @@ -227,6 +229,8 @@ bool SoftmaxFocalLossGradientOp::RunOnDevice() { 0, context_.cuda_stream()>>>( N, A, H, W, Pdata, Tdata, buff_.mutable_data(), Wdata, gamma_, alpha_, num_classes_); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + // Compute the gradient with the weights const float* Bdata = buff_.data(); SoftmaxFocalLossGradientKernel @@ -234,6 +238,7 @@ bool SoftmaxFocalLossGradientOp::RunOnDevice() { 0, context_.cuda_stream()>>>( N, D, H, W, Pdata, Tdata, Bdata, d_avg_loss.data(), dX->mutable_data(), num_classes_); + C10_CUDA_KERNEL_LAUNCH_CHECK(); math::Scale( dX->size(), scale_, diff --git a/modules/detectron/spatial_narrow_as_op.cu b/modules/detectron/spatial_narrow_as_op.cu index 97ddc492eb07..ff8b5632e80a 100644 --- a/modules/detectron/spatial_narrow_as_op.cu +++ b/modules/detectron/spatial_narrow_as_op.cu @@ -115,6 +115,7 @@ bool SpatialNarrowAsOp::DoRunWithType() { out_width, A.template data(), C->template mutable_data()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); return true; } @@ -152,6 +153,7 @@ bool SpatialNarrowAsGradientOp::DoRunWithType() { out_width, dC.template data(), dA->template mutable_data()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); return true; } diff --git a/modules/detectron/upsample_nearest_op.cu b/modules/detectron/upsample_nearest_op.cu index 38af4254f922..0ea32e348c0b 100644 --- a/modules/detectron/upsample_nearest_op.cu +++ b/modules/detectron/upsample_nearest_op.cu @@ -164,6 +164,8 @@ bool UpsampleNearestOp::RunOnDevice() { upscale<<>>( input_data, output_data, no_elements, scale_, d1, d2, d3); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + return true; } @@ -209,6 +211,7 @@ bool UpsampleNearestGradientOp::RunOnDevice() { math::Set(no_elements, 0.f, gradInput_data, &context_); downscale<<>>( gradInput_data, gradOutput_data, no_elements, scale_, d1, d2, d3); + C10_CUDA_KERNEL_LAUNCH_CHECK(); return true; } diff --git a/test/cpp_extensions/cuda_extension.cu b/test/cpp_extensions/cuda_extension.cu index 29511af8a0ed..0c23d89df889 100644 --- a/test/cpp_extensions/cuda_extension.cu +++ b/test/cpp_extensions/cuda_extension.cu @@ -6,6 +6,7 @@ #include #include +#include #include @@ -26,4 +27,5 @@ void sigmoid_add_cuda(const float* x, const float* y, float* output, int size) { const int threads = 1024; const int blocks = (size + threads - 1) / threads; sigmoid_add_kernel<<>>(x, y, output, size); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } diff --git a/test/cpp_extensions/cuda_extension_kernel.cu b/test/cpp_extensions/cuda_extension_kernel.cu index 660219989863..4a942b0a20af 100644 --- a/test/cpp_extensions/cuda_extension_kernel.cu +++ b/test/cpp_extensions/cuda_extension_kernel.cu @@ -1,5 +1,6 @@ #include #include +#include #include @@ -20,4 +21,5 @@ void sigmoid_add_cuda(const float* x, const float* y, float* output, int size) { const int threads = 1024; const int blocks = (size + threads - 1) / threads; sigmoid_add_kernel<<>>(x, y, output, size); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } diff --git a/test/cpp_extensions/cuda_extension_kernel2.cu b/test/cpp_extensions/cuda_extension_kernel2.cu index 817bdf64ac8e..ddb240e5d067 100644 --- a/test/cpp_extensions/cuda_extension_kernel2.cu +++ 
b/test/cpp_extensions/cuda_extension_kernel2.cu @@ -1,5 +1,6 @@ #include #include +#include #include @@ -20,4 +21,5 @@ void tanh_add_cuda(const float* x, const float* y, float* output, int size) { const int threads = 1024; const int blocks = (size + threads - 1) / threads; tanh_add_kernel<<>>(x, y, output, size); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } diff --git a/torch/lib/c10d/test/CUDATest.cu b/torch/lib/c10d/test/CUDATest.cu index c47b29ea536d..88f87492206c 100644 --- a/torch/lib/c10d/test/CUDATest.cu +++ b/torch/lib/c10d/test/CUDATest.cu @@ -17,6 +17,7 @@ __global__ void waitClocks(const uint64_t count) { void cudaSleep(at::cuda::CUDAStream& stream, uint64_t clocks) { waitClocks<<<1, 1, 0, stream.stream()>>>(clocks); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } int cudaNumDevices() { From 909a9060e9f780c5ed7ba769d0fb11902a7b2ddf Mon Sep 17 00:00:00 2001 From: Xiong Wei Date: Thu, 10 Dec 2020 10:56:50 -0800 Subject: [PATCH 130/250] [vmap] implement batching rule for fill_ and zero_ (#48516) Summary: Fix https://github.com/pytorch/pytorch/issues/47755 - This PR implements batching rules for in-place operators `fill_` and `zero_`. - Testcases are added to the `test/test_vmap.py`. Pull Request resolved: https://github.com/pytorch/pytorch/pull/48516 Reviewed By: H-Huang Differential Revision: D25431557 Pulled By: zou3519 fbshipit-source-id: 437b0534dc0b818fbe05f7fcfcb649aa677483dc --- aten/src/ATen/BatchingRegistrations.cpp | 31 +++++++++++++++++++++++++ test/test_vmap.py | 31 +++++++++++++++++++++++++ 2 files changed, 62 insertions(+) diff --git a/aten/src/ATen/BatchingRegistrations.cpp b/aten/src/ATen/BatchingRegistrations.cpp index 0f9b31efefb9..0731e87f52a2 100644 --- a/aten/src/ATen/BatchingRegistrations.cpp +++ b/aten/src/ATen/BatchingRegistrations.cpp @@ -233,6 +233,32 @@ Tensor unsqueeze_batching_rule(const Tensor& self, int64_t dim) { return self_physical.newLogicalFromPhysical(result); } +Tensor& fill_inplace_scalar_batching_rule(Tensor& self, Scalar value) { + auto self_physical = MultiBatchVmapTransform::logicalToPhysical(self); + self_physical.tensor().fill_(value); + return self; +} + +Tensor& fill_inplace_tensor_batching_rule(Tensor& self, const Tensor& value) { + auto value_batched = isBatchedTensor(value); + + if (value_batched) { + auto physical_args = + BroadcastingVmapTransform::logicalToPhysical({self, value}); + physical_args[0].tensor().copy_(physical_args[1].tensor()); + } else { + auto self_physical = MultiBatchVmapTransform::logicalToPhysical(self); + self_physical.tensor().fill_(value); + } + return self; +} + +Tensor& zero_inplace_batching_rule(Tensor &self) { + auto self_physical = MultiBatchVmapTransform::logicalToPhysical(self); + self_physical.tensor().zero_(); + return self; +} + Tensor squeeze_batching_rule(const Tensor& self) { auto self_physical = MultiBatchVmapTransform::logicalToPhysical(self); auto physical_sizes = self_physical.tensor().sizes(); @@ -971,6 +997,11 @@ TORCH_LIBRARY_IMPL(aten, Batched, m) { m.impl("is_complex", native::is_complex); m.impl("conj", native::conj); + // inplace operations + m.impl("fill_.Scalar", fill_inplace_scalar_batching_rule); + m.impl("fill_.Tensor", fill_inplace_tensor_batching_rule); + m.impl("zero_", zero_inplace_batching_rule); + // view operations m.impl("as_strided", as_strided_batching_rule); m.impl("chunk", chunk_batching_rule); diff --git a/test/test_vmap.py b/test/test_vmap.py index 9192c00a94d3..5fa8426fd4ab 100644 --- a/test/test_vmap.py +++ b/test/test_vmap.py @@ -1365,6 +1365,37 @@ def test_expand_as(self): 
test(vmap(op), (torch.rand(B0, B1), torch.rand(B1, 2, 3, 5)), in_dims=(0, None)) test(vmap(vmap(op)), (torch.rand(B0, B1, B2), torch.rand(B0, B1, B2, 2, 3, 5))) + def test_fill_and_zero_inplace(self): + test = functools.partial(self._vmap_test, check_propagates_grad=False) + B0, B1 = 7, 11 + ops = ( + lambda t: t.fill_(0.1), + lambda t: t.fill_(torch.tensor(0.2)), + lambda t: t.zero_(), + ) + + for op in ops: + # Single vmap, various in_dims / out_dims + test(op, [TensorFactory.randn([B0, 3])]) + test(op, [TensorFactory.randn([2, 5, B0, 3])], in_dims=2) + test(op, [TensorFactory.randn([2, 5, B0, 3])], in_dims=2, out_dims=2) + + # Doubly nested vmap + test(vmap(op), [TensorFactory.randn([B0, B1])]) + test(vmap(op), [TensorFactory.randn([B1, 2, 5, B0, 3])], in_dims=2) + test(vmap(op, in_dims=2), [TensorFactory.randn([2, 5, B0, B1, 3])], + in_dims=2, out_dims=2) + + # test when value is a batched tensor for fill_ operator + B0, B1 = 3, 5 + test(Tensor.fill_, [TensorFactory.randn([B0, B1]), TensorFactory.randn(B0)]) + + with self.assertRaisesRegex(RuntimeError, + r"output with shape .+ doesn't match the broadcast shape"): + # Runtime Error is thrown when the tensor being written to isn't being vmapped over + vmap(Tensor.fill_, (None, 0))(TensorFactory.randn([B0, B1]), + TensorFactory.randn([B0])) + def _test_complex_views(self, op, dtypes): test = self._vmap_view_test From edbf9263ad3aaa834314429a1348ef874f6b28ac Mon Sep 17 00:00:00 2001 From: Yuchen Huang Date: Thu, 10 Dec 2020 11:00:50 -0800 Subject: [PATCH 131/250] [iOS] Bump up the cocoapods version (#49176) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49176 Bump up the cocoapods version ghstack-source-id: 118305636 Test Plan: CI Reviewed By: xta0 Differential Revision: D25466321 fbshipit-source-id: 916adc514c5edc8971445da893362a160cfc092b --- ios/LibTorch.podspec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ios/LibTorch.podspec b/ios/LibTorch.podspec index 236f1de7988f..b90cf6aff5d6 100644 --- a/ios/LibTorch.podspec +++ b/ios/LibTorch.podspec @@ -1,6 +1,6 @@ Pod::Spec.new do |s| s.name = 'LibTorch' - s.version = '1.7.0' + s.version = '1.7.1' s.authors = 'PyTorch Team' s.license = { :type => 'BSD' } s.homepage = 'https://github.com/pytorch/pytorch' From 2519348f60859cb076219ef96c4470e333885622 Mon Sep 17 00:00:00 2001 From: Yuchen Huang Date: Thu, 10 Dec 2020 11:04:16 -0800 Subject: [PATCH 132/250] [Binary Push] Update the awscli installation, use conda install rather than brew install (#49175) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49175 As title ghstack-source-id: 118306312 Test Plan: CI Reviewed By: xta0 Differential Revision: D25466577 fbshipit-source-id: 67a521947db3744695f0ab5f421483ab96d8ed9f --- .circleci/scripts/binary_ios_upload.sh | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.circleci/scripts/binary_ios_upload.sh b/.circleci/scripts/binary_ios_upload.sh index b530521f7f2d..f1022e113fa4 100644 --- a/.circleci/scripts/binary_ios_upload.sh +++ b/.circleci/scripts/binary_ios_upload.sh @@ -34,7 +34,13 @@ touch version.txt echo $(date +%s) > version.txt zip -r ${ZIPFILE} install src version.txt LICENSE # upload to aws -brew install awscli +# Install conda then 'conda install' awscli +curl --retry 3 -o ~/conda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh +chmod +x ~/conda.sh +/bin/bash ~/conda.sh -b -p ~/anaconda +export PATH="~/anaconda/bin:${PATH}" +source ~/anaconda/bin/activate +conda install -c 
conda-forge awscli --yes set +x export AWS_ACCESS_KEY_ID=${AWS_S3_ACCESS_KEY_FOR_PYTORCH_BINARY_UPLOAD} export AWS_SECRET_ACCESS_KEY=${AWS_S3_ACCESS_SECRET_FOR_PYTORCH_BINARY_UPLOAD} From 18c03b9f00b084bd04193a02a66a8bdd01d23931 Mon Sep 17 00:00:00 2001 From: Brian Hirsh Date: Thu, 10 Dec 2020 11:35:16 -0800 Subject: [PATCH 133/250] make duplicate def() calls an error in the dispatcher (#48098) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48098 Test Plan: Imported from OSS *** make duplicate def() calls an error in the dispatcher. Updating all fb operators to use the new dispatcher registration API Reviewed By: ezyang Differential Revision: D25056089 Pulled By: bdhirsh fbshipit-source-id: 8d7e381f16498a69cd20e6955d69acdc9a1d2791 --- aten/src/ATen/core/dispatch/Dispatcher.cpp | 31 ++++------------------ aten/src/ATen/native/quantized/library.cpp | 1 - 2 files changed, 5 insertions(+), 27 deletions(-) diff --git a/aten/src/ATen/core/dispatch/Dispatcher.cpp b/aten/src/ATen/core/dispatch/Dispatcher.cpp index 5184e8c5f698..5e3e91afbb45 100644 --- a/aten/src/ATen/core/dispatch/Dispatcher.cpp +++ b/aten/src/ATen/core/dispatch/Dispatcher.cpp @@ -134,13 +134,11 @@ RegistrationHandleRAII Dispatcher::registerDef(FunctionSchema schema, std::strin OperatorName op_name = schema.operator_name(); auto op = findOrRegisterName_(op_name); - if (op.operatorIterator_->def_count == 0) { - // NB: registerSchema is not idempotent! Only do it once! - op.operatorIterator_->op.registerSchema(std::move(schema), std::move(debug)); - listeners_->callOnOperatorRegistered(op); - } else { - checkSchemaCompatibility(op, schema, debug); - } + TORCH_CHECK(op.operatorIterator_->def_count == 0, "Tried to register an operator (", schema, ") with the same name and overload name multiple times.", + " Each overload's schema should only be registered with a single call to def().", + " Duplicate registration: ", debug, ". Original registration: ", op.operatorIterator_->op.debug()); + op.operatorIterator_->op.registerSchema(std::move(schema), std::move(debug)); + listeners_->callOnOperatorRegistered(op); // NB: do not increment the counts until AFTER error checking ++op.operatorIterator_->def_count; @@ -151,25 +149,6 @@ RegistrationHandleRAII Dispatcher::registerDef(FunctionSchema schema, std::strin }); } -void Dispatcher::checkSchemaCompatibility(const OperatorHandle& op, const FunctionSchema& schema, const std::string& debug) { - TORCH_CHECK(op.schema() == schema, "Tried to register multiple operators with the same name and the same overload name but different schemas: ", schema, " (", debug, ") vs ", op.schema(), " (", op.debug(), ")"); - if (schema.isDefaultAliasAnalysisKind()) { - // [BACKWARDS COMPAT] If the *new* schema is the default alias analysis - // kind, for BC, we will accept it. If we don't accept it, most extensions - // that override existing operators will stop working (as they generally did - // not specify alias information). - } else if (op.schema().isDefaultAliasAnalysisKind()) { - // [BACKWARDS COMPAT] If you POST-FACTO specify a non-default alias analysis - // kind after we already have a schema for a function, bong it in for BC - // reasons. 
- op.operatorIterator_->op.updateSchemaAliasAnalysis(schema.aliasAnalysis()); - } else { - TORCH_CHECK(op.schema().aliasAnalysis() == schema.aliasAnalysis(), - "Tried to define the schema for ", toString(op.operator_name()), " with different alias analysis kinds: ", - toString(op.schema().aliasAnalysis()), " (", op.debug(), ") vs ", toString(schema.aliasAnalysis()), " (", debug, ")"); - } -} - void Dispatcher::deregisterDef_(const OperatorHandle& op, const OperatorName& op_name) { // we need a lock to avoid concurrent writes std::lock_guard lock(mutex_); diff --git a/aten/src/ATen/native/quantized/library.cpp b/aten/src/ATen/native/quantized/library.cpp index 91a275a6aecf..2c8a6d4e4946 100644 --- a/aten/src/ATen/native/quantized/library.cpp +++ b/aten/src/ATen/native/quantized/library.cpp @@ -132,7 +132,6 @@ TORCH_LIBRARY(quantized, m) { m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_4bit(__torch__.torch.classes.quantized.EmbeddingPackedParamsBase weight, Tensor indices, Tensor? offsets=None, bool scale_grad_by_freq=False, int mode=0, bool pruned_weights=False, Tensor? per_sample_weights=None, Tensor? compressed_indices_mapping=None, bool include_last_offset=False) -> Tensor")); m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_byte(__torch__.torch.classes.quantized.EmbeddingPackedParamsBase weight, Tensor indices, bool pruned_weights=False) -> Tensor")); m.def(TORCH_SELECTIVE_SCHEMA("quantized::celu(Tensor self, float output_scale, int output_zero_point, Scalar alpha=1) -> Tensor")); - m.def(TORCH_SELECTIVE_SCHEMA("quantized::hardswish(Tensor input, float output_scale, int output_zero_point) -> Tensor")); m.def(TORCH_SELECTIVE_SCHEMA("quantized::group_norm(Tensor input, int num_groups, Tensor? weight, Tensor? bias, float eps, float output_scale, int output_zero_point) -> Tensor")); m.def(TORCH_SELECTIVE_SCHEMA("quantized::hardswish(Tensor input, float output_scale, int output_zero_point) -> Tensor")); m.def(TORCH_SELECTIVE_SCHEMA("quantized::instance_norm(Tensor input, Tensor? weight, Tensor? bias, float eps, float output_scale, int output_zero_point) -> Tensor")); From 70853c502143c65625977768e3ca68284ed0cbfa Mon Sep 17 00:00:00 2001 From: Elias Ellison Date: Thu, 10 Dec 2020 12:12:56 -0800 Subject: [PATCH 134/250] Dont use symbolic shapes check (#47810) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/47810 `bindSymbolicShapes` wasn't checking device or dtype at all, so it wasn't correct. It also isn't being used anywhere (num_profiles is always 1 and we don't use symbolic shapes). We shouldn't have it on until we are actually using symoblic shapes. 
Test Plan: Imported from OSS Reviewed By: bertmaher Differential Revision: D25286214 Pulled By: eellison fbshipit-source-id: 10fb175d0c75bd0159fb63aafc3b59cc5fd6c5af --- torch/csrc/jit/runtime/interpreter.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/torch/csrc/jit/runtime/interpreter.cpp b/torch/csrc/jit/runtime/interpreter.cpp index 4802fd2efafa..f8f35d1aa818 100644 --- a/torch/csrc/jit/runtime/interpreter.cpp +++ b/torch/csrc/jit/runtime/interpreter.cpp @@ -1416,10 +1416,7 @@ struct InterpreterStateImpl : c10::intrusive_ptr_target { auto t = input.toTensor(); const TypePtr& expected = frame.function->type_table_[inst.X + i]; auto expected_type = expected->cast(); - if (t.defined() && - (!frames.back().symbols2dims.bindSymbolicShapes( - t.sizes(), expected_type->symbolic_sizes()) || - !expected_type->matchTensor(t))) { + if (t.defined() && !expected_type->matchTensor(t)) { push(stack, false); break; } From 0e666a9f5a432a097332caa31a81e5b52890b1c3 Mon Sep 17 00:00:00 2001 From: Elias Ellison Date: Thu, 10 Dec 2020 12:12:56 -0800 Subject: [PATCH 135/250] [TensorExpr] Cache use of fallback in kernel invocation (#47812) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/47812 Previously we were checking the environment every kernel invocation for `tensorExprFuserEnabled`, which checks the environment for `PYTORCH_TENSOREXPR`. This is only a dev-exposed API, so I think it is fine to only check once when the kernel is initialized. The `disable_optimization` flag which is user-exposed more or less covers the same functionality. For fun, some benchmarking. I compared scripted before and after of ``` def foo(x, y): return x + y ``` for x, y = torch.tensor([1]). I also removed the prim::TypeCheck node to better isolate the kernel (I cheated). 
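For reference, a minimal sketch of how numbers like the ones below could be collected. The harness here (plain `timeit` over the scripted function after warm-up) is an assumption for illustration, not the author's exact benchmark script:
```
import timeit
import torch

@torch.jit.script
def foo(x, y):
    return x + y

x = y = torch.tensor([1])

# Warm up so the profiling executor specializes (and optionally fuses) foo.
for _ in range(10):
    foo(x, y)

# Ten repetitions of a tight inner loop, reported as sum / min / median.
times = [timeit.timeit(lambda: foo(x, y), number=100_000) for _ in range(10)]
times_sorted = sorted(times)
print("sum", sum(times), "min:", min(times), "median", times_sorted[len(times) // 2])
```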
Here is gist: https://gist.github.com/eellison/39f3bc368f5bd1f25ded4827feecd15e Without Changes Run 1: no fusion: sum 6.416894399004377 min: 0.6101883250012179 median 0.6412974080012646 with fusion: sum 6.437897570998757 min: 0.6350401220006461 median 0.6446951820034883 Without Changes Run2: no fusion: sum 6.601341788002173 min: 0.6292048720024468 median 0.6642187059987918 with fusion: sum 6.734651455997664 min: 0.6365462899993872 median 0.6755226659988693 With Changes Run1: no fusion: sum 6.097717430002376 min: 0.5977709550024883 median 0.613631643998815 with fusion: sum 6.1299369639964425 min: 0.5857932209983119 median 0.6159247440009494 With Changes Run2: no fusion: sum 6.5672018059995025 min: 0.6245676209982776 median 0.6386050750006689 with fusion: sum 6.489086147994385 min: 0.6236886289989343 median 0.6535737619997235 Test Plan: Imported from OSS Reviewed By: bertmaher Differential Revision: D25286210 fbshipit-source-id: a18b4918a7f7bed8a39112ae04b678e79026d39b --- torch/csrc/jit/passes/tensorexpr_fuser.cpp | 11 +------ torch/csrc/jit/tensorexpr/kernel.cpp | 35 +++++++++++----------- torch/csrc/jit/tensorexpr/kernel.h | 3 +- 3 files changed, 20 insertions(+), 29 deletions(-) diff --git a/torch/csrc/jit/passes/tensorexpr_fuser.cpp b/torch/csrc/jit/passes/tensorexpr_fuser.cpp index c53a71eb02e8..917d88a39605 100644 --- a/torch/csrc/jit/passes/tensorexpr_fuser.cpp +++ b/torch/csrc/jit/passes/tensorexpr_fuser.cpp @@ -1028,16 +1028,7 @@ Operation createTensorExprOp(const Node* node) { std::make_shared(node->g(attr::Subgraph)); return [kernel](Stack* stack) { RECORD_FUNCTION("TensorExpr", std::vector()); - if (!tensorexpr::fallbackAllowed()) { - kernel->run(*stack); - return 0; - } - - try { - kernel->run(*stack); - } catch (const std::runtime_error& e) { - kernel->fallback(*stack); - } + kernel->run(*stack); return 0; }; } diff --git a/torch/csrc/jit/tensorexpr/kernel.cpp b/torch/csrc/jit/tensorexpr/kernel.cpp index 2eb2a81b69eb..5fe290ac4193 100644 --- a/torch/csrc/jit/tensorexpr/kernel.cpp +++ b/torch/csrc/jit/tensorexpr/kernel.cpp @@ -1929,36 +1929,35 @@ void TensorExprKernel::compile() { TensorExprKernel::TensorExprKernel(const std::shared_ptr& subgraph) : graph_(subgraph), code_(subgraph, "") { - if (!fallbackAllowed()) { + + allow_fallback_ = fallbackAllowed(); + if (!allow_fallback_) { compile(); return; } + use_fallback_ = fallbackEnforced(); + if (use_fallback_) { + return; + } + try { compile(); } catch (...) { - fallback_ = true; + use_fallback_ = true; } } void TensorExprKernel::run(Stack& stack) { - if (fallbackEnforced()) { - fallback(stack); - return; - } - if (!fallbackAllowed()) { - runKernel(stack); - return; - } - - if (fallback_) { - fallback(stack); - return; - } - try { + if (!use_fallback_ && !allow_fallback_) { runKernel(stack); - } catch (...) { - fallback_ = true; + } else if (!use_fallback_ && allow_fallback_) { + try { + runKernel(stack); + } catch (...) 
{ + fallback(stack); + } + } else { fallback(stack); } } diff --git a/torch/csrc/jit/tensorexpr/kernel.h b/torch/csrc/jit/tensorexpr/kernel.h index 54a876eb85c6..c176ffd2d3b1 100644 --- a/torch/csrc/jit/tensorexpr/kernel.h +++ b/torch/csrc/jit/tensorexpr/kernel.h @@ -198,7 +198,8 @@ class TORCH_API TensorExprKernel { std::vector inputTypes_; std::shared_ptr graph_; Code code_; - bool fallback_{false}; + bool allow_fallback_{false}; + bool use_fallback_{false}; bool hasRandom_{false}; bool hasBroadcast_{false}; std::unordered_map> From 413caa7fd242ce2567179724fc1b6482430e0826 Mon Sep 17 00:00:00 2001 From: Elias Ellison Date: Thu, 10 Dec 2020 12:12:56 -0800 Subject: [PATCH 136/250] [NNC] Compute Tensor Output Properties in ininitialization (#47813) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/47813 We have some code paths that at kernel invocation seem to handle dynamic sizes, but I'm not sure how well it works because we have other parts of our code base that assume that tenso shapes are always fully specified. https://github.com/pytorch/pytorch/blob/master/torch/csrc/jit/tensorexpr/kernel.cpp#L1572 As with some other PRs in the stack, I think it would be good to remove the features that aren't on/actively being worked on while they are not used. I initially did this PR to try to speed up perf. I couldn't observe too much of a speed up, so we can decide to keep drop this PR if we want. Test Plan: Imported from OSS Reviewed By: bertmaher Differential Revision: D25286212 Pulled By: eellison fbshipit-source-id: 4ae66e0af88d649dd4e592bc78686538c2fdbaeb --- test/cpp/tensorexpr/test_kernel.cpp | 91 ++++++++++++++++++++-------- torch/csrc/jit/tensorexpr/kernel.cpp | 54 +++++++---------- torch/csrc/jit/tensorexpr/kernel.h | 2 + 3 files changed, 90 insertions(+), 57 deletions(-) diff --git a/test/cpp/tensorexpr/test_kernel.cpp b/test/cpp/tensorexpr/test_kernel.cpp index debee0596489..26e9e3326f70 100644 --- a/test/cpp/tensorexpr/test_kernel.cpp +++ b/test/cpp/tensorexpr/test_kernel.cpp @@ -138,7 +138,9 @@ TEST(Kernel, _3) { } } -TEST(Kernel, _4) { +TEST(Kernel, DISABLED_Shape_Inference) { + // disabled: doesn't do stride propagation, and isn't being used currently + // Test TensorExpr shape inference capabilities: it should only require shapes // for the inputs { @@ -396,7 +398,7 @@ TEST(Kernel, CatInputTypesPromotion) { %c : Double(5, 9, 2, strides=[18, 2, 1], device=cpu)): %dim : int = prim::Constant[value=1]() %inputs : Tensor[] = prim::ListConstruct(%a, %b, %c) - %r : Tensor = aten::cat(%inputs, %dim) # new size: [5,19,2] + %r : Double(5, 19, 2, strides=[38, 2, 1]) = aten::cat(%inputs, %dim) return (%r))IR"; auto graph = std::make_shared(); parseIR(graph_string, &*graph); @@ -465,7 +467,12 @@ at::Tensor iotaTensor(IntArrayRef sizes, const at::TensorOptions& options) { } // namespace -TEST(Kernel, SumAllAxes) { +TEST(Kernel, DISABLED_SumAllAxes) { + // [zero-dim tensors] + // NNC does not yet handle zero-dim tensors. aten::sum with no axis + // input returns a zero-dim tensors, so these tests must be disabled + // until we add support for zero-dim tensors. + // Test lowering of sum on all axes. 
const auto graph_template = R"IR( graph(%0 : Float(5, 3, strides=[3, 1], device=cpu)): @@ -512,6 +519,19 @@ TEST(Kernel, SumAllAxes) { } } +std::string li_to_str(at::ArrayRef li) { + std::stringstream out; + bool first = true; + for (auto elem: li) { + if (!first) { + out << ", "; + } + out << elem; + first = false; + } + return out.str(); +} + TEST(Kernel, SumOneAxis) { // Test lowering of sum on one axis. const auto graph_template = R"IR( @@ -519,7 +539,7 @@ TEST(Kernel, SumOneAxis) { %1 : int[] = prim::Constant[value=[${dim}]]() %2 : bool = prim::Constant[value=${keepdim}]() %3 : ${dtype} - %4 : Tensor = aten::sum(%0, %1, %2, %3) + %4 : ${out_dtype}(${size}, strides=[${strides}], device=cpu) = aten::sum(%0, %1, %2, %3) return (%4))IR"; auto a = iotaTensor({5, 3}, TensorOptions(kCPU).dtype(at::kFloat)); @@ -531,17 +551,23 @@ TEST(Kernel, SumOneAxis) { env.d("dim", dim); env.d("keepdim", keepdim); env.s("dtype", dtypeConstant(scalar_type)); - const auto graph_string = format(graph_template, env); - - auto graph = std::make_shared(); - parseIR(graph_string, &*graph); - - auto o = at::empty({}, TensorOptions(kCPU)); c10::optional dtype; if (scalar_type != ScalarType::None) { dtype = static_cast(scalar_type); } auto ref = a.sum({dim}, /*keepdim=*/keepdim, /*dtype=*/dtype); + if (scalar_type == ScalarType::None) { + env.s("out_dtype", "Float"); + } else { + env.s("out_dtype", "Double"); + } + env.s("size", li_to_str(ref.sizes())); + env.s("strides", li_to_str(ref.strides())); + const auto graph_string = format(graph_template, env); + auto graph = std::make_shared(); + parseIR(graph_string, &*graph); + + auto o = at::empty({}, TensorOptions(kCPU)); TensorExprKernel k(graph); std::vector inputs = {a}; Stmt* s = k.getCodeGenStmt(); @@ -578,7 +604,7 @@ TEST(Kernel, SumMultipleAxes) { %3 : int[] = prim::ListConstruct(%1, %2) %4 : bool = prim::Constant[value=${keepdim}]() %5 : ${dtype} - %6 : Tensor = aten::sum(%0, %3, %4, %5) + %6 : Float(${size}, strides=[${strides}]) = aten::sum(%0, %3, %4, %5) return (%6))IR"; auto a = iotaTensor({2, 3, 2, 3}, TensorOptions(kCPU).dtype(at::kFloat)); @@ -593,13 +619,18 @@ TEST(Kernel, SumMultipleAxes) { env.d("dim2", dim2); env.d("keepdim", keepdim); env.s("dtype", dtypeConstant(ScalarType::None)); + + auto o = at::empty({}, TensorOptions(kCPU)); + auto ref = a.sum(IntArrayRef{dim1, dim2}, /*keepdim=*/keepdim); + + env.s("size", li_to_str(ref.sizes())); + env.s("strides", li_to_str(ref.strides())); + const auto graph_string = format(graph_template, env); auto graph = std::make_shared(); parseIR(graph_string, &*graph); - auto o = at::empty({}, TensorOptions(kCPU)); - auto ref = a.sum(IntArrayRef{dim1, dim2}, /*keepdim=*/keepdim); TensorExprKernel k(graph); std::vector inputs = {a}; Stmt* s = k.getCodeGenStmt(); @@ -636,7 +667,7 @@ TEST(Kernel, Softmax2D) { graph(%0 : Float(5, 3, strides=[3, 1], device=cpu)): %1 : int = prim::Constant[value=${dim}]() %2 : int = prim::Constant[value=7]() - %3 : Tensor = aten::${op}(%0, %1, %2) + %3 : Float(${size}, strides=[${strides}]) = aten::${op}(%0, %1, %2) return (%3))IR"; auto a = at::rand({5, 3}, TensorOptions(kCPU).dtype(at::kFloat)); @@ -657,11 +688,16 @@ TEST(Kernel, Softmax2D) { for (int softmax_dim = 0; softmax_dim < a.dim(); ++softmax_dim) { auto softmax_dim_size = a.sizes()[softmax_dim]; auto other_dim = (softmax_dim + 1) % a.dim(); + auto ref = + log_softmax ? a.log_softmax(softmax_dim) : a.softmax(softmax_dim); KernelScope kernel_scope; TemplateEnv env; env.d("dim", softmax_dim); env.s("op", log_softmax ? 
"log_softmax" : "softmax"); + env.s("size", li_to_str(ref.sizes())); + env.s("strides", li_to_str(ref.strides())); + const auto graph_string = format(graph_template, env); auto graph = std::make_shared(); @@ -685,8 +721,6 @@ TEST(Kernel, Softmax2D) { std::vector stack = fmap(inputs); k.run(stack); auto output = stack[0].toTensor(); - auto ref = - log_softmax ? a.log_softmax(softmax_dim) : a.softmax(softmax_dim); ASSERT_EQ(output.sizes(), ref.sizes()); ASSERT_TRUE(at::allclose(output, ref)); } @@ -698,7 +732,7 @@ TEST(Kernel, Softmax3D) { graph(%0 : Float(3, 4, 5, strides=[20, 5, 1], device=cpu)): %1 : int = prim::Constant[value=${dim}]() %2 : int = prim::Constant[value=7]() - %3 : Tensor = aten::${op}(%0, %1, %2) + %3 : Float(${size}, strides=[${strides}]) = aten::${op}(%0, %1, %2) return (%3))IR"; auto a = at::rand({3, 4, 5}, TensorOptions(kCPU).dtype(at::kFloat)); @@ -727,11 +761,16 @@ TEST(Kernel, Softmax3D) { other_dims.push_back(i); } } + auto ref = + log_softmax ? a.log_softmax(softmax_dim) : a.softmax(softmax_dim); KernelScope kernel_scope; TemplateEnv env; env.d("dim", softmax_dim); env.s("op", log_softmax ? "log_softmax" : "softmax"); + env.s("size", li_to_str(ref.sizes())); + env.s("strides", li_to_str(ref.strides())); + const auto graph_string = format(graph_template, env); auto graph = std::make_shared(); @@ -758,8 +797,6 @@ TEST(Kernel, Softmax3D) { k.run(stack); auto output = stack[0].toTensor(); - auto ref = - log_softmax ? a.log_softmax(softmax_dim) : a.softmax(softmax_dim); ASSERT_EQ(output.sizes(), ref.sizes()); ASSERT_TRUE(at::allclose(output, ref)); } @@ -771,7 +808,7 @@ TEST(Kernel, Softmax4D) { graph(%0 : Float(2, 3, 2, 3, strides=[18, 6, 3, 1], device=cpu)): %1 : int = prim::Constant[value=${dim}]() %2 : int = prim::Constant[value=7]() - %3 : Tensor = aten::${op}(%0, %1, %2) + %3 : Float(${size}, strides=[${strides}]) = aten::${op}(%0, %1, %2) return (%3))IR"; auto a = at::rand({2, 3, 2, 3}, TensorOptions(kCPU).dtype(at::kFloat)); @@ -803,11 +840,16 @@ TEST(Kernel, Softmax4D) { other_dims.push_back(i); } } + auto ref = + log_softmax ? a.log_softmax(softmax_dim) : a.softmax(softmax_dim); KernelScope kernel_scope; TemplateEnv env; env.d("dim", softmax_dim); env.s("op", log_softmax ? "log_softmax" : "softmax"); + env.s("size", li_to_str(ref.sizes())); + env.s("strides", li_to_str(ref.strides())); + const auto graph_string = format(graph_template, env); auto graph = std::make_shared(); @@ -835,15 +877,14 @@ TEST(Kernel, Softmax4D) { std::vector stack = fmap(inputs); k.run(stack); auto output = stack[0].toTensor(); - auto ref = - log_softmax ? a.log_softmax(softmax_dim) : a.softmax(softmax_dim); ASSERT_EQ(output.sizes(), ref.sizes()); ASSERT_TRUE(at::allclose(output, ref)); } } } -TEST(Kernel, InlineProducerIntoReduction) { +TEST(Kernel, DISABLED_InlineProducerIntoReduction) { + // see : [zero-dim tensors] KernelScope kernel_scope; // Inline producer (mul) into reduction (sum). 
@@ -882,7 +923,9 @@ TEST(Kernel, InlineProducerIntoReduction) { ASSERT_TRUE(at::allclose(o, ref)); } -TEST(Kernel, InlineReductionIntoConsumer) { +TEST(Kernel, DISABLED_InlineReductionIntoConsumer) { + // see : [zero-dim tensors] + KernelScope kernel_scope; // Inline producer (mul %2) into reduction (sum %4) but DO NOT diff --git a/torch/csrc/jit/tensorexpr/kernel.cpp b/torch/csrc/jit/tensorexpr/kernel.cpp index 5fe290ac4193..30a3a3e7fe68 100644 --- a/torch/csrc/jit/tensorexpr/kernel.cpp +++ b/torch/csrc/jit/tensorexpr/kernel.cpp @@ -1903,16 +1903,29 @@ void TensorExprKernel::compile() { } } + device_ = *pickDeviceType(graph_->inputs()); + // Move output operands from `tensors_` to `tensorOutputs_` for (const auto& output : graph_->outputs()) { if (!tensors_.count(output->unique())) { throw malformed_input("cannot find output Tensor"); } + auto tensor_sizes = output->type()->expect()->sizes(); + std::vector size; + TORCH_INTERNAL_ASSERT( + tensor_sizes.sizes().has_value(), "Expected output size: ", output); + for (const auto& elem : *tensor_sizes.sizes()) { + TORCH_INTERNAL_ASSERT(elem, "expected all output values defined"); + size.push_back(*elem); + } + tensorOutputSizes_.push_back(size); tensorOutputs_.emplace_back(tensors_.at(output->unique())); + tensorOutputTensorOptions_.push_back( + c10::TensorOptions(tensorType(tensors_[output->unique()])) + .device(device_)); tensors_.erase(output->unique()); } - device_ = *pickDeviceType(graph_->inputs()); BackendType backendType = inferBackendTypeFromDevice(device_); Stmt* stmt = generateStmt(backendType); // Set up formal params (inputs, then outputs) for kernel. @@ -1929,7 +1942,6 @@ void TensorExprKernel::compile() { TensorExprKernel::TensorExprKernel(const std::shared_ptr& subgraph) : graph_(subgraph), code_(subgraph, "") { - allow_fallback_ = fallbackAllowed(); if (!allow_fallback_) { compile(); @@ -1965,47 +1977,23 @@ void TensorExprKernel::run(Stack& stack) { std::vector TensorExprKernel::prepareRunArgs( const at::ArrayRef& inputs, std::vector& outputs) { - std::map varToSize; - std::vector runArgs; - for (size_t i = 0; i < inputs.size(); i++) { + runArgs.reserve(inputs.size() + tensorOutputs_.size()); + + for (size_t i = 0, e = inputs.size(); i < e; i++) { auto const& input = inputs[i]; if (input.isInt()) { runArgs.emplace_back((int32_t)input.toInt()); } else if (input.isDouble()) { runArgs.emplace_back((float)input.toDouble()); } else if (input.isTensor()) { - auto const& tensor = input.toTensor(); - runArgs.emplace_back(tensor.data_ptr()); - for (auto const& size : kernelArgs_[i].sizes()) { - int32_t s = tensor.sizes()[size.idx]; - runArgs.emplace_back(s); - varToSize[size.var.node()] = s; - } - for (auto const& stride : kernelArgs_[i].strides()) { - int32_t s = tensor.strides()[stride.idx]; - runArgs.emplace_back(s); - } + runArgs.emplace_back(input.toTensor().data_ptr()); } } - for (auto& o : tensorOutputs_) { - std::vector tensorSize; - for (const Expr* dim : o->dims()) { - auto it = varToSize.find(dim); - if (it != varToSize.end()) { - tensorSize.push_back(it->second); - } else { - const IntImm* s = dynamic_cast(dim); - if (!s) { - throw malformed_input("output expected Int", dim); - } - tensorSize.push_back(s->value()); - } - } - - outputs.push_back(at::empty( - tensorSize, c10::TensorOptions(tensorType(o)).device(device_))); + for (size_t i = 0, e = tensorOutputs_.size(); i < e; ++i) { + outputs.push_back( + at::empty(tensorOutputSizes_[i], tensorOutputTensorOptions_[i])); runArgs.emplace_back(outputs.back().data_ptr()); } 
return runArgs; diff --git a/torch/csrc/jit/tensorexpr/kernel.h b/torch/csrc/jit/tensorexpr/kernel.h index c176ffd2d3b1..7408060752ee 100644 --- a/torch/csrc/jit/tensorexpr/kernel.h +++ b/torch/csrc/jit/tensorexpr/kernel.h @@ -189,6 +189,8 @@ class TORCH_API TensorExprKernel { int64_t nInputs_ = 0; std::vector kernelArgs_; + std::vector> tensorOutputSizes_; + std::vector tensorOutputTensorOptions_; std::vector tensorOutputs_; std::unordered_map tensors_; std::unordered_map scalars_; From 71ddc0ba19ad0471bce31d334d4885b81317aaf7 Mon Sep 17 00:00:00 2001 From: Elias Ellison Date: Thu, 10 Dec 2020 12:12:56 -0800 Subject: [PATCH 137/250] [TensorExpr Fuser] Add support for nodes which have tensor constant inputs (#47814) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/47814 Previously, we would bail completely if a node had a constant tensor input. This PR adds support for this case by lifting the constant out of the fusion graph after we've done fusion. It might be nice to add support for Tensor Constants in NNC itself, but it looked kind of tricky and this is an easy enough temporary solution. Test Plan: Imported from OSS Reviewed By: bertmaher Differential Revision: D25286215 Pulled By: eellison fbshipit-source-id: 9ff67f92f5a2d43fd3ca087569898666525ca8cf --- test/jit/test_profiler.py | 17 ++++++++++- torch/csrc/jit/passes/tensorexpr_fuser.cpp | 35 +++++++++++++++------- 2 files changed, 41 insertions(+), 11 deletions(-) diff --git a/test/jit/test_profiler.py b/test/jit/test_profiler.py index e42f8225a3d6..e763a730c473 100644 --- a/test/jit/test_profiler.py +++ b/test/jit/test_profiler.py @@ -25,7 +25,8 @@ def setUp(self): self.default_dtype = torch.get_default_dtype() self.old_reduction_enabled = torch._C._jit_set_texpr_reductions_enabled(True) torch.set_default_dtype(torch.double) - + self.old_fusion_inlining = torch._C._debug_get_fusion_group_inlining() + torch._C._debug_set_fusion_group_inlining(False) def tearDown(self): torch._C._jit_set_profiling_executor(self.prev_exec) @@ -35,6 +36,7 @@ def tearDown(self): torch._C._jit_override_can_fuse_on_cpu(self.can_fuse_on_cpu) torch.set_default_dtype(self.default_dtype) torch._C._jit_set_texpr_reductions_enabled(self.old_reduction_enabled) + torch._C._debug_set_fusion_group_inlining(self.old_fusion_inlining) def test_tensor_type_not_determined_by_inputs(self): @torch.jit.script @@ -212,6 +214,19 @@ def foo(a, b): g = torch.jit.last_executed_optimized_graph() FileCheck().check("fallback_function").check_next("CallFunction").run(g) + def test_tensor_constant(self): + def foo(a, b): + return a + b + torch.tensor([2]) + + x = torch.ones(1, requires_grad=False) + foo_script = torch.jit.script(foo) + foo_script(x, x) + foo_script(x, x) + + self.assertEqual(foo_script(x, x), foo(x, x)) + g = torch.jit.last_executed_optimized_graph() + FileCheck().check_count("aten::add", 2, exactly=True).run(g) + def test_iterative_fusion(self): @torch.jit.script def foo(a, b, c, d): diff --git a/torch/csrc/jit/passes/tensorexpr_fuser.cpp b/torch/csrc/jit/passes/tensorexpr_fuser.cpp index 917d88a39605..6f587b910866 100644 --- a/torch/csrc/jit/passes/tensorexpr_fuser.cpp +++ b/torch/csrc/jit/passes/tensorexpr_fuser.cpp @@ -459,7 +459,7 @@ class TensorExprFuser { // fusion is done. 
inlineSmallFusionGroups(graph_->block()); GRAPH_DUMP("After inlining small fusion groups: ", graph_); - guardFusionGroupsAndRemoveOutputs(graph_->block()); + prepareFusionGroupAndGuardOutputs(graph_->block()); GRAPH_DUMP("After guarding fusion groups: ", graph_); removeTensorTypeSpecializations(graph_->block()); GRAPH_DUMP("After removing tensor type specializations: ", graph_); @@ -763,17 +763,10 @@ class TensorExprFuser { } bool canHandle(Node* node) { - REQ(node->kind() != prim::Constant); REQ(disable_shape_checks_ || allShapesAreKnown(node)); REQ(isFusableOnDevice(node)); - // Don't include nodes whose inputs are tensor constants - we cannot handle - // them at the moment. - // TODO: actually support tensor constants and remove this. for (Value* input : node->inputs()) { - if (input->node()->kind() == prim::Constant) { - REQ(!input->type()->cast()) - } if (auto const& tt = input->type()->cast()) { auto st = tt->scalarType(); if (!st) { @@ -975,11 +968,32 @@ class TensorExprFuser { } } - void guardFusionGroupsAndRemoveOutputs(Block* block) { + // TODO: support constant tensors instead of setting them as input + void liftTensorConstantsFromFusionGroups(Node* fusion_group) { + auto subgraph = SubgraphUtils::getSubgraph(fusion_group); + WithInsertPoint guard(fusion_group); + for (auto it = subgraph->block()->nodes().begin(); + it != subgraph->block()->nodes().end(); + ++it) { + auto n = *it; + if (n->kind() == prim::Constant && + n->output()->type()->cast()) { + auto constant = + fusion_group->owningGraph()->insertConstant(*toIValue(n->output())); + fusion_group->addInput(constant); + auto inputToGraph = subgraph->addInput(); + inputToGraph->setType(n->output()->type()); + n->output()->replaceAllUsesWith(inputToGraph); + it.destroyCurrent(); + } + } + } + + void prepareFusionGroupAndGuardOutputs(Block* block) { std::vector fusion_groups; for (Node* n : block->nodes()) { for (Block* b : n->blocks()) { - guardFusionGroupsAndRemoveOutputs(b); + prepareFusionGroupAndGuardOutputs(b); } if (n->kind() == prim::TensorExprGroup) { fusion_groups.push_back(n); @@ -987,6 +1001,7 @@ class TensorExprFuser { } for (Node* fusion_group : fusion_groups) { removeOutputsUsedOnlyInSize(fusion_group); + liftTensorConstantsFromFusionGroups(fusion_group); guardFusionGroup(fusion_group); } } From 0b9d5e65e40fd3fdced0957f8ccf2678f4d673d9 Mon Sep 17 00:00:00 2001 From: Elias Ellison Date: Thu, 10 Dec 2020 12:12:56 -0800 Subject: [PATCH 138/250] Remove inferred from tensor type ctors (#48263) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48263 The inferred type is only used once in `getInferred` and is confusing next to the other parameters. It has nothing to do with runtime values, it just means the type was inferred in type-checking. There are a bunch of parameters and overloads of Tensor instantiation as is. 
Test Plan: Imported from OSS Reviewed By: bertmaher Differential Revision: D25286211 Pulled By: eellison fbshipit-source-id: 3dfc44ab7ff4fbf0ef286ae8716a4afac646804b --- aten/src/ATen/core/jit_type.h | 14 ++++++++------ aten/src/ATen/core/type.cpp | 6 ++---- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/aten/src/ATen/core/jit_type.h b/aten/src/ATen/core/jit_type.h index 40c2ec7f443d..577e539b935c 100644 --- a/aten/src/ATen/core/jit_type.h +++ b/aten/src/ATen/core/jit_type.h @@ -630,8 +630,7 @@ struct CAFFE2_API TensorType : public Type { const SymbolicShape& sizes, const VaryingShape& stride_, c10::optional requires_grad, - c10::optional undefined = false, - bool is_inferred = false); + c10::optional undefined = false); static TensorTypePtr create( c10::optional scalar_type, @@ -776,10 +775,13 @@ struct CAFFE2_API TensorType : public Type { static TensorTypePtr getInferred() { static auto valueInferred = TensorType::create( - /*scalar_type=*/{}, /*device=*/{}, - /*sizes=*/SymbolicShape(), - /*stride=*/VaryingShape{}, /*requires_grad=*/{}, - /*undefined=*/false, /*is_inferred=*/true); + /*scalar_type=*/{}, + /*device=*/{}, + /*sizes=*/SymbolicShape(), + /*stride=*/VaryingShape{}, + /*requires_grad=*/{}, + /*undefined=*/false); + valueInferred->is_inferred_ = true; return valueInferred; } diff --git a/aten/src/ATen/core/type.cpp b/aten/src/ATen/core/type.cpp index 276e3a6838a3..429007e4242b 100644 --- a/aten/src/ATen/core/type.cpp +++ b/aten/src/ATen/core/type.cpp @@ -978,11 +978,9 @@ TensorTypePtr TensorType::create( const SymbolicShape& sizes, const VaryingShape& strides, c10::optional requires_grad, - c10::optional undefined, - bool is_inferred) { - auto pt = TensorTypePtr(new TensorType( + c10::optional undefined) { + auto pt = TensorTypePtr(new TensorType( scalar_type, device, sizes, strides, requires_grad, undefined)); - pt->is_inferred_ = is_inferred; return pt; } From 3b57be176e180d8aa75b1bdbf51e58408255c96e Mon Sep 17 00:00:00 2001 From: Elias Ellison Date: Thu, 10 Dec 2020 12:12:56 -0800 Subject: [PATCH 139/250] [NNC] Preserve strided output (#48264) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48264 Preserves the strided representation of NNC Tensor outputs by transforming them into the right layout at the end of the kernel. 
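To make the index remapping concrete, here is a plain-Python sketch (not the NNC implementation itself; variable names are illustrative) of the arithmetic the kernel performs, using the (2, 3) output with strides (1, 2) that the new `test_strided_output_preserved` test exercises. The internally contiguous result is reshuffled so that the destination buffer, when viewed with the profiled strides, reproduces the same values:

```
import torch

sizes, strides = (2, 3), (1, 2)            # dense, non-overlapping output layout
default_strides = (3, 1)                   # contiguous strides assumed internally
contig = torch.arange(6.).reshape(sizes)   # stand-in for the kernel's contiguous result

storage = torch.empty(6)
# Decompose the flat position by visiting output strides from largest to smallest.
order = sorted(range(len(strides)), key=lambda k: strides[k], reverse=True)
for i in range(sizes[0]):
    for j in range(sizes[1]):
        flat = default_strides[0] * i + default_strides[1] * j   # where [i, j] is stored
        absolute, new_axes = flat, [0, 0]
        for k in order:
            new_axes[k], absolute = divmod(absolute, strides[k])
        storage[flat] = contig[tuple(new_axes)]                  # value that must live there

out = torch.as_strided(storage, sizes, strides)
assert torch.equal(out, contig)            # the strided view matches the contiguous result
```

If the profiled strides are not dense and non-overlapping, the contiguous layout is kept instead, which is the fallback visible in the kernel.cpp changes below.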
Fix for https://github.com/pytorch/pytorch/issues/45604 Test Plan: Imported from OSS Reviewed By: nikithamalgifb Differential Revision: D25286213 Pulled By: eellison fbshipit-source-id: 64d94ac463741e2568a1c9d44174e15ea26e511f --- aten/src/ATen/core/jit_type.h | 22 ++--- test/cpp/tensorexpr/test_kernel.cpp | 2 - test/test_jit_fuser_te.py | 13 +-- test/test_tensorexpr.py | 33 ++++++++ torch/csrc/jit/tensorexpr/kernel.cpp | 116 ++++++++++++++++++++++++--- torch/csrc/jit/tensorexpr/kernel.h | 3 + 6 files changed, 160 insertions(+), 29 deletions(-) diff --git a/aten/src/ATen/core/jit_type.h b/aten/src/ATen/core/jit_type.h index 577e539b935c..1736ea91d71e 100644 --- a/aten/src/ATen/core/jit_type.h +++ b/aten/src/ATen/core/jit_type.h @@ -810,6 +810,17 @@ struct CAFFE2_API TensorType : public Type { static const TypeKind Kind = TypeKind::TensorType; + static std::vector contiguousStridesOf(at::IntArrayRef sizes) { + std::vector strides(sizes.size()); + if (sizes.empty()) // zero-dim case + return strides; + strides.back() = 1; + for (size_t i = strides.size() - 1; i > 0; i--) { + strides[i - 1] = strides[i] * sizes[i]; + } + return strides; + } + private: TensorType( c10::optional scalar_type, @@ -824,17 +835,6 @@ struct CAFFE2_API TensorType : public Type { scalar_type_, device_, sizes_, strides_, requires_grad_, undefined_)); } - static std::vector contiguousStridesOf(at::IntArrayRef sizes) { - std::vector strides(sizes.size()); - if (sizes.empty()) // zero-dim case - return strides; - strides.back() = 1; - for (size_t i = strides.size() - 1; i > 0; i--) { - strides[i - 1] = strides[i] * sizes[i]; - } - return strides; - } - static VaryingShape computeStrideProps( at::IntArrayRef sizes, at::IntArrayRef strides, diff --git a/test/cpp/tensorexpr/test_kernel.cpp b/test/cpp/tensorexpr/test_kernel.cpp index 26e9e3326f70..895b025ac4e0 100644 --- a/test/cpp/tensorexpr/test_kernel.cpp +++ b/test/cpp/tensorexpr/test_kernel.cpp @@ -619,7 +619,6 @@ TEST(Kernel, SumMultipleAxes) { env.d("dim2", dim2); env.d("keepdim", keepdim); env.s("dtype", dtypeConstant(ScalarType::None)); - auto o = at::empty({}, TensorOptions(kCPU)); auto ref = a.sum(IntArrayRef{dim1, dim2}, /*keepdim=*/keepdim); @@ -690,7 +689,6 @@ TEST(Kernel, Softmax2D) { auto other_dim = (softmax_dim + 1) % a.dim(); auto ref = log_softmax ? 
a.log_softmax(softmax_dim) : a.softmax(softmax_dim); - KernelScope kernel_scope; TemplateEnv env; env.d("dim", softmax_dim); diff --git a/test/test_jit_fuser_te.py b/test/test_jit_fuser_te.py index bd5f7ae3af6e..f0552f0f7a36 100644 --- a/test/test_jit_fuser_te.py +++ b/test/test_jit_fuser_te.py @@ -1694,13 +1694,14 @@ def eager(t1, t2, t3, t4, t: float): t = torch.rand(8, dtype=torch.float, device='cuda') scripted = self.checkScript(eager, (t, t, t, t, 0.1)) - @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") def test_chunk_mul_one(self): - def eager(x): - z, y, w = torch.chunk(x, 3, -1) - return z * 3, y, w - x = torch.rand(64, 1, 3072, dtype=torch.float, device='cuda') - script = self.checkScript(eager, (x,)) + for device in self.devices: + def eager(x): + z, y, w = torch.chunk(x, 3, -1) + return z * 3, y, w + x = torch.rand(64, 1, 3072, dtype=torch.float, device=device) + z, y, w = eager(x) + script = self.checkScript(eager, (x,)) @unittest.skipIf(not RUN_CUDA, "fuser requires CUDA") def test_eq_unsqueeze_type_as(self): diff --git a/test/test_tensorexpr.py b/test/test_tensorexpr.py index 6ab7899025c7..eada68c9ff92 100644 --- a/test/test_tensorexpr.py +++ b/test/test_tensorexpr.py @@ -1489,6 +1489,39 @@ def simple(a, b): torch._C._jit_set_te_generate_block_code(val) torch._C._jit_texpr_set_fallback_allowed(fall_bk) + def test_strided_output_preserved(self): + def foo(a, b): + return a + b - a + + # smaller, easier to debug example + x = torch.arange(6) + x = torch.as_strided(x, (2, 3), (1, 2)) + total = 0 + for i in range(2): + for j in range(3): + x[i, j] = total + total += 1 + foo_script = torch.jit.script(foo) + foo_script(x, x) + foo_script(x, x) + out_s = foo_script(x, x) + out_eager = foo(x, x) + self.assertEqual(out_s, out_eager) + self.assertEqual(out_s.stride(), out_eager.stride()) + self.assertLastGraphAllFused() + + # more dims + N, C, H, W, = 2, 3, 4, 5 + x = torch.rand(N, C, H, W).to(memory_format=torch.channels_last) + foo_script = torch.jit.script(foo) + foo_script(x, x) + foo_script(x, x) + out_s = foo_script(x, x) + out_eager = foo(x, x) + self.assertEqual(out_s, out_eager) + self.assertEqual(out_s.stride(), out_eager.stride()) + self.assertLastGraphAllFused() + def test_alias_analysis_module(self): class AliasModule(nn.Module): def __init__(self): diff --git a/torch/csrc/jit/tensorexpr/kernel.cpp b/torch/csrc/jit/tensorexpr/kernel.cpp index 30a3a3e7fe68..f42983d9499c 100644 --- a/torch/csrc/jit/tensorexpr/kernel.cpp +++ b/torch/csrc/jit/tensorexpr/kernel.cpp @@ -1,5 +1,6 @@ #include +#include #include #include #include @@ -7,6 +8,7 @@ #include #include #include +#include using namespace torch::jit; using namespace torch::jit::tensorexpr; @@ -1876,6 +1878,88 @@ std::vector TensorExprKernel::getReductionAxes( return axes; } +template +std::vector reverse_sort_indices(const std::vector& v) { + // initialize original index locations + std::vector idx(v.size()); + iota(idx.begin(), idx.end(), 0); + + std::sort(idx.begin(), idx.end(), [&v](size_t i1, size_t i2) { + return v[i1] > v[i2]; + }); + return idx; +} + +bool denseAndNonOverlapping( + at::ArrayRef sizes, + at::ArrayRef strides) { + return (strides == at::infer_dense_strides(sizes, strides)); +} + +Tensor* TensorExprKernel::convertOutputToCorrectStrides(torch::jit::Value* v) { + const TensorTypePtr& tt = v->type()->expect(); + TORCH_INTERNAL_ASSERT(tensors_.count(v->unique())); + Tensor* tensor = tensors_[v->unique()]; + + TORCH_INTERNAL_ASSERT(tt->sizes().concrete_sizes()); + const auto sizes = 
*tt->sizes().concrete_sizes(); + std::vector default_strides = TensorType::contiguousStridesOf(sizes); + TORCH_INTERNAL_ASSERT(tt->strides().concrete_sizes()); + const std::vector strides = *tt->strides().concrete_sizes(); + // All Tensors in NNC are layed out in default, contiguous layout. + // If the output is also default contiguous we don't need to do anything + if (strides == default_strides) { + return tensor; + } + // If the tensor is not dense or overlaps, we have + // no way of matching the profiled striding + if (!denseAndNonOverlapping(sizes, strides)) { + return tensor; + } + + auto dims = dimsFromSizes(sizesForValue(v)); + // We need to convert the output tensor so that its values are layed + // so that whene viewed from the output strides the values are correct. + // A contiguous Tensor of size(2, 3) with values 0-5 is layed out as: + // [0] [1] [2] [3] [4] [5] + // The same valued tensor with strides (2, 1) would be layed out like + // [0] [3] [1] [4] [2] [5] + // When we are doing the re-ordering of values into the output tensor, + // we are iterating per-element of the input, ad we are fixed + // in indexing in to the output tensor at [i, j] = val + // `val` we want here is equal to the indices for the output + // tensor that would have given the same position as the output + // The position is equal to the sum of stride[i] * index[i], + // and we can can calculate the equivalent indices in the + // output tensor strides by iteratively computing the index of + // the biggest stride: + // absolute = ... + // for stride in strides_from_largest_to_smallest: + // cur_idx = absolute // stride + // absolute = absolute % stride + + return Compute( + "output_1", dims, [&](const std::vector& axes_input) { + std::vector axes(axes_input.begin(), axes_input.end()); + auto absolute_position = IntImm::make(0); + for (size_t i = 0; i < axes.size(); ++i) { + absolute_position = + absolute_position + (IntImm::make(default_strides[i]) * axes[i]); + } + std::vector sorted_stride_indices = + reverse_sort_indices(strides); + std::vector new_axes(sorted_stride_indices.size()); + for (size_t stride_index : sorted_stride_indices) { + auto stride = strides[stride_index]; + auto index = Div::make(absolute_position, IntImm::make(stride)); + absolute_position = + Mod::make(absolute_position, IntImm::make(stride)); + new_axes[stride_index] = index; + } + return tensor->call(new_axes); + }); +} + void TensorExprKernel::compile() { KernelScope kernelScope(&kernelArena_); GRAPH_DUMP("TensorExprKernel graph:", graph_); @@ -1910,15 +1994,24 @@ void TensorExprKernel::compile() { if (!tensors_.count(output->unique())) { throw malformed_input("cannot find output Tensor"); } - auto tensor_sizes = output->type()->expect()->sizes(); - std::vector size; - TORCH_INTERNAL_ASSERT( - tensor_sizes.sizes().has_value(), "Expected output size: ", output); - for (const auto& elem : *tensor_sizes.sizes()) { - TORCH_INTERNAL_ASSERT(elem, "expected all output values defined"); - size.push_back(*elem); + // The "strided" tensor will be incorrect if used in NNC, + // since NNC views it as contiguous. 
Only convert it to the right + // strides at the end of the kernel (if already contiguous it's a no-op) + Tensor* properly_strided_output = convertOutputToCorrectStrides(output); + tensors_[output->unique()] = properly_strided_output; + const auto& tt = output->type()->expect(); + auto sizes = *tt->sizes().concrete_sizes(); + tensorOutputSizes_.push_back(sizes); + auto strides = *tt->strides().concrete_sizes(); + + // If the tensor is not dense or overlaps, we have + // no way of matching the profiled striding + if (denseAndNonOverlapping(sizes, strides)) { + tensorOutputStrides_.push_back(*tt->strides().concrete_sizes()); + } else { + tensorOutputStrides_.push_back(TensorType::contiguousStridesOf(sizes)); } - tensorOutputSizes_.push_back(size); + tensorOutputs_.emplace_back(tensors_.at(output->unique())); tensorOutputTensorOptions_.push_back( c10::TensorOptions(tensorType(tensors_[output->unique()])) @@ -1992,8 +2085,11 @@ std::vector TensorExprKernel::prepareRunArgs( } for (size_t i = 0, e = tensorOutputs_.size(); i < e; ++i) { - outputs.push_back( - at::empty(tensorOutputSizes_[i], tensorOutputTensorOptions_[i])); + auto t = at::empty_strided( + tensorOutputSizes_[i], + tensorOutputStrides_[i], + tensorOutputTensorOptions_[i]); + outputs.push_back(t); runArgs.emplace_back(outputs.back().data_ptr()); } return runArgs; diff --git a/torch/csrc/jit/tensorexpr/kernel.h b/torch/csrc/jit/tensorexpr/kernel.h index 7408060752ee..e3a7e9e32ca6 100644 --- a/torch/csrc/jit/tensorexpr/kernel.h +++ b/torch/csrc/jit/tensorexpr/kernel.h @@ -137,6 +137,8 @@ class TORCH_API TensorExprKernel { void bindInput(const torch::jit::Value* input); + Tensor* convertOutputToCorrectStrides(torch::jit::Value* v); + // Captures the information for reduction operation nodes. struct ReductionInfo { std::vector reductionDims; @@ -190,6 +192,7 @@ class TORCH_API TensorExprKernel { int64_t nInputs_ = 0; std::vector kernelArgs_; std::vector> tensorOutputSizes_; + std::vector> tensorOutputStrides_; std::vector tensorOutputTensorOptions_; std::vector tensorOutputs_; std::unordered_map tensors_; From 0c70585505a5621c9fafd35277361a317bd0b1bc Mon Sep 17 00:00:00 2001 From: Ryan Hileman Date: Thu, 10 Dec 2020 12:19:27 -0800 Subject: [PATCH 140/250] fix #49064 (invalid escape) by using raw strings (#49065) Summary: Fixes https://github.com/pytorch/pytorch/issues/49064 by using raw strings I removed `# noqa: W605` because that's the "invalid escape sequence" check: https://www.flake8rules.com/rules/W605.html I wrote a quick test to make sure the strings are the same before and after this PR. This block should print `True` (it does for me). ``` convolution_notes1 = \ {"groups_note": r"""* :attr:`groups` controls the connections between inputs and outputs. :attr:`in_channels` and :attr:`out_channels` must both be divisible by :attr:`groups`. For example, * At groups=1, all inputs are convolved to all outputs. * At groups=2, the operation becomes equivalent to having two conv layers side by side, each seeing half the input channels and producing half the output channels, and both subsequently concatenated. * At groups= :attr:`in_channels`, each input channel is convolved with its own set of filters (of size :math:`\frac{\text{out\_channels}}{\text{in\_channels}}`).""", "depthwise_separable_note": r"""When `groups == in_channels` and `out_channels == K * in_channels`, where `K` is a positive integer, this operation is also known as a "depthwise convolution". 
In other words, for an input of size :math:`(N, C_{in}, L_{in})`, a depthwise convolution with a depthwise multiplier `K` can be performed with the arguments :math:`(C_\text{in}=C_\text{in}, C_\text{out}=C_\text{in} \times \text{K}, ..., \text{groups}=C_\text{in})`."""} # noqa: B950 convolution_notes2 = \ {"groups_note": """* :attr:`groups` controls the connections between inputs and outputs. :attr:`in_channels` and :attr:`out_channels` must both be divisible by :attr:`groups`. For example, * At groups=1, all inputs are convolved to all outputs. * At groups=2, the operation becomes equivalent to having two conv layers side by side, each seeing half the input channels and producing half the output channels, and both subsequently concatenated. * At groups= :attr:`in_channels`, each input channel is convolved with its own set of filters (of size :math:`\\frac{\\text{out\_channels}}{\\text{in\_channels}}`).""", # noqa: W605 "depthwise_separable_note": """When `groups == in_channels` and `out_channels == K * in_channels`, where `K` is a positive integer, this operation is also known as a "depthwise convolution". In other words, for an input of size :math:`(N, C_{in}, L_{in})`, a depthwise convolution with a depthwise multiplier `K` can be performed with the arguments :math:`(C_\\text{in}=C_\\text{in}, C_\\text{out}=C_\\text{in} \\times \\text{K}, ..., \\text{groups}=C_\\text{in})`."""} # noqa: W605,B950 print(convolution_notes1 == convolution_notes2) ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/49065 Reviewed By: agolynski Differential Revision: D25464507 Pulled By: H-Huang fbshipit-source-id: 88a65a24e3cc29774af25e09823257b2136550fe --- torch/nn/modules/conv.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/torch/nn/modules/conv.py b/torch/nn/modules/conv.py index 33f2a84aed74..f22c35fa39ff 100644 --- a/torch/nn/modules/conv.py +++ b/torch/nn/modules/conv.py @@ -16,7 +16,7 @@ from typing import Optional, List, Tuple convolution_notes = \ - {"groups_note": """* :attr:`groups` controls the connections between inputs and outputs. + {"groups_note": r"""* :attr:`groups` controls the connections between inputs and outputs. :attr:`in_channels` and :attr:`out_channels` must both be divisible by :attr:`groups`. For example, @@ -27,14 +27,14 @@ concatenated. * At groups= :attr:`in_channels`, each input channel is convolved with its own set of filters (of size - :math:`\\frac{\\text{out\_channels}}{\\text{in\_channels}}`).""", # noqa: W605 + :math:`\frac{\text{out\_channels}}{\text{in\_channels}}`).""", - "depthwise_separable_note": """When `groups == in_channels` and `out_channels == K * in_channels`, + "depthwise_separable_note": r"""When `groups == in_channels` and `out_channels == K * in_channels`, where `K` is a positive integer, this operation is also known as a "depthwise convolution". 
In other words, for an input of size :math:`(N, C_{in}, L_{in})`, a depthwise convolution with a depthwise multiplier `K` can be performed with the arguments - :math:`(C_\\text{in}=C_\\text{in}, C_\\text{out}=C_\\text{in} \\times \\text{K}, ..., \\text{groups}=C_\\text{in})`."""} # noqa: W605,B950 + :math:`(C_\text{in}=C_\text{in}, C_\text{out}=C_\text{in} \times \text{K}, ..., \text{groups}=C_\text{in})`."""} # noqa: B950 From e1c1a7e96421735f1c6397de1e32e9986bd839d0 Mon Sep 17 00:00:00 2001 From: shubhambhokare1 Date: Thu, 10 Dec 2020 12:29:30 -0800 Subject: [PATCH 141/250] [ONNX] Changes to export API to better handle named arguments (#47367) Summary: The args parameter of ONNX export is changed to better support optional arguments such that args is represented as: args (tuple of arguments or torch.Tensor, a dictionary consisting of named arguments (optional)): a dictionary to specify the input to the corresponding named parameter: - KEY: str, named parameter - VALUE: corresponding input Pull Request resolved: https://github.com/pytorch/pytorch/pull/47367 Reviewed By: H-Huang Differential Revision: D25432691 Pulled By: bzinodev fbshipit-source-id: 9d4cba73cbf7bef256351f181f9ac5434b77eee8 --- docs/source/onnx.rst | 65 ++++++++++++ test/onnx/test_operators.py | 4 +- test/onnx/test_pytorch_onnx_onnxruntime.py | 109 ++++++++++++++++++++- torch/onnx/__init__.py | 64 ++++++++++-- torch/onnx/utils.py | 32 ++++++ 5 files changed, 258 insertions(+), 16 deletions(-) diff --git a/docs/source/onnx.rst b/docs/source/onnx.rst index 9dc107d86267..49bbc1df45a0 100644 --- a/docs/source/onnx.rst +++ b/docs/source/onnx.rst @@ -274,6 +274,71 @@ In addition, Dropout layer need defined in init function so that inferencing can def forward(self, x): x = self.dropout(x) +Using dictionaries to handle Named Arguments as model inputs +------------------------------------------------------------ + +There are two ways to handle models which consist of named parameters or keyword arguments as inputs: + +* The first method is to pass all the inputs in the same order as required by the model and pass None + values for the keyword arguments that do not require a value to be passed + +* The second and more intuitive method is to represent the keyword arguments as key-value pairs where + the key represents the name of the argument in the model signature and the value represents the value + of the argument to be passed + +For example, in the model: :: + + class Model(torch.nn.Module): + def forward(self, x, y=None, z=None): + if y is not None: + return x + y + if z is not None: + return x + z + return x + m = Model() + x = torch.randn(2, 3) + z = torch.randn(2, 3) + +There are two ways of exporting the model: + +* Not using a dictionary for the keyword arguments and passing all the inputs in the same order + as required by the model :: + + torch.onnx.export(model, (x, None, z), ‘test.onnx’) + +* Using a dictionary to represent the keyword arguments. This dictionary is always passed in + addition to the non-keyword arguments and is always the last argument in the args tuple. :: + + torch.onnx.export(model, (x, {'y': None, 'z': z}), ‘test.onnx’) + +For cases in which there are no keyword arguments, models can be exported with either an +empty or no dictionary. For example, :: + + torch.onnx.export(model, (x, {}), ‘test.onnx’) + or + torch.onnx.export(model, (x, ), ‘test.onnx’) + +An exception to this rule are cases in which the last input is also of a dictionary type. 
+In these cases it is mandatory to have an empty dictionary as the last argument in the +args tuple. For example, :: + + class Model(torch.nn.Module): + def forward(self, k, x): + ... + return x + m = Model() + k = torch.randn(2, 3)   + x = {torch.tensor(1.): torch.randn(2, 3)} + +Without the presence of the empty dictionary, the export call assumes that the +‘x’ input is intended to represent the optional dictionary consisting of named arguments. +In order to prevent this from being an issue a constraint is placed to provide an empty +dictionary as the last input in the tuple args in such cases. +The new call would look like this. :: + + torch.onnx.export(model, (k, x, {}), ‘test.onnx’) + + Indexing -------- diff --git a/test/onnx/test_operators.py b/test/onnx/test_operators.py index 8ccf0fdfdb89..f6fa533d7837 100644 --- a/test/onnx/test_operators.py +++ b/test/onnx/test_operators.py @@ -776,7 +776,7 @@ def forward(self, x_in): return x_out x = {torch.tensor(1.): torch.randn(1, 2, 3)} - self.assertONNX(MyModel(), (x,)) + self.assertONNX(MyModel(), (x, {})) def test_dict_str(self): class MyModel(torch.nn.Module): @@ -786,7 +786,7 @@ def forward(self, x_in): return x_out x = {"test_key_in": torch.randn(1, 2, 3)} - self.assertONNX(MyModel(), (x,)) + self.assertONNX(MyModel(), (x, {})) def test_arange_dynamic(self): class TestModel(torch.nn.Module): diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py index 61c0fd9dc384..c481d58e4bb5 100644 --- a/test/onnx/test_pytorch_onnx_onnxruntime.py +++ b/test/onnx/test_pytorch_onnx_onnxruntime.py @@ -81,14 +81,19 @@ def run_model_test(self, model, batch_size=2, state_dict=None, if input is None: input = torch.randn(batch_size, 3, 224, 224, requires_grad=True) - with torch.no_grad(): if isinstance(input, torch.Tensor): input = (input,) # In-place operators will update input tensor data as well. # Thus inputs are replicated before every forward call. 
- input_copy = copy.deepcopy(input) - output = model(*input_copy) + if isinstance(input, dict): + input = (input,) + input_args = copy.deepcopy(input) + input_kwargs = {} + if isinstance(input_args[-1], dict): + input_kwargs = input_args[-1] + input_args = input_args[:-1] + output = model(*input_args, **input_kwargs) if isinstance(output, torch.Tensor): output = (output,) @@ -459,7 +464,7 @@ def forward(self, x_in): return x_out x = {torch.tensor(1.): torch.randn(1, 2, 3)} - self.run_test(MyModel(), (x,)) + self.run_test(MyModel(), (x, {})) @disableScriptTest() def test_dict_str(self): @@ -470,7 +475,101 @@ def forward(self, x_in): return x_out x = {"test_key_in": torch.randn(1, 2, 3)} - self.run_test(MyModel(), (x,)) + self.run_test(MyModel(), (x, {})) + + def test_optional_inputs_with_no_optionals(self): + class NoOptionalModel(torch.nn.Module): + def forward(self, input): + return input + + # Without empty optional arguments dictionary + x = torch.randn(2, 3) + self.run_test(NoOptionalModel(), (x,)) + # With empty optional arguments dictionary + y = torch.randn(2, 3) + self.run_test(NoOptionalModel(), (y, {})) + + def test_optional_inputs_with_mixed_optionals(self): + class MixedModel(torch.nn.Module): + def forward(self, x, y=None, z=None): + if y is not None: + return x + y + if z is not None: + return x + z + return x + + x = torch.randn(2, 3) + y = torch.randn(2, 3) + z = torch.randn(2, 3) + # Without optional arguments dictionary + self.run_test(MixedModel(), (x, y, None)) + self.run_test(MixedModel(), (x, None, z)) + # With optional arguments dictionary + self.run_test(MixedModel(), (x, {'y': y, 'z': None})) + self.run_test(MixedModel(), (x, {'y': None, 'z': z})) + self.run_test(MixedModel(), (x, {'z': z})) + self.run_test(MixedModel(), (x, {'y': y})) + + def test_optional_inputs_with_all_optionals(self): + class AllOptionalModel(torch.nn.Module): + def forward(self, y=None, z=None): + if y is not None: + return y + if z is not None: + return z + + y = torch.randn(2, 3) + # Without optional arguments dictionary + self.run_test(AllOptionalModel(), (y, None)) + # With optional arguments dictionary + self.run_test(AllOptionalModel(), {'y': y, 'z': None}) + + def test_input_names_with_optional_args(self): + class NoOptionalModel(torch.nn.Module): + def forward(self, input): + return input + + # Without empty optional arguments dictionary + x = torch.randn(2, 3) + self.run_test(NoOptionalModel(), (x,), input_names=['input_x']) + # With empty optional arguments dictionary + y = torch.randn(2, 3) + self.run_test(NoOptionalModel(), (y, {})) + + class MixedModel(torch.nn.Module): + def forward(self, x, y=None, z=None): + if y is not None: + return x + y + if z is not None: + return x + z + return x + + x = torch.randn(2, 3) + y = torch.randn(2, 3) + z = torch.randn(2, 3) + # Without optional arguments dictionary + self.run_test(MixedModel(), (x, y, None), input_names=['input_x', 'input_y']) + self.run_test(MixedModel(), (x, None, z), input_names=['input_x', 'input_z']) + + # With optional arguments dictionary + self.run_test(MixedModel(), (x, {'y': y, 'z': None}), input_names=['input_x', 'input_y']) + self.run_test(MixedModel(), (x, {'y': None, 'z': z}), input_names=['input_x', 'input_z']) + + class AllOptionalModel(torch.nn.Module): + def forward(self, y=None, z=None): + if y is not None: + return y + if z is not None: + return z + + y = torch.randn(2, 3) + z = torch.randn(2, 3) + # Without optional arguments dictionary + self.run_test(AllOptionalModel(), (y, None), 
input_names=['input_y']) + self.run_test(AllOptionalModel(), (None, z), input_names=['input_z']) + # With optional arguments dictionary + self.run_test(AllOptionalModel(), {'y': y, 'z': None}, input_names=['input_y']) + self.run_test(AllOptionalModel(), {'y': None, 'z': z}, input_names=['input_z']) @disableScriptTest() def test_none_as_input(self): diff --git a/torch/onnx/__init__.py b/torch/onnx/__init__.py index 67cb0e1f5dc2..d2d7e5591fb7 100644 --- a/torch/onnx/__init__.py +++ b/torch/onnx/__init__.py @@ -42,15 +42,61 @@ def export(model, args, f, export_params=True, verbose=False, training=TrainingM Arguments: model (torch.nn.Module): the model to be exported. - args (tuple of arguments or torch.Tensor): the inputs to - the model, e.g., such that ``model(*args)`` is a valid - invocation of the model. Any non-Tensor arguments (including None) will - be hard-coded into the exported model; any Tensor arguments - will become inputs of the exported model, in the order they - occur in args. If args is a Tensor, this is equivalent - to having called it with a 1-ary tuple of that Tensor. - (Note: passing keyword arguments to the model is not currently - supported. Give us a shout if you need it.) + args (tuple of arguments or torch.Tensor, a dictionary consisting of named arguments (optional)): + a dictionary to specify the input to the corresponding named parameter: + - KEY: str, named parameter + - VALUE: corresponding input + args can be structured either as: + + 1. ONLY A TUPLE OF ARGUMENTS or torch.Tensor:: + + ‘’args = (x, y, z)’' + + The inputs to the model, e.g., such that ``model(*args)`` is a valid invocation + of the model. Any non-Tensor arguments will be hard-coded into the exported model; + any Tensor arguments will become inputs of the exported model, in the order they + occur in args. If args is a Tensor, this is equivalent to having + called it with a 1-ary tuple of that Tensor. + + 2. A TUPLE OF ARGUEMENTS WITH A DICTIONARY OF NAMED PARAMETERS:: + + ‘’args = (x, + { + ‘y’: input_y, + ‘z’: input_z + }) ‘’ + + The inputs to the model are structured as a tuple consisting of + non-keyword arguments and the last value of this tuple being a dictionary + consisting of named parameters and the corresponding inputs as key-value pairs. + If certain named argument is not present in the dictionary, it is assigned + the default value, or None if default value is not provided. + + Cases in which an dictionary input is the last input of the args tuple + would cause a conflict when a dictionary of named parameters is used. + The model below provides such an example. + + class Model(torch.nn.Module): + def forward(self, k, x): + ... + return x + + m = Model() + k = torch.randn(2, 3)   + x = {torch.tensor(1.): torch.randn(2, 3)} + + In the previous iteration, the call to export API would look like + + torch.onnx.export(model, (k, x), ‘test.onnx’) + + This would work as intended. However, the export function + would now assume that the ‘x’ input is intended to represent the optional + dictionary consisting of named arguments. In order to prevent this from being + an issue a constraint is placed to provide an empty dictionary as the last + input in the tuple args in such cases. The new call would look like this. + + torch.onnx.export(model, (k, x, {}), ‘test.onnx’) + f: a file-like object (has to implement fileno that returns a file descriptor) or a string containing a file name. A binary Protobuf will be written to this file. 
diff --git a/torch/onnx/utils.py b/torch/onnx/utils.py index 3fe19a56c124..479f874819f2 100644 --- a/torch/onnx/utils.py +++ b/torch/onnx/utils.py @@ -317,6 +317,35 @@ def _decide_external_data_format(use_external_data_format, operator_export_type, model_file_location = f if val_use_external_data_format and isinstance(f, str) else str() return val_use_external_data_format, model_file_location +def _decide_input_format(model, args): + import inspect + try: + sig = inspect.signature(model.forward) + ordered_list_keys = list(sig.parameters.keys()) + if isinstance(args[-1], dict): + args_dict = args[-1] + args = list(args)[:-1] + n_nonkeyword = len(args) + for optional_arg in ordered_list_keys[n_nonkeyword:]: + if optional_arg in args_dict: + args.append(args_dict[optional_arg]) + # Check if this arg has a default value + else: + param = sig.parameters[optional_arg] + if param.default is param.empty: + args.append(None) + else: + args.append(param.default) + args = tuple(args) + return args + # Cases of models without forward functions and dict inputs + except AttributeError: + warnings.warn("Model has no forward function") + return args + # Cases of models with no input args + except IndexError: + warnings.warn("No input args") + return args def _trace(func, args, operator_export_type, return_outs=False): # Special case for common case of passing a single Tensor @@ -514,6 +543,7 @@ def _export_to_pretty_string(model, args, f, export_params=True, verbose=False, opset_version) val_add_node_names = _decide_add_node_names(add_node_names, operator_export_type) val_do_constant_folding = _decide_constant_folding(do_constant_folding, operator_export_type, training) + args = _decide_input_format(model, args) graph, params_dict, torch_out = _model_to_graph(model, args, verbose, input_names, output_names, operator_export_type, example_outputs, _retain_param_name, @@ -564,6 +594,7 @@ def _find_missing_ops_onnx_export(model, args, f, verbose=False, training=Traini # in ONNX, fall through will occur and export the operator as is, as a custom ONNX op. operator_export_type = OperatorExportTypes.ONNX_FALLTHROUGH with select_model_mode_for_export(model, training): + args = _decide_input_format(model, args) graph, params_dict, torch_out = _model_to_graph(model, args, verbose, input_names, output_names, operator_export_type) # The output 'unsupported_ops' will contain the names of all the ops that are not supported in ONNX @@ -629,6 +660,7 @@ def _export(model, args, f, export_params=True, verbose=False, training=None, val_use_external_data_format, model_file_location = _decide_external_data_format(use_external_data_format, operator_export_type, f) + args = _decide_input_format(model, args) if dynamic_axes is None: dynamic_axes = {} _validate_dynamic_axes(dynamic_axes, model, input_names, output_names) From 21dba8c1ad3fd1cb95baed61fb4f89d0d4a9ebdb Mon Sep 17 00:00:00 2001 From: Sebastian Messmer Date: Thu, 10 Dec 2020 12:51:13 -0800 Subject: [PATCH 142/250] Make aten::div.out c10-full (#47793) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/47793 This migrates aten::div.out to be c10-full (without hacky wrapper) and fixes everything that needed to be fixed to make it work. This is a prerequisite step to making out ops c10-full. Diffs stacked on top of this will introduce a hacky_wrapper for out ops and use it to make more ops c10-full. 
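Note that the user-visible call is expected to be unaffected: `out` is already the trailing, keyword-only argument in the JIT schema, and this change only reorders the C++ kernel signature to line up with it. A quick sanity-check sketch (illustrative only):

```
import torch

a, b = torch.randn(3), torch.randn(3)
out = torch.empty(3)
# schema: div.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
torch.div(a, b, out=out)
assert torch.allclose(out, a / b)
```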
ghstack-source-id: 118318433 Test Plan: waitforsandcastle Reviewed By: ezyang Differential Revision: D24901944 fbshipit-source-id: e477cb41675e477808c76af01706508beee44752 --- aten/src/ATen/native/BinaryOps.cpp | 4 ++-- aten/src/ATen/native/native_functions.yaml | 1 + aten/src/ATen/native/sparse/SparseTensorMath.cpp | 10 +++++----- tools/autograd/gen_trace_type.py | 12 +++++++++--- 4 files changed, 17 insertions(+), 10 deletions(-) diff --git a/aten/src/ATen/native/BinaryOps.cpp b/aten/src/ATen/native/BinaryOps.cpp index 0618bbf6260c..e8751be55387 100644 --- a/aten/src/ATen/native/BinaryOps.cpp +++ b/aten/src/ATen/native/BinaryOps.cpp @@ -147,7 +147,7 @@ Tensor& copysign_(Tensor& self, Scalar other) { return native::copysign_(self, wrapped_scalar_tensor(other)); } -Tensor& div_out(Tensor& result, const Tensor& self, const Tensor& other) { +Tensor& div_out(const Tensor& self, const Tensor& other, Tensor& result) { auto iter = TensorIterator::binary_float_op(result, self, other); div_stub(iter.device_type(), iter); return result; @@ -161,7 +161,7 @@ Tensor div(const Tensor& self, const Tensor& other) { } Tensor& div_(Tensor& self, const Tensor& other) { - return native::div_out(self, self, other); + return native::div_out(self, other, self); } // WARNING: There doesn't appear to be any testing for this function diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index dc6e815fb438..acb424fcc28a 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -1485,6 +1485,7 @@ SparseCPU, SparseCUDA: div_sparse_ - func: div.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) + use_c10_dispatcher: full dispatch: CPU, CUDA: div_out SparseCPU, SparseCUDA: div_out_sparse_zerodim diff --git a/aten/src/ATen/native/sparse/SparseTensorMath.cpp b/aten/src/ATen/native/sparse/SparseTensorMath.cpp index 3c836d0258d1..2bee0a581366 100644 --- a/aten/src/ATen/native/sparse/SparseTensorMath.cpp +++ b/aten/src/ATen/native/sparse/SparseTensorMath.cpp @@ -241,7 +241,7 @@ static SparseTensor& coalesce_(SparseTensor& tensor) { // values=[1., 1.] (after truncation), which sum to 2.f instead of 3.f. // To perform floor division the sparse tensor must be coalesced first. 
-SparseTensor& div_out_sparse_zerodim(SparseTensor& r, const SparseTensor& t, const Tensor& value) { +SparseTensor& div_out_sparse_zerodim(const SparseTensor& t, const Tensor& value, SparseTensor& r) { TORCH_CHECK(value.dim() == 0, "Sparse division requires a scalar or ", "zero-dim dense tensor divisor (got shape ", value.sizes(), " for divisor)"); TORCH_CHECK(!value.is_sparse(), "Sparse division requires a scalar or ", @@ -279,15 +279,15 @@ Tensor div_sparse(const Tensor& self, const Tensor& value) { commonDtype = typeMetaToScalarType(at::get_default_dtype()); } Tensor result = at::empty({0}, self.options().dtype(commonDtype)); - return div_out_sparse_zerodim(result, self, value); + return div_out_sparse_zerodim(self, value, result); } Tensor& div_sparse_(Tensor& self, const Tensor& value) { - return div_out_sparse_zerodim(self, self, value); + return div_out_sparse_zerodim(self, value, self); } SparseTensor& div_out_sparse_scalar(SparseTensor& r, const SparseTensor& t, Scalar value) { - return div_out_sparse_zerodim(r, t, wrapped_scalar_tensor(value)); + return div_out_sparse_zerodim(t, wrapped_scalar_tensor(value), r); } // -------------------------------------------------------------------- @@ -1108,7 +1108,7 @@ SparseTensor& _sspaddmm_out_cpu( "sspaddmm: Argument #1: Expected dim 1 size ", dim_k, ", got ", t.size(1)); int64_t nnz = sparse._nnz(); - // We have to make indices contiguous as we use indices.data_ptr in _to_csr which assumes row-contiguous storage + // We have to make indices contiguous as we use indices.data_ptr in _to_csr which assumes row-contiguous storage LongTensor indices = sparse._indices().contiguous(); Tensor values = sparse._values(); diff --git a/tools/autograd/gen_trace_type.py b/tools/autograd/gen_trace_type.py index e55402f9e68d..bd478b2de8d3 100644 --- a/tools/autograd/gen_trace_type.py +++ b/tools/autograd/gen_trace_type.py @@ -126,11 +126,17 @@ def dispatch_trace_input(arg: Union[Argument, TensorOptionsArguments]) -> Sequen args = [cpp_args.argument for cpp_args in sig_group.signature.arguments()] if f.func.is_out_fn(): - # *_out functions take the result as a first argument, but they are the - # last argument in the JIT schema. + # *_out functions take the result as a separate argument, but we don't want to + # trace that argument directly. Instead, we trace its TensorOptions. + # So first, we need to remove the out argument from the list of arguments to trace. # TODO: byte-for-byte compatible with old codegen behavior - it's incorrect to assume # there is only one output argument. - args = args[1:] + if f.use_c10_dispatcher.dispatcher_uses_new_style(): + # for c10-full ops, the out argument is in the end + args = args[:-1] + else: + # for legacy ops, the out argument is in the beginning. + args = args[1:] trace_inputs = itertools.chain.from_iterable(dispatch_trace_input(arg) for arg in args) From c892c3ac9affb428d28696270f337007729ed914 Mon Sep 17 00:00:00 2001 From: Sebastian Messmer Date: Thu, 10 Dec 2020 12:51:13 -0800 Subject: [PATCH 143/250] remove hacky_wrapper from BackendSelect (#49079) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49079 BackendSelect kernels have been changed to be written the new way, so this hacky_wrapper here isn't needed anymore. This PR is not expected to change perf or anything, just simplify the code a bit. The hacky_wrapper here was a no-op and not creating any actual wrappers because it short-cirtuits to not create a wrapper when there is no wrapper needed. 
ghstack-source-id: 118318436 Test Plan: waitforsandcastle Reviewed By: bhosmer Differential Revision: D25421633 fbshipit-source-id: 7a6125613f465dabed155dd892c8be6af5c617cf --- tools/codegen/gen.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tools/codegen/gen.py b/tools/codegen/gen.py index d101c78b67f7..36f3041fd617 100644 --- a/tools/codegen/gen.py +++ b/tools/codegen/gen.py @@ -801,12 +801,8 @@ def __call__(self, f: NativeFunction) -> Optional[str]: }} """ elif self.target is Target.REGISTRATION: - if local.use_c10_dispatcher() is UseC10Dispatcher.full: + if local.use_c10_dispatcher().dispatcher_uses_new_style(): return f"""m.impl("aten::{f.func.name}", TORCH_FN({name}));""" - elif local.use_c10_dispatcher() is UseC10Dispatcher.hacky_wrapper_for_legacy_signatures: - return f"""m.impl("aten::{f.func.name}", - c10::impl::hacky_wrapper_for_legacy_signatures<{dispatcher_sig.type()}>( - TORCH_FN({name})));""" else: assert local.use_c10_dispatcher() is UseC10Dispatcher.with_codegenerated_unboxing_wrapper return f"""m.impl_UNBOXED("aten::{f.func.name}", {name});""" From a480ca53028658ec32ee18183a40cda60304663a Mon Sep 17 00:00:00 2001 From: Meghan Lele Date: Thu, 10 Dec 2020 13:09:16 -0800 Subject: [PATCH 144/250] [JIT] Use `is_buffer` in `BufferPolicy::valid` (#49053) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49053 **Summary** `BufferPolicy::valid` uses `!typ->is_parameter(i)` to check if an attribute is a buffer or not; it should use `type->is_buffer(i)` instead. **Test Plan** It is difficult to write an additional test that would have failed before this commit because the two booleans `is_parameter` and `is_buffer` are never set to `true` at the same time. **Fixes** This commit fixes #48746. Test Plan: Imported from OSS Reviewed By: pbelevich Differential Revision: D25434956 Pulled By: SplitInfinity fbshipit-source-id: ff2229058abbafed0b67d7b26254d406e5f7b074 --- torch/csrc/jit/api/module.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/csrc/jit/api/module.h b/torch/csrc/jit/api/module.h index 4b68a85c6696..1d96931f25fe 100644 --- a/torch/csrc/jit/api/module.h +++ b/torch/csrc/jit/api/module.h @@ -508,7 +508,7 @@ struct TORCH_API BufferPolicy { } static bool valid(const ClassTypePtr& typ, size_t i, const IValue& v) { return typ->getAttribute(i)->isSubtypeOf(TensorType::get()) && - !typ->is_parameter(i); + typ->is_buffer(i); } static CONSTEXPR_EXCEPT_WIN_CUDA bool all_slots = false; }; From a3e1bd1fb9ad1dbc8506068d206969ca08872404 Mon Sep 17 00:00:00 2001 From: Zino Benaissa Date: Thu, 10 Dec 2020 13:34:36 -0800 Subject: [PATCH 145/250] Preserve submodule with __set_state__ in freezing (#47308) Summary: This PR does the following: - fail freezing if input module has __set_state__ method - preserves attributes of submodules with __set_state__ method. 
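In user code the two behaviors look roughly like this (a minimal sketch of the same scenario the new tests below cover; `Sub` and `Top` are illustrative names):

```
import torch

class Sub(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.tensor = torch.randn(2, 2)

    @torch.jit.export
    def __getstate__(self):
        return (self.tensor, self.training)

    @torch.jit.export
    def __setstate__(self, state):
        self.tensor = 2 * state[0]
        self.training = state[1]

    def forward(self, x):
        return x + self.tensor

class Top(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.sub = Sub()

    def forward(self, x):
        return self.sub(x)

m = torch.jit.script(Top()).eval()
frozen = torch.jit.freeze(m)       # ok: `sub` and its attributes are preserved

# torch.jit.freeze(torch.jit.script(Sub()).eval())
# -> RuntimeError: cannot freeze a module that has __set_state__
```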
Fixes #{issue number} Pull Request resolved: https://github.com/pytorch/pytorch/pull/47308 Reviewed By: eellison Differential Revision: D24711613 Pulled By: bzinodev fbshipit-source-id: 22e51417454aaf85cc0ae4acb2dc7fc822f149a2 --- test/jit/test_freezing.py | 94 +++++++++++++++++++++++++ torch/csrc/jit/passes/freeze_module.cpp | 20 +++++- 2 files changed, 112 insertions(+), 2 deletions(-) diff --git a/test/jit/test_freezing.py b/test/jit/test_freezing.py index 7b7490e80ec5..6066835a5eaa 100644 --- a/test/jit/test_freezing.py +++ b/test/jit/test_freezing.py @@ -140,6 +140,100 @@ def forward(self, x): output_f = mf.forward(input) self.assertEqual(output_s, output_f) + def test_freeze_module_with_setstate(self): + class M(torch.nn.Module): + def __init__(self): + super(M, self).__init__() + self.tensor = torch.randn(2, 2) + + @torch.jit.export + def __getstate__(self): + return (self.tensor, self.training) + + @torch.jit.export + def __setstate__(self, state): + self.tensor = 2 * state[0] + self.training = state[1] + + def forward(self, x): + return x + self.tensor + + m = torch.jit.script(M()) + m.eval() + with self.assertRaisesRegex(RuntimeError, "cannot freeze a module that has __set_state__"): + mf = torch.jit.freeze(m) + + def test_freeze_module_with_submodule_setstate(self): + class M(torch.nn.Module): + def __init__(self): + super(M, self).__init__() + self.tensor = torch.randn(2, 2) + + @torch.jit.export + def __getstate__(self): + return (self.tensor, self.training) + + @torch.jit.export + def __setstate__(self, state): + self.tensor = 2 * state[0] + self.training = state[1] + + def forward(self, x): + return x + self.tensor + + class TestModule(nn.Module): + def __init__(self): + super(TestModule, self).__init__() + self.sub = M() + self.a = torch.randn(2, 2) + self.b = 4 + + def forward(self, x): + return self.sub(x) + self.a + + m = torch.jit.script(TestModule()) + m.eval() + input = torch.randn(2, 2) + output_s = m.forward(input) + mf = torch.jit.freeze(m) + + output_f = mf.forward(input) + buffer = io.BytesIO() + torch.jit.save(mf._c, buffer) + buffer.seek(0) + loaded = torch.jit.load(buffer) + output_l = loaded.forward(input) + + # Check if frozen module looks as below: + # module m { + # attributes { + # sub = ... + # } + # ... + # submodule { + # module m { + # attributes { + # training = + # tensor = ... + # } + # ... + # } + # } + # } + mf = mf._c + self.assertFalse(mf.hasattr('a')) + self.assertTrue(mf.hasattr('sub')) + self.assertTrue(mf.sub.hasattr('tensor')) + self.assertTrue(mf.sub.hasattr('training')) + + # __setstate__ is executed cloning the module for freezing + self.assertEqual(mf.sub.tensor, 2 * m.sub.tensor) + self.assertEqual(output_s + m.sub.tensor , output_f) + + # __setstate__ is execuded loading frozen module + self.assertEqual(loaded.sub.tensor, 2 * mf.sub.tensor) + self.assertEqual(output_l, mf.sub.tensor + output_f) + def test_freeze_module_with_fork(self): class SubModule(nn.Module): def __init__(self): diff --git a/torch/csrc/jit/passes/freeze_module.cpp b/torch/csrc/jit/passes/freeze_module.cpp index 2778c7712f23..76ca595c3283 100644 --- a/torch/csrc/jit/passes/freeze_module.cpp +++ b/torch/csrc/jit/passes/freeze_module.cpp @@ -241,11 +241,20 @@ class AttributePropagator { } auto attr = attrModule.attr(name); + auto mptr = attrModule._ivalue(); if (n->kind() == prim::GetAttr) { auto type = n->output()->type(); // Do not record submodules. Their attributes are tracked // individually. 
- if (attr.isObject() || !AliasDb::isMutableType(attr.type())) { + if (attr.isObject()) { + auto submodule = attr.toModule(); + if (submodule.find_method("__setstate__")) { + insertMutableAttr(name, attr, mptr); + } + continue; + } + + if (!AliasDb::isMutableType(attr.type())) { continue; } usedAttrs_.insert(attr); @@ -256,7 +265,6 @@ class AttributePropagator { n->kind() == prim::GetAttr ? "attribute: " + name + " in %" + n->output()->debugName() + " has inplace writer" : "attribute: " + name + " is set"); - auto mptr = attrModule._ivalue(); insertMutableAttr(name, attr, mptr); } } else if (n->kind() == prim::fork) { @@ -525,6 +533,11 @@ class AttributePropagator { return true; } } + + if (subModule.find_method("__setstate__")) { + return true; + } + return preservedSubModule_.count(subModule._ivalue()); } @@ -751,6 +764,9 @@ Module freeze_module( std::vector preservedAttrs, bool freezeInterfaces, bool preserveParameters) { + TORCH_CHECK( + !module.find_method("__setstate__"), + "cannot freeze a module that has __set_state__"); Method method = module.get_method("forward"); // Check that module does not return itself. for (auto& output : method.graph()->outputs()) { From 95a1725a4a524d215817224237f3bb4a5ff9b84f Mon Sep 17 00:00:00 2001 From: Abdelrauf Date: Thu, 10 Dec 2020 13:35:02 -0800 Subject: [PATCH 146/250] Vsx initial support issue27678 (#41541) Summary: ### Pytorch Vec256 ppc64le support implemented types: - double - float - int16 - int32 - int64 - qint32 - qint8 - quint8 - complex_float - complex_double Notes: All basic vector operations are implemented: There are a few problems: - minimum maximum nan propagation for ppc64le is missing and was not checked - complex multiplication, division, sqrt, abs are implemented as PyTorch x86. they can overflow and have precision problems than std ones. That's why they were either excluded or tested in smaller domain range - precisions of the implemented float math functions ~~Besides, I added CPU_CAPABILITY for power. but as because of quantization errors for DEFAULT I had to undef and use vsx for DEFAULT too~~ #### Details ##### Supported math functions + plus sign means vectorized, - minus sign means missing, (implementation notes are added inside braces) (notes). 
Example: -(both ) means it was also missing on x86 side g( func_name) means vectorization is using func_name sleef - redirected to the Sleef unsupported function_name | float | double | complex float | complex double |-- | -- | -- | -- | --| acos | sleef | sleef | f(asin) | f(asin) asin | sleef | sleef | +(pytorch impl) | +(pytorch impl) atan | sleef | sleef | f(log) | f(log) atan2 | sleef | sleef | unsupported | unsupported cos | +((ppc64le:avx_mathfun) ) | sleef | -(both) | -(both) cosh | f(exp) | -(both) | -(both) | erf | sleef | sleef | unsupported | unsupported erfc | sleef | sleef | unsupported | unsupported erfinv | - (both) | - (both) | unsupported | unsupported exp | + | sleef | - (x86:f()) | - (x86:f()) expm1 | f(exp) | sleef | unsupported | unsupported lgamma | sleef | sleef | | log | + | sleef | -(both) | -(both) log10 | f(log) | sleef | f(log) | f(log) log1p | f(log) | sleef | unsupported | unsupported log2 | f(log) | sleef | f(log) | f(log) pow | + f(exp) | sleef | -(both) | -(both) sin | +((ppc64le:avx_mathfun) ) | sleef | -(both) | -(both) sinh | f(exp) | sleef | -(both) | -(both) tan | sleef | sleef | -(both) | -(both) tanh | f(exp) | sleef | -(both) | -(both) hypot | sleef | sleef | -(both) | -(both) nextafter | sleef | sleef | -(both) | -(both) fmod | sleef | sleef | -(both) | -(both) [Vec256 Test cases Pr https://github.com/pytorch/pytorch/issues/42685](https://github.com/pytorch/pytorch/pull/42685) Current list: - [x] Blends - [x] Memory: UnAlignedLoadStore - [x] Arithmetics: Plus,Minu,Multiplication,Division - [x] Bitwise: BitAnd, BitOr, BitXor - [x] Comparison: Equal, NotEqual, Greater, Less, GreaterEqual, LessEqual - [x] MinMax: Minimum, Maximum, ClampMin, ClampMax, Clamp - [x] SignManipulation: Absolute, Negate - [x] Interleave: Interleave, DeInterleave - [x] Rounding: Round, Ceil, Floor, Trunc - [x] Mask: ZeroMask - [x] SqrtAndReciprocal: Sqrt, RSqrt, Reciprocal - [x] Trigonometric: Sin, Cos, Tan - [x] Hyperbolic: Tanh, Sinh, Cosh - [x] InverseTrigonometric: Asin, ACos, ATan, ATan2 - [x] Logarithm: Log, Log2, Log10, Log1p - [x] Exponents: Exp, Expm1 - [x] ErrorFunctions: Erf, Erfc, Erfinv - [x] Pow: Pow - [x] LGamma: LGamma - [x] Quantization: quantize, dequantize, requantize_from_int - [x] Quantization: widening_subtract, relu, relu6 Missing: - [ ] Constructors, initializations - [ ] Conversion , Cast - [ ] Additional: imag, conj, angle (note: imag and conj only checked for float complex) #### Notes on tests and testing framework - some math functions are tested within domain range - mostly testing framework randomly tests against std implementation within the domain or within the implementation domain for some math functions. - some functions are tested against the local version. ~~For example, std::round and vector version of round differs. so it was tested against the local version~~ - round was tested against pytorch at::native::round_impl. ~~for double type on **Vsx vec_round failed for (even)+0 .5 values**~~ . it was solved by using vec_rint - ~~**complex types are not tested**~~ **After enabling complex testing due to precision and domain some of the complex functions failed for vsx and x86 avx as well. 
I will either test it against local implementation or check within the accepted domain** - ~~quantizations are not tested~~ Added tests for quantizing, dequantize, requantize_from_int, relu, relu6, widening_subtract functions - the testing framework should be improved further - ~~For now `-DBUILD_MOBILE_TEST=ON `will be used for Vec256Test too~~ Vec256 Test cases will be built for each CPU_CAPABILITY Pull Request resolved: https://github.com/pytorch/pytorch/pull/41541 Reviewed By: zhangguanheng66 Differential Revision: D23922049 Pulled By: VitalyFedyunin fbshipit-source-id: bca25110afccecbb362cea57c705f3ce02f26098 --- aten/src/ATen/Version.cpp | 9 + aten/src/ATen/cpu/vec256/vec256.h | 4 + .../ATen/cpu/vec256/vsx/vec256_common_vsx.h | 216 ++++++ .../vec256/vsx/vec256_complex_double_vsx.h | 597 ++++++++++++++++ .../cpu/vec256/vsx/vec256_complex_float_vsx.h | 670 +++++++++++++++++ .../ATen/cpu/vec256/vsx/vec256_double_vsx.h | 392 ++++++++++ .../ATen/cpu/vec256/vsx/vec256_float_vsx.h | 676 ++++++++++++++++++ .../ATen/cpu/vec256/vsx/vec256_int16_vsx.h | 351 +++++++++ .../ATen/cpu/vec256/vsx/vec256_int32_vsx.h | 281 ++++++++ .../ATen/cpu/vec256/vsx/vec256_int64_vsx.h | 233 ++++++ .../ATen/cpu/vec256/vsx/vec256_qint32_vsx.h | 242 +++++++ .../ATen/cpu/vec256/vsx/vec256_qint8_vsx.h | 404 +++++++++++ .../ATen/cpu/vec256/vsx/vec256_quint8_vsx.h | 413 +++++++++++ aten/src/ATen/cpu/vec256/vsx/vsx_helpers.h | 332 +++++++++ aten/src/ATen/native/DispatchStub.cpp | 10 + aten/src/ATen/native/DispatchStub.h | 22 +- cmake/Codegen.cmake | 6 + cmake/Dependencies.cmake | 1 + cmake/Modules/FindVSX.cmake | 35 + 19 files changed, 4893 insertions(+), 1 deletion(-) create mode 100644 aten/src/ATen/cpu/vec256/vsx/vec256_common_vsx.h create mode 100644 aten/src/ATen/cpu/vec256/vsx/vec256_complex_double_vsx.h create mode 100644 aten/src/ATen/cpu/vec256/vsx/vec256_complex_float_vsx.h create mode 100644 aten/src/ATen/cpu/vec256/vsx/vec256_double_vsx.h create mode 100644 aten/src/ATen/cpu/vec256/vsx/vec256_float_vsx.h create mode 100644 aten/src/ATen/cpu/vec256/vsx/vec256_int16_vsx.h create mode 100644 aten/src/ATen/cpu/vec256/vsx/vec256_int32_vsx.h create mode 100644 aten/src/ATen/cpu/vec256/vsx/vec256_int64_vsx.h create mode 100644 aten/src/ATen/cpu/vec256/vsx/vec256_qint32_vsx.h create mode 100644 aten/src/ATen/cpu/vec256/vsx/vec256_qint8_vsx.h create mode 100644 aten/src/ATen/cpu/vec256/vsx/vec256_quint8_vsx.h create mode 100644 aten/src/ATen/cpu/vec256/vsx/vsx_helpers.h create mode 100644 cmake/Modules/FindVSX.cmake diff --git a/aten/src/ATen/Version.cpp b/aten/src/ATen/Version.cpp index 192e131897c8..6b9561767a5f 100644 --- a/aten/src/ATen/Version.cpp +++ b/aten/src/ATen/Version.cpp @@ -97,6 +97,14 @@ std::string used_cpu_capability() { ss << "CPU capability usage: "; auto capability = native::get_cpu_capability(); switch (capability) { +#ifdef HAVE_VSX_CPU_DEFINITION + case native::CPUCapability::DEFAULT: + ss << "DEFAULT"; + break; + case native::CPUCapability::VSX: + ss << "VSX"; + break; +#else case native::CPUCapability::DEFAULT: ss << "NO AVX"; break; @@ -106,6 +114,7 @@ std::string used_cpu_capability() { case native::CPUCapability::AVX2: ss << "AVX2"; break; +#endif default: break; } diff --git a/aten/src/ATen/cpu/vec256/vec256.h b/aten/src/ATen/cpu/vec256/vec256.h index 96d17a9e1afa..ae40b9a5b4fd 100644 --- a/aten/src/ATen/cpu/vec256/vec256.h +++ b/aten/src/ATen/cpu/vec256/vec256.h @@ -6,6 +6,7 @@ #include #include +#if !defined(__VSX__) || !defined(CPU_CAPABILITY_VSX) #include #include #include @@ -14,6 
+15,9 @@ #include #include #include +#else +#include +#endif #include #include diff --git a/aten/src/ATen/cpu/vec256/vsx/vec256_common_vsx.h b/aten/src/ATen/cpu/vec256/vsx/vec256_common_vsx.h new file mode 100644 index 000000000000..516179932d34 --- /dev/null +++ b/aten/src/ATen/cpu/vec256/vsx/vec256_common_vsx.h @@ -0,0 +1,216 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +namespace at { +namespace vec256 { + +namespace { + +DEFINE_CLAMP_FUNCS(c10::quint8) +DEFINE_CLAMP_FUNCS(c10::qint8) +DEFINE_CLAMP_FUNCS(c10::qint32) +DEFINE_CLAMP_FUNCS(int16_t) +DEFINE_CLAMP_FUNCS(int32_t) +DEFINE_CLAMP_FUNCS(int64_t) +DEFINE_CLAMP_FUNCS(float) +DEFINE_CLAMP_FUNCS(double) + +template <> +Vec256 C10_ALWAYS_INLINE fmadd( + const Vec256& a, + const Vec256& b, + const Vec256& c) { + return Vec256{ + vec_madd(a.vec0(), b.vec0(), c.vec0()), + vec_madd(a.vec1(), b.vec1(), c.vec1())}; +} + +template <> +Vec256 C10_ALWAYS_INLINE fmadd( + const Vec256& a, + const Vec256& b, + const Vec256& c) { + return Vec256{ + a.vec0() * b.vec0() + c.vec0(), a.vec1() * b.vec1() + c.vec1()}; +} +template <> +Vec256 C10_ALWAYS_INLINE fmadd( + const Vec256& a, + const Vec256& b, + const Vec256& c) { + return Vec256{ + a.vec0() * b.vec0() + c.vec0(), a.vec1() * b.vec1() + c.vec1()}; +} +template <> +Vec256 C10_ALWAYS_INLINE fmadd( + const Vec256& a, + const Vec256& b, + const Vec256& c) { + return Vec256{ + a.vec0() * b.vec0() + c.vec0(), a.vec1() * b.vec1() + c.vec1()}; +} + +DEFINE_REINTERPRET_CAST_TO_ALL_FUNCS(float) +DEFINE_REINTERPRET_CAST_TO_ALL_FUNCS(double) +DEFINE_REINTERPRET_CAST_TO_ALL_FUNCS(int64_t) +DEFINE_REINTERPRET_CAST_TO_ALL_FUNCS(int32_t) +DEFINE_REINTERPRET_CAST_TO_ALL_FUNCS(int16_t) + +template <> +Vec256 C10_ALWAYS_INLINE +convert_to_int_of_same_size(const Vec256& src) { + return Vec256{vec_signed(src.vec0()), vec_signed(src.vec1())}; +} + +template <> +Vec256 C10_ALWAYS_INLINE +convert_to_int_of_same_size( + const Vec256& src) { + return Vec256{vec_signed(src.vec0()), vec_signed(src.vec1())}; +} + +template <> +inline void convert(const int32_t* src, float* dst, int64_t n) { + // int32_t and float have same size + int64_t i; + for (i = 0; i <= (n - Vec256::size()); i += Vec256::size()) { + const int32_t* src_a = src + i; + float* dst_a = dst + i; + vint32 input_vec0 = vec_vsx_ld(offset0, reinterpret_cast(src_a)); + vint32 input_vec1 = + vec_vsx_ld(offset16, reinterpret_cast(src_a)); + vfloat32 c0 = vec_float(input_vec0); + vfloat32 c1 = vec_float(input_vec1); + vec_vsx_st(c0, offset0, dst_a); + vec_vsx_st(c1, offset16, dst_a); + } + + for (; i < n; i++) { + dst[i] = static_cast(src[i]); + } +} + +template <> +inline void convert(const int64_t* src, double* dst, int64_t n) { + int64_t i; + for (i = 0; i <= (n - Vec256::size()); i += Vec256::size()) { + const int64_t* src_a = src + i; + double* dst_a = dst + i; + vint64 input_vec0 = + vec_vsx_ld(offset0, reinterpret_cast(src_a)); + vint64 input_vec1 = + vec_vsx_ld(offset16, reinterpret_cast(src_a)); + vfloat64 c0 = vec_double(input_vec0); + vfloat64 c1 = vec_double(input_vec1); + vec_vsx_st(c0, offset0, reinterpret_cast(dst_a)); + vec_vsx_st(c1, offset16, reinterpret_cast(dst_a)); + } + for (; i < n; i++) { + dst[i] = static_cast(src[i]); + } +} + +template <> +std::pair, Vec256> inline interleave2( + const Vec256& a, + const Vec256& b) { + // inputs: + // a = {a0, a1, a2, a3} + // b = {b0, b1, b2, b3} + + vfloat64 ab00 = vec_xxpermdi(a.vec0(), b.vec0(), 
0); + vfloat64 ab11 = vec_xxpermdi(a.vec0(), b.vec0(), 3); + vfloat64 ab2_00 = vec_xxpermdi(a.vec1(), b.vec1(), 0); + vfloat64 ab2_11 = vec_xxpermdi(a.vec1(), b.vec1(), 3); + // return {a0, b0, a1, b1} + // {a2, b2, a3, b3} + return std::make_pair( + Vec256{ab00, ab11}, Vec256{ab2_00, ab2_11}); +} + +template <> +std::pair, Vec256> inline deinterleave2( + const Vec256& a, + const Vec256& b) { + // inputs: + // a = {a0, b0, a1, b1} + // b = {a2, b2, a3, b3} + vfloat64 aa01 = vec_xxpermdi(a.vec0(), a.vec1(), 0); + vfloat64 aa23 = vec_xxpermdi(b.vec0(), b.vec1(), 0); + + vfloat64 bb_01 = vec_xxpermdi(a.vec0(), a.vec1(), 3); + vfloat64 bb_23 = vec_xxpermdi(b.vec0(), b.vec1(), 3); + + // swap lanes: + // return {a0, a1, a2, a3} + // {b0, b1, b2, b3} + return std::make_pair( + Vec256{aa01, aa23}, Vec256{bb_01, bb_23}); +} + +template <> +std::pair, Vec256> inline interleave2( + const Vec256& a, + const Vec256& b) { + // inputs: + // a = {a0, a1, a2, a3,, a4, a5, a6, a7} + // b = {b0, b1, b2, b3,, b4, b5, b6, b7} + + vfloat32 ab0011 = vec_mergeh(a.vec0(), b.vec0()); + vfloat32 ab2233 = vec_mergel(a.vec0(), b.vec0()); + + vfloat32 ab2_0011 = vec_mergeh(a.vec1(), b.vec1()); + vfloat32 ab2_2233 = vec_mergel(a.vec1(), b.vec1()); + // group cols crossing lanes: + // return {a0, b0, a1, b1,, a2, b2, a3, b3} + // {a4, b4, a5, b5,, a6, b6, a7, b7} + + return std::make_pair( + Vec256{ab0011, ab2233}, Vec256{ab2_0011, ab2_2233}); +} + +template <> +std::pair, Vec256> inline deinterleave2( + const Vec256& a, + const Vec256& b) { + // inputs: + // a = {a0, b0, a1, b1,, a2, b2, a3, b3} + // b = {a4, b4, a5, b5,, a6, b6, a7, b7} + + // {a0,a2,b0,b2} {a1,a3,b1,b3} + vfloat32 a0a2b0b2 = vec_mergeh(a.vec0(), a.vec1()); + vfloat32 a1a3b1b3 = vec_mergel(a.vec0(), a.vec1()); + + vfloat32 aa0123 = vec_mergeh(a0a2b0b2, a1a3b1b3); + vfloat32 bb0123 = vec_mergel(a0a2b0b2, a1a3b1b3); + + vfloat32 a0a2b0b2_2 = vec_mergeh(b.vec0(), b.vec1()); + vfloat32 a1a3b1b3_2 = vec_mergel(b.vec0(), b.vec1()); + + vfloat32 aa0123_2 = vec_mergeh(a0a2b0b2_2, a1a3b1b3_2); + vfloat32 bb0123_2 = vec_mergel(a0a2b0b2_2, a1a3b1b3_2); + + // it could be done with vec_perm ,too + // swap lanes: + // return {a0, a1, a2, a3,, a4, a5, a6, a7} + // {b0, b1, b2, b3,, b4, b5, b6, b7} + + return std::make_pair( + Vec256{aa0123, aa0123_2}, Vec256{bb0123, bb0123_2}); +} + +} // namespace +} // namespace vec256 +} // namespace at diff --git a/aten/src/ATen/cpu/vec256/vsx/vec256_complex_double_vsx.h b/aten/src/ATen/cpu/vec256/vsx/vec256_complex_double_vsx.h new file mode 100644 index 000000000000..f62ac36850be --- /dev/null +++ b/aten/src/ATen/cpu/vec256/vsx/vec256_complex_double_vsx.h @@ -0,0 +1,597 @@ +#pragma once +#include +#include +#include +#include + +namespace at { +namespace vec256 { +// See Note [Acceptable use of anonymous namespace in header] +namespace { +using ComplexDbl = c10::complex; + +template <> +class Vec256 { + union { + struct { + vfloat64 _vec0; + vfloat64 _vec1; + }; + struct { + vbool64 _vecb0; + vbool64 _vecb1; + }; + + } __attribute__((__may_alias__)); + + public: + using value_type = ComplexDbl; + using vec_internal_type = vfloat64; + using vec_internal_mask_type = vbool64; + static constexpr int size() { + return 2; + } + Vec256() {} + C10_ALWAYS_INLINE Vec256(vfloat64 v) : _vec0{v}, _vec1{v} {} + C10_ALWAYS_INLINE Vec256(vbool64 vmask) : _vecb0{vmask}, _vecb1{vmask} {} + C10_ALWAYS_INLINE Vec256(vfloat64 v1, vfloat64 v2) : _vec0{v1}, _vec1{v2} {} + C10_ALWAYS_INLINE Vec256(vbool64 v1, vbool64 v2) : _vecb0{v1}, _vecb1{v2} {} 
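+ // Note: a Vec256 of c10::complex<double> packs two complex values across the
+ // two 128-bit VSX registers: _vec0 = {val1.real(), val1.imag()} and
+ // _vec1 = {val2.real(), val2.imag()}, as the constructors below show.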
+ + Vec256(ComplexDbl val) { + double real_value = val.real(); + double imag_value = val.imag(); + _vec0 = vfloat64{real_value, imag_value}; + _vec1 = vfloat64{real_value, imag_value}; + } + Vec256(ComplexDbl val1, ComplexDbl val2) { + _vec0 = vfloat64{val1.real(), val1.imag()}; + _vec1 = vfloat64{val2.real(), val2.imag()}; + } + + C10_ALWAYS_INLINE const vec_internal_type& vec0() const { + return _vec0; + } + C10_ALWAYS_INLINE const vec_internal_type& vec1() const { + return _vec1; + } + + template + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vec256& a, const Vec256& b) { + return a; + } + + template + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vec256& a, const Vec256& b) { + return b; + } + + template + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vec256& a, const Vec256& b) { + return {b._vec0, a._vec1}; + } + + template + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vec256& a, const Vec256& b) { + return {a._vec0, b._vec1}; + } + + template + static Vec256 C10_ALWAYS_INLINE + el_blend(const Vec256& a, const Vec256& b) { + const vbool64 mask_1st = VsxDblMask1(mask); + const vbool64 mask_2nd = VsxDblMask2(mask); + return { + (vfloat64)vec_sel(a._vec0, b._vec0, mask_1st), + (vfloat64)vec_sel(a._vec1, b._vec1, mask_2nd)}; + } + + static Vec256 blendv( + const Vec256& a, + const Vec256& b, + const Vec256& mask) { + // convert std::complex index mask to V index mask: xy -> xxyy + auto mask_complex = + Vec256(vec_splat(mask._vec0, 0), vec_splat(mask._vec1, 0)); + return { + vec_sel(a._vec0, b._vec0, mask_complex._vecb0), + vec_sel(a._vec1, b._vec1, mask_complex._vecb1)}; + } + + static Vec256 C10_ALWAYS_INLINE elwise_blendv( + const Vec256& a, + const Vec256& b, + const Vec256& mask) { + return { + vec_sel(a._vec0, b._vec0, mask._vecb0), + vec_sel(a._vec1, b._vec1, mask._vecb1)}; + } + template + static Vec256 arange( + ComplexDbl base = 0., + step_t step = static_cast(1)) { + return Vec256(base, base + step); + } + static Vec256 set( + const Vec256& a, + const Vec256& b, + int64_t count = size()) { + switch (count) { + case 0: + return a; + case 1: + return blend<1>(a, b); + } + return b; + } + + static Vec256 C10_ALWAYS_INLINE + loadu(const void* ptr, int count = size()) { + if (count == size()) { + return { + vec_vsx_ld(offset0, reinterpret_cast(ptr)), + vec_vsx_ld(offset16, reinterpret_cast(ptr))}; + } + + __at_align32__ value_type tmp_values[size()]; + std::memcpy(tmp_values, ptr, std::min(count, size()) * sizeof(value_type)); + + return { + vec_vsx_ld(offset0, reinterpret_cast(tmp_values)), + vec_vsx_ld(offset16, reinterpret_cast(tmp_values))}; + } + void C10_ALWAYS_INLINE store(void* ptr, int count = size()) const { + if (count == size()) { + vec_vsx_st(_vec0, offset0, reinterpret_cast(ptr)); + vec_vsx_st(_vec1, offset16, reinterpret_cast(ptr)); + } else if (count > 0) { + __at_align32__ value_type tmp_values[size()]; + vec_vsx_st(_vec0, offset0, reinterpret_cast(tmp_values)); + vec_vsx_st(_vec1, offset16, reinterpret_cast(tmp_values)); + std::memcpy( + ptr, tmp_values, std::min(count, size()) * sizeof(value_type)); + } + } + + const ComplexDbl& operator[](int idx) const = delete; + ComplexDbl& operator[](int idx) = delete; + + Vec256 map(ComplexDbl (*f)(ComplexDbl)) const { + __at_align32__ ComplexDbl tmp[size()]; + store(tmp); + for (int i = 0; i < size(); i++) { + tmp[i] = f(tmp[i]); + } + return loadu(tmp); + } + + Vec256 map(ComplexDbl (*f)(const ComplexDbl&)) const { + __at_align32__ ComplexDbl tmp[size()]; + store(tmp); 
+ for (int i = 0; i < size(); i++) { + tmp[i] = f(tmp[i]); + } + return loadu(tmp); + } + + Vec256 el_swapped() const { + vfloat64 v0 = vec_xxpermdi(_vec0, _vec0, 2); + vfloat64 v1 = vec_xxpermdi(_vec1, _vec1, 2); + return {v0, v1}; + } + + Vec256 el_madd( + const Vec256& multiplier, + const Vec256& val) const { + return { + vec_madd(_vec0, multiplier._vec0, val._vec0), + vec_madd(_vec1, multiplier._vec1, val._vec1)}; + } + + Vec256 el_mergeo() const { + vfloat64 v0 = vec_splat(_vec0, 1); + vfloat64 v1 = vec_splat(_vec1, 1); + return {v0, v1}; + } + + Vec256 el_mergee() const { + vfloat64 v0 = vec_splat(_vec0, 0); + vfloat64 v1 = vec_splat(_vec1, 0); + return {v0, v1}; + } + + static Vec256 el_mergee( + Vec256& first, + Vec256& second) { + // as mergee phased in , we can use vec_perm with mask + return { + vec_mergeh(first._vec0, second._vec0), + vec_mergeh(first._vec1, second._vec1)}; + } + + Vec256 abs_2_() const { + auto a = (*this).elwise_mult(*this); + auto permuted = a.el_swapped(); + a = a + permuted; + return a; + } + + Vec256 abs_() const { + auto ret = abs_2_(); + return ret.elwise_sqrt(); + } + + Vec256 abs() const { + return abs_() & vd_real_mask; + } + + Vec256 angle_() const { + // angle = atan2(b/a) + // auto b_a = _mm256_permute_pd(values, 0x05); // b a + // return Sleef_atan2d4_u10(values, b_a); // 90-angle angle + auto ret = el_swapped(); + for (int i = 0; i < 2; i++) { + ret._vec0[i] = std::atan2(_vec0[i], ret._vec0[i]); + ret._vec1[i] = std::atan2(_vec1[i], ret._vec0[i]); + } + return ret; + } + + Vec256 angle() const { + auto a = angle_().el_swapped(); + return a & vd_real_mask; + } + + Vec256 real_() const { + return *this & vd_real_mask; + } + Vec256 real() const { + return *this & vd_real_mask; + } + Vec256 imag_() const { + return *this & vd_imag_mask; + } + Vec256 imag() const { + return imag_().el_swapped(); + } + + Vec256 conj_() const { + return *this ^ vd_isign_mask; + } + Vec256 conj() const { + return *this ^ vd_isign_mask; + } + + Vec256 log() const { + // Most trigonomic ops use the log() op to improve complex number + // performance. 
+ return map(std::log); + } + + Vec256 log2() const { + // log2eB_inv + auto ret = log(); + return ret.elwise_mult(vd_log2e_inv); + } + Vec256 log10() const { + auto ret = log(); + return ret.elwise_mult(vd_log10e_inv); + } + + Vec256 asin() const { + // asin(x) + // = -i*ln(iz + sqrt(1 -z^2)) + // = -i*ln((ai - b) + sqrt(1 - (a + bi)*(a + bi))) + // = -i*ln((-b + ai) + sqrt(1 - (a**2 - b**2) - 2*abi)) + auto conj = conj_(); + auto b_a = conj.el_swapped(); + auto ab = conj.elwise_mult(b_a); + auto im = ab + ab; + auto val_2 = (*this).elwise_mult(*this); + auto val_2_swapped = val_2.el_swapped(); + auto re = horizontal_sub(val_2, val_2_swapped); + re = Vec256(vd_one) - re; + auto root = el_blend<0x0A>(re, im).sqrt(); + auto ln = (b_a + root).log(); + return ln.el_swapped().conj(); + } + + Vec256 acos() const { + // acos(x) = pi/2 - asin(x) + return Vec256(vd_pi_2) - asin(); + } + + Vec256 atan() const { + // atan(x) = i/2 * ln((i + z)/(i - z)) + auto ione = Vec256(vd_imag_one); + auto sum = ione + *this; + auto sub = ione - *this; + auto ln = (sum / sub).log(); // ln((i + z)/(i - z)) + return ln * vd_imag_half; // i/2*ln() + } + + Vec256 sin() const { + return map(std::sin); + } + Vec256 sinh() const { + return map(std::sinh); + } + Vec256 cos() const { + return map(std::cos); + } + Vec256 cosh() const { + return map(std::cosh); + } + + Vec256 tan() const { + return map(std::tan); + } + Vec256 tanh() const { + return map(std::tanh); + } + Vec256 ceil() const { + return {vec_ceil(_vec0), vec_ceil(_vec1)}; + } + Vec256 floor() const { + return {vec_floor(_vec0), vec_floor(_vec1)}; + } + Vec256 neg() const { + auto z = Vec256(vd_zero); + return z - *this; + } + Vec256 round() const { + return {vec_rint(_vec0), vec_rint(_vec1)}; + } + + Vec256 trunc() const { + return {vec_trunc(_vec0), vec_trunc(_vec1)}; + } + + Vec256 elwise_sqrt() const { + return {vec_sqrt(_vec0), vec_sqrt(_vec1)}; + } + + void dump() const { + std::cout << _vec0[0] << "," << _vec0[1] << ","; + std::cout << _vec1[0] << "," << _vec1[1] << std::endl; + } + + Vec256 sqrt() const { + return map(std::sqrt); + } + + Vec256 reciprocal() const { + // re + im*i = (a + bi) / (c + di) + // re = (ac + bd)/abs_2() = c/abs_2() + // im = (bc - ad)/abs_2() = d/abs_2() + auto c_d = *this ^ vd_isign_mask; // c -d + auto abs = abs_2_(); + return c_d.elwise_div(abs); + } + + Vec256 rsqrt() const { + return sqrt().reciprocal(); + } + + static Vec256 horizontal_add( + Vec256& first, + Vec256& second) { + auto first_perm = first.el_swapped(); // 2perm + auto second_perm = second.el_swapped(); // 2perm + // summ + auto first_ret = first + first_perm; // 2add + auto second_ret = second + second_perm; // 2 add + // now lets choose evens + return el_mergee(first_ret, second_ret); // 2 mergee's + } + + static Vec256 horizontal_sub( + Vec256& first, + Vec256& second) { + // we will simulate it differently with 6 instructions total + // lets permute second so that we can add it getting horizontal sums + auto first_perm = first.el_swapped(); // 2perm + auto second_perm = second.el_swapped(); // 2perm + // summ + auto first_ret = first - first_perm; // 2sub + auto second_ret = second - second_perm; // 2 sub + // now lets choose evens + return el_mergee(first_ret, second_ret); // 2 mergee's + } + + Vec256 inline operator*(const Vec256& b) const { + //(a + bi) * (c + di) = (ac - bd) + (ad + bc)i +#if 1 + // this is more vsx friendly than simulating horizontal from x86 + auto vi = b.el_mergeo(); + auto vr = b.el_mergee(); + vi = vi ^ vd_rsign_mask; + auto 
ret = elwise_mult(vr); + auto vx_swapped = el_swapped(); + ret = vx_swapped.el_madd(vi, ret); +#else + auto ac_bd = elwise_mult(b); + auto d_c = b.el_swapped(); + d_c = d_c ^ vd_isign_mask; + auto ad_bc = elwise_mult(d_c); + auto ret = horizontal_sub(ac_bd, ad_bc); +#endif + return ret; + } + + Vec256 inline operator/(const Vec256& b) const { + // re + im*i = (a + bi) / (c + di) + // re = (ac + bd)/abs_2() + // im = (bc - ad)/abs_2() +#if 1 + auto vi = b.el_mergeo(); + auto vr = b.el_mergee(); + auto abs_b = b.abs_2_(); + vi = vi ^ vd_isign_mask; + auto ret = elwise_mult(vr); + auto vx_swapped = el_swapped(); + ret = vx_swapped.el_madd(vi, ret); + ret = ret.elwise_div(abs_b); +#else + // Vec256 x86 simulation + auto ac_bd = elwise_mult(b); + auto d_c = b.el_swapped(); + d_c = d_c ^ vd_rsign_mask; + auto ad_bc = elwise_mult(d_c); + auto abs_b = b.abs_2_(); + auto re_im = horizontal_add(ac_bd, ad_bc); + auto ret = re_im.elwise_div(abs_b); +#endif + return ret; + } + + Vec256 exp() const { + return map(std::exp); + } + + Vec256 pow(const Vec256& exp) const { + __at_align32__ ComplexDbl x_tmp[size()]; + __at_align32__ ComplexDbl y_tmp[size()]; + store(x_tmp); + exp.store(y_tmp); + for (int i = 0; i < size(); i++) { + x_tmp[i] = std::pow(x_tmp[i], y_tmp[i]); + } + return loadu(x_tmp); + } + + Vec256 sgn() const { + return map(at::native::sgn_impl); + } + + Vec256 hypot(const Vec256& b) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + + Vec256 nextafter(const Vec256& b) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + + Vec256 igamma(const Vec256& x) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + + Vec256 igammac(const Vec256& x) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + + Vec256 log1p() const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + + Vec256 atan2(const Vec256& b) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + Vec256 erf() const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + Vec256 erfc() const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + + Vec256 expm1() const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + + Vec256 operator<(const Vec256& other) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + Vec256 operator<=(const Vec256& other) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + Vec256 operator>(const Vec256& other) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + Vec256 operator>=(const Vec256& other) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + + Vec256 eq(const Vec256& other) const { + auto ret = (*this == other); + return ret & vd_one; + } + Vec256 ne(const Vec256& other) const { + auto ret = (*this != other); + return ret & vd_one; + } + + Vec256 lt(const Vec256& other) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + Vec256 le(const Vec256& other) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + Vec256 gt(const Vec256& other) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + Vec256 ge(const Vec256& other) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + + DEFINE_MEMBER_OP(operator==, ComplexDbl, vec_cmpeq) + DEFINE_MEMBER_OP(operator!=, ComplexDbl, vec_cmpne) + + DEFINE_MEMBER_OP(operator+, ComplexDbl, vec_add) + DEFINE_MEMBER_OP(operator-, ComplexDbl, vec_sub) + 
DEFINE_MEMBER_OP(operator&, ComplexDbl, vec_and) + DEFINE_MEMBER_OP(operator|, ComplexDbl, vec_or) + DEFINE_MEMBER_OP(operator^, ComplexDbl, vec_xor) + // elelemtwise helpers + DEFINE_MEMBER_OP(elwise_mult, ComplexDbl, vec_mul) + DEFINE_MEMBER_OP(elwise_div, ComplexDbl, vec_div) + DEFINE_MEMBER_OP(elwise_gt, ComplexDbl, vec_cmpgt) + DEFINE_MEMBER_OP(elwise_ge, ComplexDbl, vec_cmpge) + DEFINE_MEMBER_OP(elwise_lt, ComplexDbl, vec_cmplt) + DEFINE_MEMBER_OP(elwise_le, ComplexDbl, vec_cmple) +}; + +template <> +Vec256 inline maximum( + const Vec256& a, + const Vec256& b) { + auto abs_a = a.abs_2_(); + auto abs_b = b.abs_2_(); + // auto mask = _mm256_cmp_ps(abs_a, abs_b, _CMP_LT_OQ); + // auto max = _mm256_blendv_ps(a, b, mask); + auto mask = abs_a.elwise_lt(abs_b); + auto max = Vec256::elwise_blendv(a, b, mask); + + return max; + // Exploit the fact that all-ones is a NaN. + // auto isnan = _mm256_cmp_ps(abs_a, abs_b, _CMP_UNORD_Q); + // return _mm256_or_ps(max, isnan); +} + +template <> +Vec256 inline minimum( + const Vec256& a, + const Vec256& b) { + auto abs_a = a.abs_2_(); + auto abs_b = b.abs_2_(); + // auto mask = _mm256_cmp_ps(abs_a, abs_b, _CMP_GT_OQ); + // auto min = _mm256_blendv_ps(a, b, mask); + auto mask = abs_a.elwise_gt(abs_b); + auto min = Vec256::elwise_blendv(a, b, mask); + return min; + // Exploit the fact that all-ones is a NaN. + // auto isnan = _mm256_cmp_ps(abs_a, abs_b, _CMP_UNORD_Q); + // return _mm256_or_ps(min, isnan); +} + + +} // namespace +} // namespace vec256 +} // namespace at + diff --git a/aten/src/ATen/cpu/vec256/vsx/vec256_complex_float_vsx.h b/aten/src/ATen/cpu/vec256/vsx/vec256_complex_float_vsx.h new file mode 100644 index 000000000000..cb9b4c90fbe0 --- /dev/null +++ b/aten/src/ATen/cpu/vec256/vsx/vec256_complex_float_vsx.h @@ -0,0 +1,670 @@ + +#pragma once +#include +#include +#include +#include + +namespace at { +namespace vec256 { +// See Note [Acceptable use of anonymous namespace in header] +namespace { +using ComplexFlt = c10::complex; + +template <> +class Vec256 { + private: + union { + struct { + vfloat32 _vec0; + vfloat32 _vec1; + }; + struct { + vbool32 _vecb0; + vbool32 _vecb1; + }; + + } __attribute__((__may_alias__)); + + public: + using value_type = ComplexFlt; + using vec_internal_type = vfloat32; + using vec_internal_mask_type = vbool32; + + static constexpr int size() { + return 4; + } + Vec256() {} + + C10_ALWAYS_INLINE Vec256(vfloat32 v) : _vec0{v}, _vec1{v} {} + C10_ALWAYS_INLINE Vec256(vbool32 vmask) : _vecb0{vmask}, _vecb1{vmask} {} + C10_ALWAYS_INLINE Vec256(vfloat32 v1, vfloat32 v2) : _vec0{v1}, _vec1{v2} {} + C10_ALWAYS_INLINE Vec256(vbool32 v1, vbool32 v2) : _vecb0{v1}, _vecb1{v2} {} + + Vec256(ComplexFlt val) { + float real_value = val.real(); + float imag_value = val.imag(); + _vec0 = vfloat32{real_value, imag_value, real_value, imag_value}; + _vec1 = vfloat32{real_value, imag_value, real_value, imag_value}; + } + + Vec256(ComplexFlt val1, ComplexFlt val2, ComplexFlt val3, ComplexFlt val4) { + _vec0 = vfloat32{val1.real(), val1.imag(), val2.real(), val2.imag()}; + _vec1 = vfloat32{val3.real(), val3.imag(), val4.real(), val4.imag()}; + } + + template + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vec256& a, const Vec256& b) { + return a; + } + + template + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vec256& a, const Vec256& b) { + return b; + } + + template + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vec256& a, const Vec256& b) { + return {b._vec0, a._vec1}; + } + + template + 
static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vec256& a, const Vec256& b) { + return {a._vec0, b._vec1}; + } + + template + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vec256& a, const Vec256& b) { + const vbool32 mask_1st = VsxComplexMask1(mask); + return {(vfloat32)vec_sel(a._vec0, b._vec0, mask_1st), a._vec1}; + } + + template + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vec256& a, const Vec256& b) { + const vbool32 mask_1st = VsxComplexMask1(mask); + return {(vfloat32)vec_sel(a._vec0, b._vec0, mask_1st), b._vec1}; + } + + template + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vec256& a, const Vec256& b) { + const vbool32 mask_2nd = VsxComplexMask2(mask); + // generated masks + return {a._vec0, (vfloat32)vec_sel(a._vec1, b._vec1, mask_2nd)}; + } + + template + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vec256& a, const Vec256& b) { + const vbool32 mask_2nd = VsxComplexMask2(mask); + // generated masks + return {b._vec0, (vfloat32)vec_sel(a._vec1, b._vec1, mask_2nd)}; + } + + template + static std::enable_if_t> + C10_ALWAYS_INLINE + blend(const Vec256& a, const Vec256& b) { + const vbool32 mask_1st = VsxComplexMask1(mask); + const vbool32 mask_2nd = VsxComplexMask2(mask); + return { + (vfloat32)vec_sel(a._vec0, b._vec0, mask_1st), + (vfloat32)vec_sel(a._vec1, b._vec1, mask_2nd)}; + } + + template + static Vec256 C10_ALWAYS_INLINE + el_blend(const Vec256& a, const Vec256& b) { + const vbool32 mask_1st = VsxMask1(mask); + const vbool32 mask_2nd = VsxMask2(mask); + return { + (vfloat32)vec_sel(a._vec0, b._vec0, mask_1st), + (vfloat32)vec_sel(a._vec1, b._vec1, mask_2nd)}; + } + + static Vec256 blendv( + const Vec256& a, + const Vec256& b, + const Vec256& mask) { + // convert std::complex index mask to V index mask: xy -> xxyy + auto mask_complex = Vec256( + vec_mergeh(mask._vec0, mask._vec0), vec_mergeh(mask._vec1, mask._vec1)); + // mask_complex.dump(); + return { + vec_sel(a._vec0, b._vec0, mask_complex._vec0), + vec_sel(a._vec1, b._vec1, mask_complex._vec1), + }; + } + + static Vec256 elwise_blendv( + const Vec256& a, + const Vec256& b, + const Vec256& mask) { + return { + vec_sel(a._vec0, b._vec0, mask._vec0), + vec_sel(a._vec1, b._vec1, mask._vec1), + }; + } + + template + static Vec256 arange( + ComplexFlt base = 0., + step_t step = static_cast(1)) { + return Vec256( + base, + base + step, + base + ComplexFlt(2) * step, + base + ComplexFlt(3) * step); + } + static Vec256 set( + const Vec256& a, + const Vec256& b, + int64_t count = size()) { + switch (count) { + case 0: + return a; + case 1: + return blend<1>(a, b); + case 2: + return blend<3>(a, b); + case 3: + return blend<7>(a, b); + } + return b; + } + + static Vec256 C10_ALWAYS_INLINE + loadu(const void* ptr, int count = size()) { + if (count == size()) { + return { + vec_vsx_ld(offset0, reinterpret_cast(ptr)), + vec_vsx_ld(offset16, reinterpret_cast(ptr))}; + } + + __at_align32__ value_type tmp_values[size()]; + std::memcpy(tmp_values, ptr, std::min(count, size()) * sizeof(value_type)); + + return { + vec_vsx_ld(offset0, reinterpret_cast(tmp_values)), + vec_vsx_ld(offset16, reinterpret_cast(tmp_values))}; + } + + void C10_ALWAYS_INLINE store(void* ptr, int count = size()) const { + if (count == size()) { + vec_vsx_st(_vec0, offset0, reinterpret_cast(ptr)); + vec_vsx_st(_vec1, offset16, reinterpret_cast(ptr)); + } else if (count > 0) { + __at_align32__ value_type tmp_values[size()]; + vec_vsx_st(_vec0, offset0, reinterpret_cast(tmp_values)); + 
vec_vsx_st(_vec1, offset16, reinterpret_cast(tmp_values)); + std::memcpy( + ptr, tmp_values, std::min(count, size()) * sizeof(value_type)); + } + } + + const ComplexFlt& operator[](int idx) const = delete; + ComplexFlt& operator[](int idx) = delete; + + Vec256 map(ComplexFlt (*f)(ComplexFlt)) const { + __at_align32__ ComplexFlt tmp[size()]; + store(tmp); + for (int i = 0; i < size(); i++) { + tmp[i] = f(tmp[i]); + } + return loadu(tmp); + } + + Vec256 map(ComplexFlt (*f)(const ComplexFlt&)) const { + __at_align32__ ComplexFlt tmp[size()]; + store(tmp); + for (int i = 0; i < size(); i++) { + tmp[i] = f(tmp[i]); + } + return loadu(tmp); + } + + static Vec256 horizontal_add_permD8( + Vec256& first, + Vec256& second) { + // we will simulate it differently with 6 instructions total + // lets permute second so that we can add it getting horizontal sums + auto first_perm = first.el_swapped(); // 2perm + auto second_perm = second.el_swapped(); // 2perm + // summ + auto first_ret = first + first_perm; // 2add + auto second_ret = second + second_perm; // 2 add + // now lets choose evens + return el_mergee(first_ret, second_ret); // 2 mergee's + } + + static Vec256 horizontal_sub_permD8( + Vec256& first, + Vec256& second) { + // we will simulate it differently with 6 instructions total + // lets permute second so that we can add it getting horizontal sums + auto first_perm = first.el_swapped(); // 2perm + auto second_perm = second.el_swapped(); // 2perm + // summ + auto first_ret = first - first_perm; // 2sub + auto second_ret = second - second_perm; // 2 sub + // now lets choose evens + return el_mergee(first_ret, second_ret); // 2 mergee's + } + + Vec256 abs_2_() const { + auto a = (*this).elwise_mult(*this); + auto permuted = a.el_swapped(); + a = a + permuted; + return a.el_mergee(); + } + + Vec256 abs_() const { + auto ret = abs_2_(); + return ret.elwise_sqrt(); + } + + Vec256 abs() const { + return abs_() & real_mask; + } + + Vec256 real_() const { + return *this & real_mask; + } + Vec256 real() const { + return *this & real_mask; + } + Vec256 imag_() const { + return *this & imag_mask; + } + Vec256 imag() const { + // we can use swap_mask or sldwi + auto ret = imag_(); + return { + vec_sldw(ret._vec0, ret._vec0, 3), vec_sldw(ret._vec1, ret._vec1, 3)}; + } + + Vec256 conj_() const { + return *this ^ isign_mask; + } + Vec256 conj() const { + return *this ^ isign_mask; + } + + Vec256 log() const { + // Most trigonomic ops use the log() op to improve complex number + // performance. 
+ return map(std::log); + } + + Vec256 log2() const { + // log2eB_inv + auto ret = log(); + return ret.elwise_mult(log2e_inv); + } + Vec256 log10() const { + auto ret = log(); + return ret.elwise_mult(log10e_inv); + } + + Vec256 el_swapped() const { + vfloat32 v0 = vec_perm(_vec0, _vec0, swap_mask); + vfloat32 v1 = vec_perm(_vec1, _vec1, swap_mask); + return {v0, v1}; + } + + Vec256 el_mergee() const { + // as mergee phased in , we can use vec_perm with mask + return {vec_mergee(_vec0, _vec0), vec_mergee(_vec1, _vec1)}; + } + + Vec256 el_mergeo() const { + // as mergeo phased in , we can use vec_perm with mask + return {vec_mergeo(_vec0, _vec0), vec_mergeo(_vec1, _vec1)}; + } + + Vec256 el_madd( + const Vec256& multiplier, + const Vec256& val) const { + return { + vec_madd(_vec0, multiplier._vec0, val._vec0), + vec_madd(_vec1, multiplier._vec1, val._vec1)}; + } + + static Vec256 el_mergee( + Vec256& first, + Vec256& second) { + // as mergee phased in , we can use vec_perm with mask + return { + vec_mergee(first._vec0, second._vec0), + vec_mergee(first._vec1, second._vec1)}; + } + + Vec256 angle_() const { + // angle = atan2(b/a) + // auto b_a = _mm256_permute_ps(values, 0xB1); // b a + // return Sleef_atan2f8_u10(values, b_a); // 90-angle angle + auto ret = el_swapped(); + for (int i = 0; i < 4; i++) { + ret._vec0[i] = std::atan2(_vec0[i], ret._vec0[i]); + ret._vec1[i] = std::atan2(_vec1[i], ret._vec0[i]); + } + return ret; + } + + Vec256 angle() const { + auto a = angle_().el_swapped(); + return a & real_mask; + } + + Vec256 sin() const { + return map(std::sin); + } + Vec256 sinh() const { + return map(std::sinh); + } + Vec256 cos() const { + return map(std::cos); + } + Vec256 cosh() const { + return map(std::cosh); + } + Vec256 ceil() const { + return {vec_ceil(_vec0), vec_ceil(_vec1)}; + } + Vec256 floor() const { + return {vec_floor(_vec0), vec_floor(_vec1)}; + } + Vec256 neg() const { + auto z = Vec256(zero); + return z - *this; + } + Vec256 round() const { + return {vec_round(_vec0), vec_round(_vec1)}; + } + Vec256 tan() const { + return map(std::tan); + } + Vec256 tanh() const { + return map(std::tanh); + } + Vec256 trunc() const { + return {vec_trunc(_vec0), vec_trunc(_vec1)}; + } + + Vec256 elwise_sqrt() const { + return {vec_sqrt(_vec0), vec_sqrt(_vec1)}; + } + + void dump() const { + std::cout << _vec0[0] << "," << _vec0[1] << "," << _vec0[2] << "," + << _vec0[3] << ","; + std::cout << _vec1[0] << "," << _vec1[1] << "," << _vec1[2] << "," + << _vec1[3] << std::endl; + } + + Vec256 sqrt() const { + return map(std::sqrt); + } + + Vec256 reciprocal() const { + // re + im*i = (a + bi) / (c + di) + // re = (ac + bd)/abs_2() = c/abs_2() + // im = (bc - ad)/abs_2() = d/abs_2() + auto c_d = *this ^ isign_mask; // c -d + auto abs = abs_2_(); + return c_d.elwise_div(abs); + } + + Vec256 rsqrt() const { + return sqrt().reciprocal(); + } + + Vec256 pow(const Vec256& exp) const { + __at_align32__ ComplexFlt x_tmp[size()]; + __at_align32__ ComplexFlt y_tmp[size()]; + store(x_tmp); + exp.store(y_tmp); + for (int i = 0; i < size(); i++) { + x_tmp[i] = std::pow(x_tmp[i], y_tmp[i]); + } + return loadu(x_tmp); + } + + Vec256 atan() const { + // atan(x) = i/2 * ln((i + z)/(i - z)) + auto ione = Vec256(imag_one); + auto sum = ione + *this; + auto sub = ione - *this; + auto ln = (sum / sub).log(); // ln((i + z)/(i - z)) + return ln * imag_half; // i/2*ln() + } + + Vec256 acos() const { + // acos(x) = pi/2 - asin(x) + return Vec256(pi_2) - asin(); + } + + Vec256 inline operator*(const Vec256& b) const 
{ + //(a + bi) * (c + di) = (ac - bd) + (ad + bc)i + +#if 1 + // this is more vsx friendly than simulating horizontal from x86 + + auto vi = b.el_mergeo(); + auto vr = b.el_mergee(); + vi = vi ^ rsign_mask; + auto ret = elwise_mult(vr); + auto vx_swapped = el_swapped(); + ret = vx_swapped.el_madd(vi, ret); + return ret; + +#else + + auto ac_bd = elwise_mult(b); + auto d_c = b.el_swapped(); + d_c = d_c ^ isign_mask; + auto ad_bc = elwise_mult(d_c); + auto ret = horizontal_sub_permD8(ac_bd, ad_bc); + return ret; +#endif + } + + Vec256 inline operator/(const Vec256& b) const { + // re + im*i = (a + bi) / (c + di) + // re = (ac + bd)/abs_2() + // im = (bc - ad)/abs_2() +#if 1 + auto vi = b.el_mergeo(); + auto vr = b.el_mergee(); + auto abs_b = b.abs_2_(); + vi = vi ^ isign_mask; + auto ret = elwise_mult(vr); + auto vx_swapped = el_swapped(); + ret = vx_swapped.el_madd(vi, ret); + ret = ret.elwise_div(abs_b); +#else + // Vec256 x86 simulation + auto ac_bd = elwise_mult(b); + auto d_c = b.el_swapped(); + d_c = d_c ^ rsign_mask; + auto ad_bc = elwise_mult(d_c); + auto abs_b = b.abs_2_(); + auto re_im = horizontal_add_permD8(ac_bd, ad_bc); + auto ret = re_im.elwise_div(abs_b); +#endif + return ret; + } + + Vec256 asin() const { + // asin(x) + // = -i*ln(iz + sqrt(1 -z^2)) + // = -i*ln((ai - b) + sqrt(1 - (a + bi)*(a + bi))) + // = -i*ln((-b + ai) + sqrt(1 - (a**2 - b**2) - 2*abi)) + +#if 1 + auto conj = conj_(); + auto b_a = conj.el_swapped(); + auto ab = conj.elwise_mult(b_a); + auto im = ab + ab; + auto val_2 = (*this).elwise_mult(*this); + auto val_2_swapped = val_2.el_swapped(); + auto re = horizontal_sub_permD8(val_2, val_2_swapped); + re = Vec256(one) - re; + auto root = el_blend<0xAA>(re, im).sqrt(); + auto ln = (b_a + root).log(); + return ln.el_swapped().conj(); +#else + return map(std::asin); +#endif + } + + Vec256 exp() const { + return map(std::exp); + } + + Vec256 eq(const Vec256& other) const { + auto ret = (*this == other); + return ret & one; + } + Vec256 ne(const Vec256& other) const { + auto ret = (*this != other); + return ret & one; + } + + Vec256 sgn() const { + return map(at::native::sgn_impl); + } + + Vec256 hypot(const Vec256& b) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + + Vec256 nextafter(const Vec256& b) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + + Vec256 igamma(const Vec256& x) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + + Vec256 igammac(const Vec256& x) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + + Vec256 atan2(const Vec256& b) const { + TORCH_CHECK(false,"not supported for complex numbers"); + } + Vec256 erf() const { + TORCH_CHECK(false,"not supported for complex numbers"); + } + Vec256 erfc() const { + TORCH_CHECK(false,"not supported for complex numbers"); + } + + Vec256 log1p() const { + TORCH_CHECK(false,"not supported for complex numbers"); + } + + Vec256 expm1() const { + TORCH_CHECK(false,"not supported for complex numbers"); + } + + Vec256 operator<(const Vec256& other) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + + Vec256 operator<=(const Vec256& other) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + + Vec256 operator>(const Vec256& other) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + + Vec256 operator>=(const Vec256& other) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + + Vec256 lt(const Vec256& other) const { + TORCH_CHECK(false, "not 
supported for complex numbers"); + } + + Vec256 le(const Vec256& other) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + + Vec256 gt(const Vec256& other) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + + Vec256 ge(const Vec256& other) const { + TORCH_CHECK(false, "not supported for complex numbers"); + } + + DEFINE_MEMBER_OP(operator==, ComplexFlt, vec_cmpeq) + DEFINE_MEMBER_OP(operator!=, ComplexFlt, vec_cmpne) + + DEFINE_MEMBER_OP(operator+, ComplexFlt, vec_add) + DEFINE_MEMBER_OP(operator-, ComplexFlt, vec_sub) + DEFINE_MEMBER_OP(operator&, ComplexFlt, vec_and) + DEFINE_MEMBER_OP(operator|, ComplexFlt, vec_or) + DEFINE_MEMBER_OP(operator^, ComplexFlt, vec_xor) + // elelemtwise helpers + DEFINE_MEMBER_OP(elwise_mult, ComplexFlt, vec_mul) + DEFINE_MEMBER_OP(elwise_div, ComplexFlt, vec_div) + DEFINE_MEMBER_OP(elwise_gt, ComplexFlt, vec_cmpgt) + DEFINE_MEMBER_OP(elwise_ge, ComplexFlt, vec_cmpge) + DEFINE_MEMBER_OP(elwise_lt, ComplexFlt, vec_cmplt) + DEFINE_MEMBER_OP(elwise_le, ComplexFlt, vec_cmple) +}; + +template <> +Vec256 inline maximum( + const Vec256& a, + const Vec256& b) { + auto abs_a = a.abs_2_(); + auto abs_b = b.abs_2_(); + // auto mask = _mm256_cmp_ps(abs_a, abs_b, _CMP_LT_OQ); + // auto max = _mm256_blendv_ps(a, b, mask); + auto mask = abs_a.elwise_lt(abs_b); + auto max = Vec256::elwise_blendv(a, b, mask); + + return max; + // Exploit the fact that all-ones is a NaN. + // auto isnan = _mm256_cmp_ps(abs_a, abs_b, _CMP_UNORD_Q); + // return _mm256_or_ps(max, isnan); +} + +template <> +Vec256 inline minimum( + const Vec256& a, + const Vec256& b) { + auto abs_a = a.abs_2_(); + auto abs_b = b.abs_2_(); + // auto mask = _mm256_cmp_ps(abs_a, abs_b, _CMP_GT_OQ); + // auto min = _mm256_blendv_ps(a, b, mask); + auto mask = abs_a.elwise_gt(abs_b); + auto min = Vec256::elwise_blendv(a, b, mask); + return min; + // Exploit the fact that all-ones is a NaN. 
+ // auto isnan = _mm256_cmp_ps(abs_a, abs_b, _CMP_UNORD_Q); + // return _mm256_or_ps(min, isnan); +} + +} // namespace +} // namespace vec256 +} // namespace at diff --git a/aten/src/ATen/cpu/vec256/vsx/vec256_double_vsx.h b/aten/src/ATen/cpu/vec256/vsx/vec256_double_vsx.h new file mode 100644 index 000000000000..f34bdc7bbcb3 --- /dev/null +++ b/aten/src/ATen/cpu/vec256/vsx/vec256_double_vsx.h @@ -0,0 +1,392 @@ +#pragma once + +#include +#include +#include +#include + +namespace at { +namespace vec256 { + +namespace { + + +template <> +class Vec256 { + private: + union { + struct { + vfloat64 _vec0; + vfloat64 _vec1; + }; + struct { + vbool64 _vecb0; + vbool64 _vecb1; + }; + + } __attribute__((__may_alias__)); + + public: + using value_type = double; + using vec_internal_type = vfloat64; + using vec_internal_mask_type = vbool64; + static constexpr int size() { + return 4; + } + Vec256() {} + C10_ALWAYS_INLINE Vec256(vfloat64 v) : _vec0{v}, _vec1{v} {} + C10_ALWAYS_INLINE Vec256(vbool64 vmask) : _vecb0{vmask}, _vecb1{vmask} {} + C10_ALWAYS_INLINE Vec256(vfloat64 v1, vfloat64 v2) : _vec0{v1}, _vec1{v2} {} + C10_ALWAYS_INLINE Vec256(vbool64 v1, vbool64 v2) : _vecb0{v1}, _vecb1{v2} {} + C10_ALWAYS_INLINE Vec256(double scalar) + : _vec0{vec_splats(scalar)}, _vec1{vec_splats(scalar)} {} + C10_ALWAYS_INLINE Vec256( + double scalar1, + double scalar2, + double scalar3, + double scalar4) + : _vec0{vfloat64{scalar1, scalar2}}, _vec1{vfloat64{scalar3, scalar4}} {} + C10_ALWAYS_INLINE const vec_internal_type& vec0() const { + return _vec0; + } + C10_ALWAYS_INLINE const vec_internal_type& vec1() const { + return _vec1; + } + + int zero_mask() const { + auto cmp = (*this == vd_zero); + return (cmp._vecb0[0] & 1) | (cmp._vecb0[1] & 2) | (cmp._vecb1[0] & 4) | + (cmp._vecb1[1] & 8); + } + + template + static std::enable_if_t> C10_ALWAYS_INLINE + blend(const Vec256& a, const Vec256& b) { + return a; + } + + template + static std::enable_if_t> C10_ALWAYS_INLINE + blend(const Vec256& a, const Vec256& b) { + return b; + } + + template + static std::enable_if_t> C10_ALWAYS_INLINE + blend(const Vec256& a, const Vec256& b) { + return { b._vec0, a._vec1 }; + } + + template + static std::enable_if_t> C10_ALWAYS_INLINE + blend(const Vec256& a, const Vec256& b) { + return { a._vec0, b._vec1 }; + } + + + template + static std::enable_if_t> C10_ALWAYS_INLINE + blend(const Vec256& a, const Vec256& b) { + const vbool64 mask_1st = VsxDblMask1(mask); + return { (vfloat64)vec_sel(a._vec0, b._vec0, mask_1st), a._vec1 }; + } + + template + static std::enable_if_t> C10_ALWAYS_INLINE + blend(const Vec256& a, const Vec256& b) { + const vbool64 mask_1st = VsxDblMask1(mask); + return { (vfloat64)vec_sel(a._vec0, b._vec0, mask_1st), b._vec1 }; + } + + + template + static std::enable_if_t> + C10_ALWAYS_INLINE blend(const Vec256& a, const Vec256& b) { + const vbool64 mask_2nd = VsxDblMask2(mask); + // generated masks + return { a._vec0, + (vfloat64)vec_sel(a._vec1, b._vec1, mask_2nd) }; + } + + template + static std::enable_if_t> + C10_ALWAYS_INLINE blend(const Vec256& a, const Vec256& b) { + const vbool64 mask_2nd = VsxDblMask2(mask); + // generated masks + return { b._vec0, + (vfloat64)vec_sel(a._vec1, b._vec1, mask_2nd) }; + } + + template + static std::enable_if_t> + C10_ALWAYS_INLINE blend(const Vec256& a, const Vec256& b) { + const vbool64 mask_1st = VsxDblMask1(mask); + const vbool64 mask_2nd = VsxDblMask2(mask); + return { + (vfloat64)vec_sel(a._vec0, b._vec0, mask_1st), + (vfloat64)vec_sel(a._vec1, b._vec1, mask_2nd) }; + } 
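+ // Note: in blend<mask> each of the four low bits of the compile-time mask picks,
+ // per element, whether the result comes from a (bit clear) or b (bit set);
+ // blendv below makes the same per-element choice from a runtime comparison mask.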
+ + + static Vec256 C10_ALWAYS_INLINE blendv( + const Vec256& a, + const Vec256& b, + const Vec256& mask) { + // the mask used here returned by comparision of vec256 + + return { + vec_sel(a._vec0, b._vec0, mask._vecb0), + vec_sel(a._vec1, b._vec1, mask._vecb1)}; + } + static Vec256 arange(double base = 0., double step = 1.) { + return Vec256(base, base + step, base + 2 * step, base + 3 * step); + } + + static Vec256 C10_ALWAYS_INLINE + set(const Vec256& a, const Vec256& b, size_t count = size()) { + switch (count) { + case 0: + return a; + case 1: + return blend<1>(a, b); + case 2: + return blend<3>(a, b); + case 3: + return blend<7>(a, b); + } + + return b; + } + static Vec256 C10_ALWAYS_INLINE + loadu(const void* ptr, int count = size()) { + if (count == size()) { + return { + vec_vsx_ld(offset0, reinterpret_cast(ptr)), + vec_vsx_ld(offset16, reinterpret_cast(ptr))}; + } + + __at_align32__ value_type tmp_values[size()]; + std::memcpy(tmp_values, ptr, std::min(count, size()) * sizeof(value_type)); + + return {vec_vsx_ld(offset0, tmp_values), vec_vsx_ld(offset16, tmp_values)}; + } + void C10_ALWAYS_INLINE store(void* ptr, int count = size()) const { + if (count == size()) { + vec_vsx_st(_vec0, offset0, reinterpret_cast(ptr)); + vec_vsx_st(_vec1, offset16, reinterpret_cast(ptr)); + } else if (count > 0) { + __at_align32__ value_type tmp_values[size()]; + vec_vsx_st(_vec0, offset0, tmp_values); + vec_vsx_st(_vec1, offset16, tmp_values); + std::memcpy( + ptr, tmp_values, std::min(count, size()) * sizeof(value_type)); + } + } + const double& operator[](int idx) const = delete; + double& operator[](int idx) = delete; + void dump() const { + std::cout << _vec0[0] << "," << _vec0[1] << "," << _vec1[0] << "," << _vec1[1] << std::endl; + } + Vec256 map(double (*f)(double)) const { + Vec256 ret; + for (int i = 0; i < size()/2; i++) { + ret._vec0[i] = f(_vec0[i]); + } + for (int i = 0; i < size()/2; i++) { + ret._vec1[i] = f(_vec1[i]); + } + return ret; + } + + Vec256 mapbi(double (*f)(double, double), const Vec256& other) + const { + Vec256 ret; + for (int i = 0; i < size()/2; i++) { + ret._vec0[i] = f(_vec0[i], other._vec0[i]); + } + for (int i = 0; i < size()/2; i++) { + ret._vec1[i] = f(_vec1[i], other._vec1[i]); + } + return ret; + } + Vec256 C10_ALWAYS_INLINE abs() const { + return {vec_abs(_vec0), vec_abs(_vec1)}; + } + + Vec256 C10_ALWAYS_INLINE acos() const { + return {Sleef_acosd2_u10vsx(_vec0), Sleef_acosd2_u10vsx(_vec1)}; + } + Vec256 C10_ALWAYS_INLINE asin() const { + return {Sleef_asind2_u10vsx(_vec0), Sleef_asind2_u10vsx(_vec1)}; + } + Vec256 atan() const { + return {Sleef_atand2_u10vsx(_vec0), Sleef_atand2_u10vsx(_vec1)}; + } + Vec256 atan2(const Vec256& b) const { + return {Sleef_atan2d2_u10vsx(_vec0, b._vec0), Sleef_atan2d2_u10vsx(_vec1, b._vec1)}; + } + Vec256 erf() const { + return {Sleef_erfd2_u10vsx(_vec0), Sleef_erfd2_u10vsx(_vec1)}; + } + Vec256 erfc() const { + return {Sleef_erfcd2_u15vsx(_vec0), Sleef_erfcd2_u15vsx(_vec1)}; + } + Vec256 C10_ALWAYS_INLINE exp() const { + return {Sleef_expd2_u10vsx(_vec0), Sleef_expd2_u10vsx(_vec1)}; + } + Vec256 expm1() const { + return {Sleef_expm1d2_u10vsx(_vec0), Sleef_expm1d2_u10vsx(_vec1)}; + } + + Vec256 lgamma() const __ubsan_ignore_undefined__ { + return {Sleef_lgammad2_u10vsx(_vec0), Sleef_lgammad2_u10vsx(_vec1)}; + } + + Vec256 erfinv() const { + return map(calc_erfinv); + } + + Vec256 angle() const { + return Vec256{0}; + } + Vec256 real() const { + return *this; + } + Vec256 imag() const { + return Vec256{0}; + } + Vec256 conj() 
const { + return *this; + } + + Vec256 C10_ALWAYS_INLINE log() const { + return {Sleef_logd2_u10vsx(_vec0), Sleef_logd2_u10vsx(_vec1)}; + } + Vec256 C10_ALWAYS_INLINE log10() const { + return {Sleef_log10d2_u10vsx(_vec0), Sleef_log10d2_u10vsx(_vec1)}; + } + Vec256 C10_ALWAYS_INLINE log1p() const { + return {Sleef_log1pd2_u10vsx(_vec0), Sleef_log1pd2_u10vsx(_vec1)}; + } + Vec256 C10_ALWAYS_INLINE log2() const { + return {Sleef_log2d2_u10vsx(_vec0), Sleef_log2d2_u10vsx(_vec1)}; + } + Vec256 C10_ALWAYS_INLINE ceil() const { + return {vec_ceil(_vec0), vec_ceil(_vec1)}; + } + Vec256 C10_ALWAYS_INLINE cos() const { + return {Sleef_cosd2_u10vsx(_vec0), Sleef_cosd2_u10vsx(_vec1)}; + } + Vec256 C10_ALWAYS_INLINE cosh() const { + return {Sleef_coshd2_u10vsx(_vec0), Sleef_coshd2_u10vsx(_vec1)}; + } + Vec256 C10_ALWAYS_INLINE floor() const { + return {vec_floor(_vec0), vec_floor(_vec1)}; + } + Vec256 C10_ALWAYS_INLINE neg() const { + return {vec_neg(_vec0), vec_neg(_vec1)}; + } + Vec256 C10_ALWAYS_INLINE round() const { + return {vec_rint(_vec0), vec_rint(_vec1)}; + } + Vec256 C10_ALWAYS_INLINE sin() const { + return {Sleef_sind2_u10vsx(_vec0), Sleef_sind2_u10vsx(_vec1)}; + } + Vec256 C10_ALWAYS_INLINE sinh() const { + return {Sleef_sinhd2_u10vsx(_vec0), Sleef_sinhd2_u10vsx(_vec1)}; + } + Vec256 C10_ALWAYS_INLINE tan() const { + return {Sleef_tand2_u10vsx(_vec0), Sleef_tand2_u10vsx(_vec1)}; + } + Vec256 C10_ALWAYS_INLINE tanh() const { + return {Sleef_tanhd2_u10vsx(_vec0), Sleef_tanhd2_u10vsx(_vec1)}; + } + Vec256 C10_ALWAYS_INLINE trunc() const { + return {vec_trunc(_vec0), vec_trunc(_vec1)}; + } + + Vec256 C10_ALWAYS_INLINE frac() const { + return *this - trunc(); + } + + Vec256 C10_ALWAYS_INLINE sqrt() const { + return {vec_sqrt(_vec0), vec_sqrt(_vec1)}; + } + Vec256 C10_ALWAYS_INLINE reciprocal() const { + return { + vec_div(vd_one, _vec0), // vec_re(_vec0) is estimated one. 
+ vec_div(vd_one, _vec1)}; + } + Vec256 C10_ALWAYS_INLINE rsqrt() const { + return sqrt().reciprocal(); + } + + Vec256 C10_ALWAYS_INLINE pow(const Vec256& b) const { + return {Sleef_powd2_u10vsx(_vec0, b._vec0), Sleef_powd2_u10vsx(_vec1, b._vec1)}; + } + Vec256 C10_ALWAYS_INLINE fmod(const Vec256& b) const { + return {Sleef_fmodd2_vsx(_vec0, b._vec0),Sleef_fmodd2_vsx(_vec1, b._vec1)}; + } + + Vec256 hypot(const Vec256& b) const { + return {Sleef_hypotd2_u05vsx(_vec0, b._vec0), Sleef_hypotd2_u05vsx(_vec1, b._vec1)}; + } + + Vec256 nextafter(const Vec256& b) const { + return {Sleef_nextafterd2_vsx(_vec0, b._vec0), Sleef_nextafterd2_vsx(_vec1, b._vec1)}; + } + + Vec256 igamma(const Vec256& x) const { + return mapbi(calc_igamma, x); + } + + Vec256 igammac(const Vec256& x) const { + return mapbi(calc_igammac, x); + } + + + Vec256 i0() const { + return map(calc_i0); + } + + DEFINE_MEMBER_OP(operator==, double, vec_cmpeq) + DEFINE_MEMBER_OP(operator!=, double, vec_cmpne) + DEFINE_MEMBER_OP(operator<, double, vec_cmplt) + DEFINE_MEMBER_OP(operator<=, double, vec_cmple) + DEFINE_MEMBER_OP(operator>, double, vec_cmpgt) + DEFINE_MEMBER_OP(operator>=, double, vec_cmpge) + DEFINE_MEMBER_OP_AND_ONE(eq, double, vec_cmpeq) + DEFINE_MEMBER_OP_AND_ONE(ne, double, vec_cmpne) + DEFINE_MEMBER_OP_AND_ONE(lt, double, vec_cmplt) + DEFINE_MEMBER_OP_AND_ONE(le, double, vec_cmple) + DEFINE_MEMBER_OP_AND_ONE(gt, double, vec_cmpgt) + DEFINE_MEMBER_OP_AND_ONE(ge, double, vec_cmpge) + DEFINE_MEMBER_OP(operator+, double, vec_add) + DEFINE_MEMBER_OP(operator-, double, vec_sub) + DEFINE_MEMBER_OP(operator*, double, vec_mul) + DEFINE_MEMBER_OP(operator/, double, vec_div) + DEFINE_MEMBER_OP(maximum, double, vec_max) + DEFINE_MEMBER_OP(minimum, double, vec_min) + DEFINE_MEMBER_OP(operator&, double, vec_and) + DEFINE_MEMBER_OP(operator|, double, vec_or) + DEFINE_MEMBER_OP(operator^, double, vec_xor) + DEFINE_MEMBER_TERNARY_OP(madd, double, vec_madd) +}; +template <> +Vec256 inline maximum( + const Vec256& a, + const Vec256& b) { + return a.maximum(b); +} + +template <> +Vec256 inline minimum( + const Vec256& a, + const Vec256& b) { + return a.minimum(b); +} +} // namespace +} // namespace vec256 +} // namespace at diff --git a/aten/src/ATen/cpu/vec256/vsx/vec256_float_vsx.h b/aten/src/ATen/cpu/vec256/vsx/vec256_float_vsx.h new file mode 100644 index 000000000000..2a1a87aa72c8 --- /dev/null +++ b/aten/src/ATen/cpu/vec256/vsx/vec256_float_vsx.h @@ -0,0 +1,676 @@ +#pragma once + +#include +#include +#include +#include +namespace at { +namespace vec256 { +// See Note [Acceptable use of anonymous namespace in header] + +namespace { + +template <> +class Vec256 { + private: + union { + struct { + vfloat32 _vec0; + vfloat32 _vec1; + }; + struct { + vbool32 _vecb0; + vbool32 _vecb1; + }; + + } __attribute__((__may_alias__)); + + public: + using value_type = float; + using vec_internal_type = vfloat32; + using vec_internal_mask_type = vbool32; + + static constexpr int size() { + return 8; + } + Vec256() {} + + C10_ALWAYS_INLINE Vec256(vfloat32 v) : _vec0{v}, _vec1{v} {} + C10_ALWAYS_INLINE Vec256(vbool32 vmask) : _vecb0{vmask}, _vecb1{vmask} {} + C10_ALWAYS_INLINE Vec256(vfloat32 v1, vfloat32 v2) : _vec0{v1}, _vec1{v2} {} + C10_ALWAYS_INLINE Vec256(vbool32 v1, vbool32 v2) : _vecb0{v1}, _vecb1{v2} {} + C10_ALWAYS_INLINE Vec256(float scalar) + : _vec0{vec_splats(scalar)}, _vec1{vec_splats(scalar)} {} + C10_ALWAYS_INLINE Vec256( + float scalar1, + float scalar2, + float scalar3, + float scalar4, + float scalar5, + float scalar6, + float 
scalar7, + float scalar8) + : _vec0{vfloat32{scalar1, scalar2, scalar3, scalar4}}, + _vec1{vfloat32{scalar5, scalar6, scalar7, scalar8}} {} + C10_ALWAYS_INLINE const vec_internal_type& vec0() const { + return _vec0; + } + C10_ALWAYS_INLINE const vec_internal_type& vec1() const { + return _vec1; + } + + template + static std::enable_if_t> C10_ALWAYS_INLINE + blend(const Vec256& a, const Vec256& b) { + return a; + } + + template + static std::enable_if_t> C10_ALWAYS_INLINE + blend(const Vec256& a, const Vec256& b) { + return b; + } + + template + static std::enable_if_t> C10_ALWAYS_INLINE + blend(const Vec256& a, const Vec256& b) { + return {b._vec0, a._vec1}; + } + + template + static std::enable_if_t> C10_ALWAYS_INLINE + blend(const Vec256& a, const Vec256& b) { + return {a._vec0, b._vec1}; + } + + template + static std::enable_if_t> C10_ALWAYS_INLINE + blend(const Vec256& a, const Vec256& b) { + const vbool32 mask_1st = VsxMask1(mask); + return {(vfloat32)vec_sel(a._vec0, b._vec0, mask_1st), a._vec1}; + } + + template + static std::enable_if_t> C10_ALWAYS_INLINE + blend(const Vec256& a, const Vec256& b) { + const vbool32 mask_1st = VsxMask1(mask); + return {(vfloat32)vec_sel(a._vec0, b._vec0, mask_1st), b._vec1}; + } + + template + static std::enable_if_t> C10_ALWAYS_INLINE + blend(const Vec256& a, const Vec256& b) { + const vbool32 mask_2nd = VsxMask2(mask); + // generated masks + return {a._vec0, (vfloat32)vec_sel(a._vec1, b._vec1, mask_2nd)}; + } + + template + static std::enable_if_t> C10_ALWAYS_INLINE + blend(const Vec256& a, const Vec256& b) { + const vbool32 mask_2nd = VsxMask2(mask); + // generated masks + return {b._vec0, (vfloat32)vec_sel(a._vec1, b._vec1, mask_2nd)}; + } + + template + static std::enable_if_t> C10_ALWAYS_INLINE + blend(const Vec256& a, const Vec256& b) { + const vbool32 mask_1st = VsxMask1(mask); + const vbool32 mask_2nd = VsxMask2(mask); + return { + (vfloat32)vec_sel(a._vec0, b._vec0, mask_1st), + (vfloat32)vec_sel(a._vec1, b._vec1, mask_2nd)}; + } + + static Vec256 C10_ALWAYS_INLINE blendv( + const Vec256& a, + const Vec256& b, + const Vec256& mask) { + // the mask used here returned by comparision of vec256 + // assuming this we can use the same mask directly with vec_sel + return { + vec_sel(a._vec0, b._vec0, mask._vecb0), + vec_sel(a._vec1, b._vec1, mask._vecb1)}; + } + + static Vec256 arange(float base = 0.f, float step = 1.f) { + return Vec256( + base, + base + step, + base + 2 * step, + base + 3 * step, + base + 4 * step, + base + 5 * step, + base + 6 * step, + base + 7 * step); + } + static Vec256 set( + const Vec256& a, + const Vec256& b, + size_t count = size()) { + switch (count) { + case 0: + return a; + case 1: + return blend<1>(a, b); + case 2: + return blend<3>(a, b); + case 3: + return blend<7>(a, b); + case 4: + return blend<15>(a, b); + case 5: + return blend<31>(a, b); + case 6: + return blend<63>(a, b); + case 7: + return blend<127>(a, b); + } + + return b; + } + static Vec256 C10_ALWAYS_INLINE + loadu(const void* ptr, int count = size()) { + if (count == size()) { + return { + vec_vsx_ld(offset0, reinterpret_cast(ptr)), + vec_vsx_ld(offset16, reinterpret_cast(ptr))}; + } + + __at_align32__ value_type tmp_values[size()]; + std::memcpy(tmp_values, ptr, std::min(count, size()) * sizeof(value_type)); + + return {vec_vsx_ld(offset0, tmp_values), vec_vsx_ld(offset16, tmp_values)}; + } + void C10_ALWAYS_INLINE store(void* ptr, int count = size()) const { + if (count == size()) { + vec_vsx_st(_vec0, offset0, reinterpret_cast(ptr)); + 
vec_vsx_st(_vec1, offset16, reinterpret_cast(ptr)); + } else if (count > 0) { + __at_align32__ value_type tmp_values[size()]; + vec_vsx_st(_vec0, offset0, tmp_values); + vec_vsx_st(_vec1, offset16, tmp_values); + std::memcpy( + ptr, tmp_values, std::min(count, size()) * sizeof(value_type)); + } + } + + const float& operator[](int idx) const = delete; + float& operator[](int idx) = delete; + + Vec256 map(float (*f)(float)) const { + Vec256 ret; + for (int i = 0; i < size() / 2; i++) { + ret._vec0[i] = f(_vec0[i]); + } + for (int i = 0; i < size() / 2; i++) { + ret._vec1[i] = f(_vec1[i]); + } + return ret; + } + + Vec256 mapbi(float (*f)(float, float), const Vec256& other) + const { + Vec256 ret; + for (int i = 0; i < size() / 2; i++) { + ret._vec0[i] = f(_vec0[i], other._vec0[i]); + } + for (int i = 0; i < size() / 2; i++) { + ret._vec1[i] = f(_vec1[i], other._vec1[i]); + } + return ret; + } + + Vec256 _nor() const { + return {vec_nor(_vec0, _vec0), vec_nor(_vec1, _vec1)}; + } + + Vec256 _isnan() const { + auto x = *this; + auto ret = (x == x); + return ret._nor(); + } + + Vec256 _isinf() const { + auto x = *this; + return (x == v_inf) | (x == v_minus_inf); + } + + int zero_mask() const { + // returns an integer mask where all zero elements are translated to 1-bit + // and others are translated to 0-bit + //__m256 cmp = _mm256_cmp_ps(values, _mm256_set1_ps(0.0f), _CMP_EQ_OQ); + auto cmp = (*this == zero); + // return _mm256_movemask_ps(cmp); + // possible simulation //mask= lvsl ( 0 ) vbpermq( vec, mask <<5) + vuint64 result0 = vec_vbpermq((vuint8)cmp._vecb0, mask_zero_bits); + vuint64 result1 = vec_vbpermq((vuint8)cmp._vecb1, mask_zero_bits); + return (result0[1] >> 12 | (result1[1] >> 8)); + } + + Vec256 C10_ALWAYS_INLINE abs() const { + return {vec_abs(_vec0), vec_abs(_vec1)}; + } + + Vec256 C10_ALWAYS_INLINE acos() const { + return {Sleef_acosf4_u10vsx(_vec0), Sleef_acosf4_u10vsx(_vec1)}; + } + Vec256 C10_ALWAYS_INLINE asin() const { + return {Sleef_asinf4_u10vsx(_vec0), Sleef_asinf4_u10vsx(_vec1)}; + } + Vec256 atan() const { + return {Sleef_atanf4_u10vsx(_vec0), Sleef_atanf4_u10vsx(_vec1)}; + } + Vec256 atan2(const Vec256& b) const { + return {Sleef_atan2f4_u10vsx(_vec0, b._vec0), Sleef_atan2f4_u10vsx(_vec1, b._vec1)}; + } + + Vec256 lgamma() const { + return {Sleef_lgammaf4_u10vsx(_vec0), Sleef_lgammaf4_u10vsx(_vec1)}; + } + Vec256 erf() const { + return {Sleef_erff4_u10vsx(_vec0), Sleef_erff4_u10vsx(_vec1)}; + } + + Vec256 erfc() const { + return {Sleef_erfcf4_u15vsx(_vec0), Sleef_erfcf4_u15vsx(_vec1)}; + } + + Vec256 erfinv() const { + return map(calc_erfinv); + } + + Vec256 angle() const { + return Vec256{0}; + } + Vec256 real() const { + return *this; + } + Vec256 imag() const { + return Vec256{0}; + } + Vec256 conj() const { + return *this; + } + + Vec256 C10_ALWAYS_INLINE exp() const { + // implementation logic from avx_mathfun with some modifications from sleef + // Express e**x = e**g 2**n + /// = e**g e**( n loge(2) ) + /// = e**( g + n loge(2) ) + // + auto tmp_x = *this; + auto fx = (tmp_x * log2e_inv).round(); + + auto x = fx.madd(negln2f_hi, tmp_x); + x = fx.madd(negln2f_lo, x); + auto z = x * x; + auto y = x.madd(exp_p0, exp_p1); + y = y.madd(x, exp_p2); + y = y.madd(x, exp_p3); + y = y.madd(x, exp_p4); + y = y.madd(x, exp_p5); + y = y.madd(z, x) + one; + + // vm_pow2n 2^n + vint32 imm0 = vec_signed(fx._vec0); + vint32 imm1 = vec_signed(fx._vec1); + // this pow2n logic is from Sleef code + vint32 imm00 = imm0 >> 1; //>>1 + vint32 imm01 = imm1 >> 1; + vint32 imm10 = imm0 
- imm00; + vint32 imm11 = imm1 - imm01; + imm00 = (imm00 + v0x7f) << vu_23; + imm01 = (imm01 + v0x7f) << vu_23; + imm10 = (imm10 + v0x7f) << vu_23; + imm11 = (imm11 + v0x7f) << vu_23; + // treat imm as float vector without conversion + + y._vec0 = (y._vec0 * (vfloat32)imm00) * (vfloat32)imm10; + y._vec1 = (y._vec1 * (vfloat32)imm01) * (vfloat32)imm11; + // boundary check + auto tmp = blendv(y, v_inf, (Vec256(exp_hi) <= tmp_x)); + y = blendv(tmp, zero, (tmp_x < Vec256(exp_lo))); + + return y; + } + Vec256 expm1() const { + return exp() - one; + } + + Vec256 C10_ALWAYS_INLINE log() const { + auto temp = *this; + auto invalid_mask = temp < zero; + // cut off denormalized stuff + auto x = temp.maximum(min_norm_pos); + vint32 imm0 = vec_sr(vint32(x._vec0), vu_23); + vint32 imm1 = vec_sr(vint32(x._vec1), vu_23); + // keep only the fractional part + x = x & inv_mant_mask; + x = x | half; + imm0 = imm0 - v0x7f; + imm1 = imm1 - v0x7f; + Vec256 ex; + ex._vec0 = vec_float(imm0); + ex._vec1 = vec_float(imm1); + ex = ex + one; + auto mask = x < cephes_SQRTHF; + auto t = x & mask; + x = x - one; + ex = ex - (mask & one); + x = x + t; + auto z = x * x; + auto y = x.madd(log_p0, log_p1); + y = y.madd(x, log_p2); + y = y.madd(x, log_p3); + y = y.madd(x, log_p4); + y = y.madd(x, log_p5); + y = y.madd(x, log_p6); + y = y.madd(x, log_p7); + y = y.madd(x, log_p8); + y = y * x * z; + y = ex.madd(log_q1, y); + y = y - z * half; + x = x + y; + x = ex.madd(log_q2, x); + // negative arg will be NAN + x = blendv(x, v_nan, invalid_mask); + // zero is -inf + x = blendv(x, min_inf, (temp == zero)); + return x; + } + Vec256 C10_ALWAYS_INLINE log10() const { + return log() * log10e_inv; + } + Vec256 C10_ALWAYS_INLINE log1p() const { + return ((*this) + one).log(); + } + Vec256 C10_ALWAYS_INLINE log2() const { + return log() * log2e_inv; + } + Vec256 C10_ALWAYS_INLINE ceil() const { + return {vec_ceil(_vec0), vec_ceil(_vec1)}; + } + Vec256 C10_ALWAYS_INLINE cos() const { + // take the absolute value + auto x = abs(); + // extract the sign bit (upper one) + auto sign_bit = (*this) & sign_mask; + // scale by 4/Pi + auto y = x * _4div_pi; + // store the integer part of y in mm0 + // j=(j+1) & (~1) (see the cephes sources) + vint32 imm0 = (vec_signed(y._vec0) + vi_1) & vi_inv1; + vint32 imm1 = (vec_signed(y._vec1) + vi_1) & vi_inv1; + y._vec0 = vec_float(imm0); + y._vec1 = vec_float(imm1); + + imm0 = imm0 - vi_2; + imm1 = imm1 - vi_2; + Vec256 poly_mask; + // get the swap sign flag + vint32 tmp0 = vec_and(vec_nand(imm0, imm0), vi_4); + vint32 tmp1 = vec_and(vec_nand(imm1, imm1), vi_4); + sign_bit._vecb0 = (vbool32)vec_sl(tmp0, vu_29); + sign_bit._vecb1 = (vbool32)vec_sl(tmp1, vu_29); + // get the polynom selection mask + // there is one polynom for 0 <= x <= Pi / 4 + // and another one for Pi / 4 < x <= Pi / 2 + // Both branches will be computed. 
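+    // poly_mask (computed below) is true for lanes whose adjusted quadrant
+    // index has bit 1 clear; those lanes take the sine-type polynomial (y_2)
+    // in the final blendv, the remaining lanes take the cosine-type one (y).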
+
+    poly_mask._vecb0 = (vbool32)vec_cmpeq((imm0 & vi_2), vi_0);
+    poly_mask._vecb1 = (vbool32)vec_cmpeq((imm1 & vi_2), vi_0);
+
+    // The magic pass: "Extended precision modular arithmetic"
+    // x = ((x - y * DP1) - y * DP2) - y * DP3;
+    x = y.madd(minus_cephes_dp1, x);
+    x = y.madd(minus_cephes_dp2, x);
+    x = y.madd(minus_cephes_dp3, x);
+
+    // Evaluate the first polynomial (0 <= x <= Pi/4)
+    auto z = x * x;
+    y = z.madd(coscof_p0, coscof_p1);
+    y = y.madd(z, coscof_p2);
+    y = y * z * z;
+    y = y - z * half + one;
+
+    // Evaluate the second polynomial (Pi/4 < x <= Pi/2)
+    auto y_2 = z.madd(sincof_p0, sincof_p1);
+    y_2 = y_2.madd(z, sincof_p2);
+    y_2 = y_2 * z;
+    y_2 = y_2.madd(x, x);
+
+    // select the correct result from the two polynomials
+    y = blendv(y, y_2, poly_mask);
+    // update the sign
+    y = y ^ sign_bit;
+
+    return y;
+  }
+  Vec256<float> C10_ALWAYS_INLINE cosh() const {
+    // cosh = 1/2 * (e^x + e^-x)
+    auto x = abs();
+    auto e_x = x.exp();
+    auto ret = (e_x + Vec256<float>(one) / e_x) * half;
+    // inf and nan checks
+#if 0
+    ret = blendv(ret, v_inf, x >= vf_89);
+    ret = blendv(ret, v_inf, ret._isnan());
+    ret = blendv(ret, v_nan, this->_isnan());
+#endif
+    return ret;
+  }
+  Vec256<float> C10_ALWAYS_INLINE floor() const {
+    return {vec_floor(_vec0), vec_floor(_vec1)};
+  }
+  Vec256<float> C10_ALWAYS_INLINE neg() const {
+    return {vec_neg(_vec0), vec_neg(_vec1)};
+  }
+
+  void dump() const {
+    std::cout << _vec0[0] << "," << _vec0[1] << "," << _vec0[2] << ","
+              << _vec0[3] << ",";
+    std::cout << _vec1[0] << "," << _vec1[1] << "," << _vec1[2] << ","
+              << _vec1[3] << std::endl;
+  }
+
+  Vec256<float> C10_ALWAYS_INLINE round() const {
+    return {vec_round(_vec0), vec_round(_vec1)};
+  }
+  Vec256<float> C10_ALWAYS_INLINE sin() const {
+    // take the absolute value and extract the sign
+    auto x = abs();
+    auto sign_bit = (*this) & sign_mask;
+
+    // scale by 4/Pi
+    auto y = x * _4div_pi;
+    // store the integer part of y in mm0
+
+    // j=(j+1) & (~1) (see the cephes sources)
+    vint32 imm0 = (vec_signed(y._vec0) + vi_1) & vi_inv1;
+    vint32 imm1 = (vec_signed(y._vec1) + vi_1) & vi_inv1;
+    y._vec0 = vec_float(imm0);
+    y._vec1 = vec_float(imm1);
+    // get the swap sign flag
+    Vec256<float> swap_sign_bit, poly_mask;
+    swap_sign_bit._vecb0 = (vbool32)vec_sl(imm0 & vi_4, vu_29);
+    swap_sign_bit._vecb1 = (vbool32)vec_sl(imm1 & vi_4, vu_29);
+    // get the polynomial selection mask
+    // there is one polynomial for 0 <= x <= Pi/4
+    // and another one for Pi/4 C10_ALWAYS_INLINE sinh() const {
+    auto temp_abs = abs();
+    // get exponent
+    auto ret = temp_abs.exp();
+    auto recp = Vec256<float>(half) / ret;
+    auto v = ret * half - recp;
+    // extract the sign bit (upper one)
+    auto sign_bit = (*this) & sign_mask;
+    auto z = temp_abs * temp_abs;
+    auto y = z.madd(p0, p1);
+    y = y.madd(z, p2);
+    y = (y * z).madd(temp_abs, temp_abs);
+    // check and select
+    auto result = blendv(y, v, temp_abs > one);
+    return result | sign_bit;
+  }
+  Vec256<float> C10_ALWAYS_INLINE tan() const {
+    return {Sleef_tanf4_u10vsx(_vec0), Sleef_tanf4_u10vsx(_vec1)};
+  }
+  Vec256<float> C10_ALWAYS_INLINE tanh() const {
+    auto x = *this;
+    auto vabs = abs();
+    // get exponent
+    auto exp2x = (vabs + vabs).exp();
+    auto vv = Vec256<float>(one) - Vec256<float>(two) / (exp2x + one);
+    // extract the sign bit (upper one)
+    auto sign_bit = (*this) & sign_mask;
+    auto z = vabs * vabs;
+    auto y = z.madd(tanh_p0, tanh_p1);
+    auto tmp = y.madd(z, tanh_p2);
+    y = z.madd(tmp, tanh_p3);
+    tmp = y.madd(z, tanh_p4);
+    y = tmp * z;
+    tmp = y.madd(x, x);
+    // add sign
+    vv = vv | sign_bit;
+    // check and select
+    auto sel_mask = vabs >= tanh_0p625;
+    auto max_mask
= vabs > tanh_half_max; + auto max_ret = sign_bit ^ one; + return blendv(blendv(tmp, vv, sel_mask), max_ret, max_mask); + } + Vec256 C10_ALWAYS_INLINE trunc() const { + return {vec_trunc(_vec0), vec_trunc(_vec1)}; + } + + Vec256 C10_ALWAYS_INLINE frac() const { + return *this - trunc(); + } + + Vec256 C10_ALWAYS_INLINE sqrt() const { + return {vec_sqrt(_vec0), vec_sqrt(_vec1)}; + } + Vec256 C10_ALWAYS_INLINE reciprocal() const { + return Vec256(one) / (*this); + } + Vec256 C10_ALWAYS_INLINE rsqrt() const { + return sqrt().reciprocal(); + } + + Vec256 C10_ALWAYS_INLINE pow(const Vec256& exp) const { + auto x = *this; + auto sign_bit = (*this) & sign_mask; + // |b| + auto exp_abs = exp.abs(); + auto exp_trunc = exp.trunc(); + Vec256 odd_mask; + odd_mask._vecb0 = (vec_signed(exp._vec0) & vi_1) != vi_0; + odd_mask._vecb1 = (vec_signed(exp._vec1) & vi_1) != vi_0; + // using ln fuction + auto temp = (abs().log() * exp).exp(); + + // is odd or even check from Sleef + auto is_int = (exp == exp_trunc) | (exp_abs >= vcheck); + auto is_odd = odd_mask & is_int & (exp_abs < vcheck); + // if even then then pow result should be absolute + auto temp_sign = temp | sign_bit; // copy_sign + auto out = blendv(temp, temp_sign, is_odd); + // x<0 and y != N, then NAN + auto out1 = blendv(out, v_nan, ((exp.floor() != exp) & (x < zero))); + // y = 0 then 1 + return blendv(out1, one, (exp_abs == zero)); + } + + Vec256 fmod(const Vec256& b) const { + return {Sleef_fmodf4_vsx(_vec0, b._vec0),Sleef_fmodf4_vsx(_vec1, b._vec1)}; + } + + Vec256 hypot(const Vec256& b) const { + return {Sleef_hypotf4_u05vsx(_vec0, b._vec0), Sleef_hypotf4_u05vsx(_vec1, b._vec1)}; + } + + Vec256 nextafter(const Vec256& b) const { + return {Sleef_nextafterf4_vsx(_vec0, b._vec0), Sleef_nextafterf4_vsx(_vec1, b._vec1)}; + } + + Vec256 igamma(const Vec256& x) const { + return mapbi(calc_igamma, x); + } + + Vec256 igammac(const Vec256& x) const { + return mapbi(calc_igammac, x); + } + + Vec256 i0() const { + return map(calc_i0); + } + + DEFINE_MEMBER_OP(operator==, float, vec_cmpeq) + DEFINE_MEMBER_OP(operator!=, float, vec_cmpne) + DEFINE_MEMBER_OP(operator<, float, vec_cmplt) + DEFINE_MEMBER_OP(operator<=, float, vec_cmple) + DEFINE_MEMBER_OP(operator>, float, vec_cmpgt) + DEFINE_MEMBER_OP(operator>=, float, vec_cmpge) + DEFINE_MEMBER_OP_AND_ONE(eq, float, vec_cmpeq) + DEFINE_MEMBER_OP_AND_ONE(ne, float, vec_cmpne) + DEFINE_MEMBER_OP_AND_ONE(lt, float, vec_cmplt) + DEFINE_MEMBER_OP_AND_ONE(le, float, vec_cmple) + DEFINE_MEMBER_OP_AND_ONE(gt, float, vec_cmpgt) + DEFINE_MEMBER_OP_AND_ONE(ge, float, vec_cmpge) + DEFINE_MEMBER_OP(operator+, float, vec_add) + DEFINE_MEMBER_OP(operator-, float, vec_sub) + DEFINE_MEMBER_OP(operator*, float, vec_mul) + DEFINE_MEMBER_OP(operator/, float, vec_div) + DEFINE_MEMBER_OP(maximum, float, vec_max) + DEFINE_MEMBER_OP(minimum, float, vec_min) + DEFINE_MEMBER_OP(operator&, float, vec_and) + DEFINE_MEMBER_OP(operator|, float, vec_or) + DEFINE_MEMBER_OP(operator^, float, vec_xor) + DEFINE_MEMBER_TERNARY_OP(madd, float, vec_madd) +}; + +template <> +Vec256 inline maximum(const Vec256& a, const Vec256& b) { + return a.maximum(b); +} + +template <> +Vec256 inline minimum(const Vec256& a, const Vec256& b) { + return a.minimum(b); +} + +} // namespace +} // namespace vec256 +} // namespace at diff --git a/aten/src/ATen/cpu/vec256/vsx/vec256_int16_vsx.h b/aten/src/ATen/cpu/vec256/vsx/vec256_int16_vsx.h new file mode 100644 index 000000000000..33460abe2a58 --- /dev/null +++ b/aten/src/ATen/cpu/vec256/vsx/vec256_int16_vsx.h 
@@ -0,0 +1,351 @@ +#pragma once + +#include +#include +#include +namespace at { +namespace vec256 { +// See Note [Acceptable use of anonymous namespace in header] +namespace { + +template <> +class Vec256 { + private: + union { + struct { + vint16 _vec0; + vint16 _vec1; + }; + struct { + vbool16 _vecb0; + vbool16 _vecb1; + }; + + } __attribute__((__may_alias__)); + + public: + using value_type = int16_t; + using vec_internal_type = vint16; + using vec_internal_mask_type = vbool16; + static constexpr int size() { + return 16; + } + Vec256() {} + C10_ALWAYS_INLINE Vec256(vint16 v) : _vec0{v}, _vec1{v} {} + C10_ALWAYS_INLINE Vec256(vbool16 vmask) : _vecb0{vmask}, _vecb1{vmask} {} + C10_ALWAYS_INLINE Vec256(vint16 v1, vint16 v2) : _vec0{v1}, _vec1{v2} {} + C10_ALWAYS_INLINE Vec256(vbool16 v1, vbool16 v2) : _vecb0{v1}, _vecb1{v2} {} + C10_ALWAYS_INLINE Vec256(int16_t scalar) + : _vec0{vec_splats(scalar)}, _vec1{vec_splats(scalar)} {} + + C10_ALWAYS_INLINE Vec256( + int16_t scalar1, + int16_t scalar2, + int16_t scalar3, + int16_t scalar4, + int16_t scalar5, + int16_t scalar6, + int16_t scalar7, + int16_t scalar8, + int16_t scalar9, + int16_t scalar10, + int16_t scalar11, + int16_t scalar12, + int16_t scalar13, + int16_t scalar14, + int16_t scalar15, + int16_t scalar16) + : _vec0{vint16{ + scalar1, + scalar2, + scalar3, + scalar4, + scalar5, + scalar6, + scalar7, + scalar8}}, + _vec1{vint16{ + scalar9, + scalar10, + scalar11, + scalar12, + scalar13, + scalar14, + scalar15, + scalar16}} {} + C10_ALWAYS_INLINE const vec_internal_type& vec0() const { + return _vec0; + } + C10_ALWAYS_INLINE const vec_internal_type& vec1() const { + return _vec1; + } + + template + static std::enable_if_t> C10_ALWAYS_INLINE + blend(const Vec256& a, const Vec256& b) { + return a; + } + + template + static std::enable_if_t<(mask & 65535) == 65535, Vec256> + C10_ALWAYS_INLINE blend(const Vec256& a, const Vec256& b) { + return b; + } + + template + static std::enable_if_t> C10_ALWAYS_INLINE + blend(const Vec256& a, const Vec256& b) { + return {b._vec0, a._vec1}; + } + + template + static std::enable_if_t<(mask > 0 && mask < 255), Vec256> + C10_ALWAYS_INLINE blend(const Vec256& a, const Vec256& b) { + constexpr int16_t g0 = (mask & 1) * 0xffff; + constexpr int16_t g1 = ((mask & 2) >> 1) * 0xffff; + constexpr int16_t g2 = ((mask & 4) >> 2) * 0xffff; + constexpr int16_t g3 = ((mask & 8) >> 3) * 0xffff; + constexpr int16_t g4 = ((mask & 16) >> 4) * 0xffff; + constexpr int16_t g5 = ((mask & 32) >> 5) * 0xffff; + constexpr int16_t g6 = ((mask & 64) >> 6) * 0xffff; + constexpr int16_t g7 = ((mask & 128) >> 7) * 0xffff; + const vint16 mask_1st = vint16{g0, g1, g2, g3, g4, g5, g6, g7}; + + return {(vint16)vec_sel(a._vec0, b._vec0, (vbool16)mask_1st), a._vec1}; + } + + template + static std::enable_if_t< + (mask > 255 && (mask & 65535) != 65535 && ((mask & 255) == 255)), + Vec256> + C10_ALWAYS_INLINE blend(const Vec256& a, const Vec256& b) { + constexpr int16_t g0_2 = (mask & 1) * 0xffff; + constexpr int16_t g1_2 = ((mask & 2) >> 1) * 0xffff; + constexpr int16_t g2_2 = ((mask & 4) >> 2) * 0xffff; + constexpr int16_t g3_2 = ((mask & 8) >> 3) * 0xffff; + constexpr int16_t g4_2 = ((mask & 16) >> 4) * 0xffff; + constexpr int16_t g5_2 = ((mask & 32) >> 5) * 0xffff; + constexpr int16_t g6_2 = ((mask & 64) >> 6) * 0xffff; + constexpr int16_t g7_2 = ((mask & 128) >> 7) * 0xffff; + + const vint16 mask_2nd = + vint16{g0_2, g1_2, g2_2, g3_2, g4_2, g5_2, g6_2, g7_2}; + // generated masks + return {b._vec0, (vint16)vec_sel(a._vec1, b._vec1, 
(vbool16)mask_2nd)}; + } + + template + static std::enable_if_t< + (mask > 255 && ((mask & 65535) != 65535) && ((mask & 255) == 0)), + Vec256> + C10_ALWAYS_INLINE blend(const Vec256& a, const Vec256& b) { + constexpr int16_t mask2 = (mask & 65535) >> 16; + constexpr int16_t g0_2 = (mask & 1) * 0xffff; + constexpr int16_t g1_2 = ((mask & 2) >> 1) * 0xffff; + constexpr int16_t g2_2 = ((mask & 4) >> 2) * 0xffff; + constexpr int16_t g3_2 = ((mask & 8) >> 3) * 0xffff; + constexpr int16_t g4_2 = ((mask & 16) >> 4) * 0xffff; + constexpr int16_t g5_2 = ((mask & 32) >> 5) * 0xffff; + constexpr int16_t g6_2 = ((mask & 64) >> 6) * 0xffff; + constexpr int16_t g7_2 = ((mask & 128) >> 7) * 0xffff; + + const vint16 mask_2nd = + vint16{g0_2, g1_2, g2_2, g3_2, g4_2, g5_2, g6_2, g7_2}; + // generated masks + return {a, (vint16)vec_sel(a._vec1, b._vec1, (vbool16)mask_2nd)}; + } + + template + static std::enable_if_t< + (mask > 255 && ((mask & 65535) != 65535) && ((mask & 255) != 0) && + ((mask & 255) != 255)), + Vec256> + C10_ALWAYS_INLINE blend(const Vec256& a, const Vec256& b) { + constexpr int16_t g0 = (mask & 1) * 0xffff; + constexpr int16_t g1 = ((mask & 2) >> 1) * 0xffff; + constexpr int16_t g2 = ((mask & 4) >> 2) * 0xffff; + constexpr int16_t g3 = ((mask & 8) >> 3) * 0xffff; + constexpr int16_t g4 = ((mask & 16) >> 4) * 0xffff; + constexpr int16_t g5 = ((mask & 32) >> 5) * 0xffff; + constexpr int16_t g6 = ((mask & 64) >> 6) * 0xffff; + constexpr int16_t g7 = ((mask & 128) >> 7) * 0xffff; + constexpr int16_t mask2 = (mask & 65535) >> 16; + constexpr int16_t g0_2 = (mask & 1) * 0xffff; + constexpr int16_t g1_2 = ((mask & 2) >> 1) * 0xffff; + constexpr int16_t g2_2 = ((mask & 4) >> 2) * 0xffff; + constexpr int16_t g3_2 = ((mask & 8) >> 3) * 0xffff; + constexpr int16_t g4_2 = ((mask & 16) >> 4) * 0xffff; + constexpr int16_t g5_2 = ((mask & 32) >> 5) * 0xffff; + constexpr int16_t g6_2 = ((mask & 64) >> 6) * 0xffff; + constexpr int16_t g7_2 = ((mask & 128) >> 7) * 0xffff; + + const vint16 mask_1st = vint16{g0, g1, g2, g3, g4, g5, g6, g7}; + const vint16 mask_2nd = + vint16{g0_2, g1_2, g2_2, g3_2, g4_2, g5_2, g6_2, g7_2}; + // generated masks + return { + (vint16)vec_sel(a._vec0, b._vec0, (vbool16)mask_1st), + (vint16)vec_sel(a._vec1, b._vec1, (vbool16)mask_2nd)}; + } + + static Vec256 C10_ALWAYS_INLINE blendv( + const Vec256& a, + const Vec256& b, + const Vec256& mask) { + // the mask used here returned by comparision of vec256 + // assuming this we can use the same mask directly with vec_sel + // warning intel style mask will not work properly + return { + vec_sel(a._vec0, b._vec0, mask._vecb0), + vec_sel(a._vec1, b._vec1, mask._vecb1)}; + } + + static Vec256 arange(int16_t base = 0, int16_t step = 1) { + return Vec256( + base, + base + step, + base + 2 * step, + base + 3 * step, + base + 4 * step, + base + 5 * step, + base + 6 * step, + base + 7 * step, + base + 8 * step, + base + 9 * step, + base + 10 * step, + base + 11 * step, + base + 12 * step, + base + 13 * step, + base + 14 * step, + base + 15 * step); + } + static Vec256 set( + const Vec256& a, + const Vec256& b, + size_t count = size()) { + switch (count) { + case 0: + return a; + case 1: + return blend<1>(a, b); + case 2: + return blend<3>(a, b); + case 3: + return blend<7>(a, b); + case 4: + return blend<15>(a, b); + case 5: + return blend<31>(a, b); + case 6: + return blend<63>(a, b); + case 7: + return blend<127>(a, b); + case 8: + return blend<255>(a, b); + case 9: + return blend<511>(a, b); + case 10: + return blend<1023>(a, b); + case 11: 
+ return blend<2047>(a, b); + case 12: + return blend<4095>(a, b); + case 13: + return blend<8191>(a, b); + case 14: + return blend<16383>(a, b); + case 15: + return blend<32767>(a, b); + } + return b; + } + static Vec256 C10_ALWAYS_INLINE + loadu(const void* ptr, int count = size()) { + if (count == size()) { + return { + vec_vsx_ld(offset0, reinterpret_cast(ptr)), + vec_vsx_ld(offset16, reinterpret_cast(ptr))}; + } + + __at_align32__ value_type tmp_values[size()]; + std::memcpy(tmp_values, ptr, std::min(count, size()) * sizeof(value_type)); + + return {vec_vsx_ld(offset0, tmp_values), vec_vsx_ld(offset16, tmp_values)}; + } + void C10_ALWAYS_INLINE store(void* ptr, int count = size()) const { + if (count == size()) { + vec_vsx_st(_vec0, offset0, reinterpret_cast(ptr)); + vec_vsx_st(_vec1, offset16, reinterpret_cast(ptr)); + } else if (count > 0) { + __at_align32__ value_type tmp_values[size()]; + vec_vsx_st(_vec0, offset0, tmp_values); + vec_vsx_st(_vec1, offset16, tmp_values); + std::memcpy(ptr, tmp_values, std::min(count, size()) * sizeof(value_type)); + } + } + const int16_t& operator[](int idx) const = delete; + int16_t& operator[](int idx) = delete; + + Vec256 angle() const { + return Vec256{0}; + } + Vec256 real() const { + return *this; + } + Vec256 imag() const { + return Vec256{0}; + } + Vec256 conj() const { + return *this; + } + + Vec256 C10_ALWAYS_INLINE abs() const { + return {vec_abs(_vec0), vec_abs(_vec1)}; + } + + Vec256 C10_ALWAYS_INLINE neg() const { + return {vec_neg(_vec0), vec_neg(_vec1)}; + } + + DEFINE_MEMBER_UNARY_OP(operator~, int16_t, vec_not) + DEFINE_MEMBER_OP(operator==, int16_t, vec_cmpeq) + DEFINE_MEMBER_OP(operator!=, int16_t, vec_cmpne) + DEFINE_MEMBER_OP(operator<, int16_t, vec_cmplt) + DEFINE_MEMBER_OP(operator<=, int16_t, vec_cmple) + DEFINE_MEMBER_OP(operator>, int16_t, vec_cmpgt) + DEFINE_MEMBER_OP(operator>=, int16_t, vec_cmpge) + DEFINE_MEMBER_OP_AND_ONE(eq, int16_t, vec_cmpeq) + DEFINE_MEMBER_OP_AND_ONE(ne, int16_t, vec_cmpne) + DEFINE_MEMBER_OP_AND_ONE(lt, int16_t, vec_cmplt) + DEFINE_MEMBER_OP_AND_ONE(le, int16_t, vec_cmple) + DEFINE_MEMBER_OP_AND_ONE(gt, int16_t, vec_cmpgt) + DEFINE_MEMBER_OP_AND_ONE(ge, int16_t, vec_cmpge) + DEFINE_MEMBER_OP(operator+, int16_t, vec_add) + DEFINE_MEMBER_OP(operator-, int16_t, vec_sub) + DEFINE_MEMBER_OP(operator*, int16_t, vec_mul) + DEFINE_MEMBER_EMULATE_BINARY_OP(operator/, int16_t, /) + DEFINE_MEMBER_OP(maximum, int16_t, vec_max) + DEFINE_MEMBER_OP(minimum, int16_t, vec_min) + DEFINE_MEMBER_OP(operator&, int16_t, vec_and) + DEFINE_MEMBER_OP(operator|, int16_t, vec_or) + DEFINE_MEMBER_OP(operator^, int16_t, vec_xor) +}; + +template <> +Vec256 inline maximum( + const Vec256& a, + const Vec256& b) { + return a.maximum(b); +} + +template <> +Vec256 inline minimum( + const Vec256& a, + const Vec256& b) { + return a.minimum(b); +} + + +} // namespace +} // namespace vec256 +} // namespace at diff --git a/aten/src/ATen/cpu/vec256/vsx/vec256_int32_vsx.h b/aten/src/ATen/cpu/vec256/vsx/vec256_int32_vsx.h new file mode 100644 index 000000000000..2ee2318f0349 --- /dev/null +++ b/aten/src/ATen/cpu/vec256/vsx/vec256_int32_vsx.h @@ -0,0 +1,281 @@ +#pragma once + +#include +#include +#include +namespace at { +namespace vec256 { +// See Note [Acceptable use of anonymous namespace in header] +namespace { + +template <> +class Vec256 { + private: + union { + struct { + vint32 _vec0; + vint32 _vec1; + }; + struct { + vbool32 _vecb0; + vbool32 _vecb1; + }; + + } __attribute__((__may_alias__)); + + public: + using value_type = 
int32_t; + using vec_internal_type = vint32; + using vec_internal_mask_type = vbool32; + static constexpr int size() { + return 8; + } + Vec256() {} + C10_ALWAYS_INLINE Vec256(vint32 v) : _vec0{v}, _vec1{v} {} + C10_ALWAYS_INLINE Vec256(vbool32 vmask) : _vecb0{vmask}, _vecb1{vmask} {} + C10_ALWAYS_INLINE Vec256(vint32 v1, vint32 v2) : _vec0{v1}, _vec1{v2} {} + C10_ALWAYS_INLINE Vec256(vbool32 v1, vbool32 v2) : _vecb0{v1}, _vecb1{v2} {} + C10_ALWAYS_INLINE Vec256(int32_t scalar) + : _vec0{vec_splats(scalar)}, _vec1{vec_splats(scalar)} {} + C10_ALWAYS_INLINE Vec256( + int32_t scalar1, + int32_t scalar2, + int32_t scalar3, + int32_t scalar4, + int32_t scalar5, + int32_t scalar6, + int32_t scalar7, + int32_t scalar8) + : _vec0{vint32{scalar1, scalar2, scalar3, scalar4}}, + _vec1{vint32{scalar5, scalar6, scalar7, scalar8}} {} + C10_ALWAYS_INLINE const vec_internal_type& vec0() const { + return _vec0; + } + C10_ALWAYS_INLINE const vec_internal_type& vec1() const { + return _vec1; + } + + template + static std::enable_if_t> C10_ALWAYS_INLINE + blend(const Vec256& a, const Vec256& b) { + return a; + } + + template + static std::enable_if_t<(mask & 255) == 255, Vec256> C10_ALWAYS_INLINE + blend(const Vec256& a, const Vec256& b) { + return b; + } + + template + static std::enable_if_t> C10_ALWAYS_INLINE + blend(const Vec256& a, const Vec256& b) { + return {b._vec0, a._vec1}; + } + + template + static std::enable_if_t<(mask > 0 && mask < 15), Vec256> + C10_ALWAYS_INLINE blend(const Vec256& a, const Vec256& b) { + constexpr uint32_t g0 = (mask & 1) * 0xffffffff; + constexpr uint32_t g1 = ((mask & 2) >> 1) * 0xffffffff; + constexpr uint32_t g2 = ((mask & 4) >> 2) * 0xffffffff; + constexpr uint32_t g3 = ((mask & 8) >> 3) * 0xffffffff; + const vbool32 mask_1st = (vbool32){g0, g1, g2, g3}; + + return {(vint32)vec_sel(a._vec0, b._vec0, (vbool32)mask_1st), a._vec1}; + } + + template + static std::enable_if_t< + (mask > 15 && (mask & 255) != 255 && ((mask & 15) == 15)), + Vec256> + C10_ALWAYS_INLINE blend(const Vec256& a, const Vec256& b) { + constexpr uint32_t mask2 = (mask & 255) >> 4; + constexpr uint32_t g0_2 = (mask2 & 1) * 0xffffffff; + constexpr uint32_t g1_2 = ((mask2 & 2) >> 1) * 0xffffffff; + constexpr uint32_t g2_2 = ((mask2 & 4) >> 2) * 0xffffffff; + constexpr uint32_t g3_2 = ((mask2 & 8) >> 3) * 0xffffffff; + + const vbool32 mask_2nd = (vbool32){g0_2, g1_2, g2_2, g3_2}; + // generated masks + return {b._vec0, (vint32)vec_sel(a._vec1, b._vec1, (vbool32)mask_2nd)}; + } + + template + static std::enable_if_t< + (mask > 15 && ((mask & 255) != 255) && ((mask & 15) == 0)), + Vec256> + C10_ALWAYS_INLINE blend(const Vec256& a, const Vec256& b) { + constexpr uint32_t mask2 = (mask & 255) >> 4; + constexpr uint32_t g0_2 = (mask2 & 1) * 0xffffffff; + constexpr uint32_t g1_2 = ((mask2 & 2) >> 1) * 0xffffffff; + constexpr uint32_t g2_2 = ((mask2 & 4) >> 2) * 0xffffffff; + constexpr uint32_t g3_2 = ((mask2 & 8) >> 3) * 0xffffffff; + + const vbool32 mask_2nd = (vbool32){g0_2, g1_2, g2_2, g3_2}; + // generated masks + return {a, (vint32)vec_sel(a._vec1, b._vec1, (vbool32)mask_2nd)}; + } + + template + static std::enable_if_t< + (mask > 15 && ((mask & 255) != 255) && ((mask & 15) != 0) && + ((mask & 15) != 15)), + Vec256> + C10_ALWAYS_INLINE blend(const Vec256& a, const Vec256& b) { + constexpr uint32_t g0 = (mask & 1) * 0xffffffff; + constexpr uint32_t g1 = ((mask & 2) >> 1) * 0xffffffff; + constexpr uint32_t g2 = ((mask & 4) >> 2) * 0xffffffff; + constexpr uint32_t g3 = ((mask & 8) >> 3) * 0xffffffff; + 
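+    // Each of the four low mask bits is widened into an all-ones 32-bit lane
+    // selector for vec_sel on the first 128-bit half; the next four bits are
+    // shifted down below to build the selector for the second half.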
constexpr uint32_t mask2 = (mask & 255) >> 4; + constexpr uint32_t g0_2 = (mask2 & 1) * 0xffffffff; + constexpr uint32_t g1_2 = ((mask2 & 2) >> 1) * 0xffffffff; + constexpr uint32_t g2_2 = ((mask2 & 4) >> 2) * 0xffffffff; + constexpr uint32_t g3_2 = ((mask2 & 8) >> 3) * 0xffffffff; + + const vbool32 mask_1st = (vbool32){g0, g1, g2, g3}; + const vbool32 mask_2nd = (vbool32){g0_2, g1_2, g2_2, g3_2}; + // generated masks + return { + (vint32)vec_sel(a._vec0, b._vec0, (vbool32)mask_1st), + (vint32)vec_sel(a._vec1, b._vec1, (vbool32)mask_2nd)}; + } + + static Vec256 C10_ALWAYS_INLINE blendv( + const Vec256& a, + const Vec256& b, + const Vec256& mask) { + // the mask used here returned by comparision of vec256 + // assuming this we can use the same mask directly with vec_sel + // warning intel style mask will not work properly + return { + vec_sel(a._vec0, b._vec0, mask._vecb0), + vec_sel(a._vec1, b._vec1, mask._vecb1)}; + } + + static Vec256 arange(int32_t base = 0.f, int32_t step = 1.f) { + return Vec256( + base, + base + step, + base + 2 * step, + base + 3 * step, + base + 4 * step, + base + 5 * step, + base + 6 * step, + base + 7 * step); + } + static Vec256 set( + const Vec256& a, + const Vec256& b, + size_t count = size()) { + switch (count) { + case 0: + return a; + case 1: + return blend<1>(a, b); + case 2: + return blend<3>(a, b); + case 3: + return blend<7>(a, b); + case 4: + return blend<15>(a, b); + case 5: + return blend<31>(a, b); + case 6: + return blend<63>(a, b); + case 7: + return blend<127>(a, b); + } + + return b; + } + static Vec256 C10_ALWAYS_INLINE + loadu(const void* ptr, int count = size()) { + if (count == size()) { + return { + vec_vsx_ld(offset0, reinterpret_cast(ptr)), + vec_vsx_ld(offset16, reinterpret_cast(ptr))}; + } + + __at_align32__ value_type tmp_values[size()]; + std::memcpy(tmp_values, ptr, std::min(count, size()) * sizeof(value_type)); + + return {vec_vsx_ld(offset0, tmp_values), vec_vsx_ld(offset16, tmp_values)}; + } + void C10_ALWAYS_INLINE store(void* ptr, int count = size()) const { + if (count == size()) { + vec_vsx_st(_vec0, offset0, reinterpret_cast(ptr)); + vec_vsx_st(_vec1, offset16, reinterpret_cast(ptr)); + } else if (count > 0) { + __at_align32__ value_type tmp_values[size()]; + vec_vsx_st(_vec0, offset0, tmp_values); + vec_vsx_st(_vec1, offset16, tmp_values); + std::memcpy( + ptr, tmp_values, std::min(count, size()) * sizeof(value_type)); + } + } + const int32_t& operator[](int idx) const = delete; + int32_t& operator[](int idx) = delete; + + Vec256 angle() const { + return Vec256{0}; + } + Vec256 real() const { + return *this; + } + Vec256 imag() const { + return Vec256{0}; + } + Vec256 conj() const { + return *this; + } + + Vec256 C10_ALWAYS_INLINE abs() const { + return {vec_abs(_vec0), vec_abs(_vec1)}; + } + + Vec256 C10_ALWAYS_INLINE neg() const { + return {vec_neg(_vec0), vec_neg(_vec1)}; + } + + DEFINE_MEMBER_UNARY_OP(operator~, int32_t, vec_not) + DEFINE_MEMBER_OP(operator==, int32_t, vec_cmpeq) + DEFINE_MEMBER_OP(operator!=, int32_t, vec_cmpne) + DEFINE_MEMBER_OP(operator<, int32_t, vec_cmplt) + DEFINE_MEMBER_OP(operator<=, int32_t, vec_cmple) + DEFINE_MEMBER_OP(operator>, int32_t, vec_cmpgt) + DEFINE_MEMBER_OP(operator>=, int32_t, vec_cmpge) + DEFINE_MEMBER_OP_AND_ONE(eq, int32_t, vec_cmpeq) + DEFINE_MEMBER_OP_AND_ONE(ne, int32_t, vec_cmpne) + DEFINE_MEMBER_OP_AND_ONE(lt, int32_t, vec_cmplt) + DEFINE_MEMBER_OP_AND_ONE(le, int32_t, vec_cmple) + DEFINE_MEMBER_OP_AND_ONE(gt, int32_t, vec_cmpgt) + DEFINE_MEMBER_OP_AND_ONE(ge, int32_t, 
vec_cmpge) + DEFINE_MEMBER_OP(operator+, int32_t, vec_add) + DEFINE_MEMBER_OP(operator-, int32_t, vec_sub) + DEFINE_MEMBER_OP(operator*, int32_t, vec_mul) + DEFINE_MEMBER_EMULATE_BINARY_OP(operator/, int32_t, /) + DEFINE_MEMBER_OP(maximum, int32_t, vec_max) + DEFINE_MEMBER_OP(minimum, int32_t, vec_min) + DEFINE_MEMBER_OP(operator&, int32_t, vec_and) + DEFINE_MEMBER_OP(operator|, int32_t, vec_or) + DEFINE_MEMBER_OP(operator^, int32_t, vec_xor) +}; + +template <> +Vec256 inline maximum( + const Vec256& a, + const Vec256& b) { + return a.maximum(b); +} + +template <> +Vec256 inline minimum( + const Vec256& a, + const Vec256& b) { + return a.minimum(b); +} + +} // namespace +} // namespace vec256 +} // namespace at diff --git a/aten/src/ATen/cpu/vec256/vsx/vec256_int64_vsx.h b/aten/src/ATen/cpu/vec256/vsx/vec256_int64_vsx.h new file mode 100644 index 000000000000..d752f71c9a63 --- /dev/null +++ b/aten/src/ATen/cpu/vec256/vsx/vec256_int64_vsx.h @@ -0,0 +1,233 @@ +#pragma once + +#include +#include +#include +namespace at { +namespace vec256 { +// See Note [Acceptable use of anonymous namespace in header] +namespace { + +template <> +class Vec256 { + private: + union { + struct { + vint64 _vec0; + vint64 _vec1; + }; + struct { + vbool64 _vecb0; + vbool64 _vecb1; + }; + + } __attribute__((__may_alias__)); + + public: + using value_type = int64_t; + using vec_internal_type = vint64; + using vec_internal_mask_type = vbool64; + static constexpr int size() { + return 4; + } + Vec256() {} + C10_ALWAYS_INLINE Vec256(vint64 v) : _vec0{v}, _vec1{v} {} + C10_ALWAYS_INLINE Vec256(vbool64 vmask) : _vecb0{vmask}, _vecb1{vmask} {} + C10_ALWAYS_INLINE Vec256(vint64 v1, vint64 v2) : _vec0{v1}, _vec1{v2} {} + C10_ALWAYS_INLINE Vec256(vbool64 v1, vbool64 v2) : _vecb0{v1}, _vecb1{v2} {} + C10_ALWAYS_INLINE Vec256(int64_t scalar) + : _vec0{vec_splats(scalar)}, _vec1{vec_splats(scalar)} {} + C10_ALWAYS_INLINE Vec256( + int64_t scalar1, + int64_t scalar2, + int64_t scalar3, + int64_t scalar4) + : _vec0{vint64{scalar1, scalar2}}, _vec1{vint64{scalar3, scalar4}} {} + + C10_ALWAYS_INLINE const vec_internal_type& vec0() const { + return _vec0; + } + C10_ALWAYS_INLINE const vec_internal_type& vec1() const { + return _vec1; + } + + template + static std::enable_if_t> C10_ALWAYS_INLINE + blend(const Vec256& a, const Vec256& b) { + return a; + } + + template + static std::enable_if_t> C10_ALWAYS_INLINE + blend(const Vec256& a, const Vec256& b) { + return {b._vec0, a._vec1}; + } + + template + static std::enable_if_t<(mask & 15) == 15, Vec256> C10_ALWAYS_INLINE + blend(const Vec256& a, const Vec256& b) { + return b; + } + + template + static std::enable_if_t<(mask > 0 && mask < 3), Vec256> C10_ALWAYS_INLINE + blend(const Vec256& a, const Vec256& b) { + constexpr uint64_t g0 = (mask & 1) * 0xffffffffffffffff; + constexpr uint64_t g1 = ((mask & 2) >> 1) * 0xffffffffffffffff; + const vbool64 mask_1st = (vbool64){g0, g1}; + return {(vint64)vec_sel(a._vec0, b._vec0, (vbool64)mask_1st), a._vec1}; + } + + template + static std::enable_if_t<(mask > 3) && (mask & 3) == 0, Vec256> + C10_ALWAYS_INLINE blend(const Vec256& a, const Vec256& b) { + constexpr uint64_t g0_2 = ((mask & 4) >> 2) * 0xffffffffffffffff; + constexpr uint64_t g1_2 = ((mask & 8) >> 3) * 0xffffffffffffffff; + + const vbool64 mask_2nd = (vbool64){g0_2, g1_2}; + return {a._vec0, (vint64)vec_sel(a._vec1, b._vec1, (vbool64)mask_2nd)}; + } + + template + static std::enable_if_t< + (mask > 3) && (mask & 3) != 0 && (mask & 15) != 15, + Vec256> + C10_ALWAYS_INLINE blend(const 
Vec256& a, const Vec256& b) { + constexpr uint64_t g0 = (mask & 1) * 0xffffffffffffffff; + constexpr uint64_t g1 = ((mask & 2) >> 1) * 0xffffffffffffffff; + constexpr uint64_t g0_2 = ((mask & 4) >> 2) * 0xffffffffffffffff; + constexpr uint64_t g1_2 = ((mask & 8) >> 3) * 0xffffffffffffffff; + + const vbool64 mask_1st = (vbool64){g0, g1}; + const vbool64 mask_2nd = (vbool64){g0_2, g1_2}; + return { + (vint64)vec_sel(a._vec0, b._vec0, (vbool64)mask_1st), + (vint64)vec_sel(a._vec1, b._vec1, (vbool64)mask_2nd)}; + } + + static Vec256 C10_ALWAYS_INLINE blendv( + const Vec256& a, + const Vec256& b, + const Vec256& mask) { + // the mask used here returned by comparision of vec256 + + return { + vec_sel(a._vec0, b._vec0, mask._vecb0), + vec_sel(a._vec1, b._vec1, mask._vecb1)}; + } + static Vec256 arange(int64_t base = 0., int64_t step = 1.) { + return Vec256(base, base + step, base + 2 * step, base + 3 * step); + } + + static Vec256 C10_ALWAYS_INLINE + set(const Vec256& a, + const Vec256& b, + size_t count = size()) { + switch (count) { + case 0: + return a; + case 1: + return blend<1>(a, b); + case 2: + return blend<3>(a, b); + case 3: + return blend<7>(a, b); + } + + return b; + } + static Vec256 C10_ALWAYS_INLINE + loadu(const void* ptr, int count = size()) { + if (count == size()) { + static_assert(sizeof(double) == sizeof(value_type)); + const double* dptr = reinterpret_cast(ptr); + return {// treat it as double load + (vint64)vec_vsx_ld(offset0, dptr), + (vint64)vec_vsx_ld(offset16, dptr)}; + } + + __at_align32__ double tmp_values[size()]; + std::memcpy(tmp_values, ptr, std::min(count, size()) * sizeof(value_type)); + + return { + (vint64)vec_vsx_ld(offset0, tmp_values), + (vint64)vec_vsx_ld(offset16, tmp_values)}; + } + void C10_ALWAYS_INLINE store(void* ptr, int count = size()) const { + if (count == size()) { + double* dptr = reinterpret_cast(ptr); + vec_vsx_st((vfloat64)_vec0, offset0, dptr); + vec_vsx_st((vfloat64)_vec1, offset16, dptr); + } else if (count > 0) { + __at_align32__ double tmp_values[size()]; + vec_vsx_st((vfloat64)_vec0, offset0, tmp_values); + vec_vsx_st((vfloat64)_vec1, offset16, tmp_values); + std::memcpy( + ptr, tmp_values, std::min(count, size()) * sizeof(value_type)); + } + } + const int64_t& operator[](int idx) const = delete; + int64_t& operator[](int idx) = delete; + + Vec256 angle() const { + return Vec256{0}; + } + Vec256 real() const { + return *this; + } + Vec256 imag() const { + return Vec256{0}; + } + Vec256 conj() const { + return *this; + } + + Vec256 C10_ALWAYS_INLINE abs() const { + return {vec_abs(_vec0), vec_abs(_vec1)}; + } + + Vec256 C10_ALWAYS_INLINE neg() const { + return {vec_neg(_vec0), vec_neg(_vec1)}; + } + + DEFINE_MEMBER_UNARY_OP(operator~, int64_t, vec_not) + DEFINE_MEMBER_OP(operator==, int64_t, vec_cmpeq) + DEFINE_MEMBER_OP(operator!=, int64_t, vec_cmpne) + DEFINE_MEMBER_OP(operator<, int64_t, vec_cmplt) + DEFINE_MEMBER_OP(operator<=, int64_t, vec_cmple) + DEFINE_MEMBER_OP(operator>, int64_t, vec_cmpgt) + DEFINE_MEMBER_OP(operator>=, int64_t, vec_cmpge) + DEFINE_MEMBER_OP_AND_ONE(eq, int64_t, vec_cmpeq) + DEFINE_MEMBER_OP_AND_ONE(ne, int64_t, vec_cmpne) + DEFINE_MEMBER_OP_AND_ONE(lt, int64_t, vec_cmplt) + DEFINE_MEMBER_OP_AND_ONE(le, int64_t, vec_cmple) + DEFINE_MEMBER_OP_AND_ONE(gt, int64_t, vec_cmpgt) + DEFINE_MEMBER_OP_AND_ONE(ge, int64_t, vec_cmpge) + DEFINE_MEMBER_OP(operator+, int64_t, vec_add) + DEFINE_MEMBER_OP(operator-, int64_t, vec_sub) + DEFINE_MEMBER_OP(operator*, int64_t, vec_mul) + DEFINE_MEMBER_OP(operator/, int64_t, 
vec_div) + DEFINE_MEMBER_OP(maximum, int64_t, vec_max) + DEFINE_MEMBER_OP(minimum, int64_t, vec_min) + DEFINE_MEMBER_OP(operator&, int64_t, vec_and) + DEFINE_MEMBER_OP(operator|, int64_t, vec_or) + DEFINE_MEMBER_OP(operator^, int64_t, vec_xor) +}; + +template <> +Vec256 inline maximum( + const Vec256& a, + const Vec256& b) { + return a.maximum(b); +} + +template <> +Vec256 inline minimum( + const Vec256& a, + const Vec256& b) { + return a.minimum(b); +} + +} // namespace +} // namespace vec256 +} // namespace at diff --git a/aten/src/ATen/cpu/vec256/vsx/vec256_qint32_vsx.h b/aten/src/ATen/cpu/vec256/vsx/vec256_qint32_vsx.h new file mode 100644 index 000000000000..a47e295ce03b --- /dev/null +++ b/aten/src/ATen/cpu/vec256/vsx/vec256_qint32_vsx.h @@ -0,0 +1,242 @@ +#pragma once + +#include +#include +#include +#include +#include + +// This file defines Vec256<> for the quantized types. +// +// +// Currently, we simply use these classes as efficient converters between +// the quantized types and Vec256, usually in bandwidth-bound cases +// where doing the arithmetic in full-precision is acceptable (e.g. +// elementwise operators). +// +// +// Conversions are as follows: +// Vec256 -> 1x Vec256 +// +// The size of the returned float vector is specified by the special +// constexpr function float_num_vecs. The type of the value returned +// from dequantize (and expected as an argument to quantize) is +// specified by float_vec_return_type. +// +// When writing kernels with these vectors, it is expected that floating- +// point operations will be carried out in a loop over Vec256::float_num_vecs +// iterations. + +namespace at { +namespace vec256 { +namespace { + +template <> +struct Vec256 { + private: + union { + struct { + vint32 _vec0; + vint32 _vec1; + }; + struct { + vbool32 _vecb0; + vbool32 _vecb1; + }; + + } __attribute__((__may_alias__)); + + public: + Vec256() {} + + static constexpr int size() { + return 8; + } + + static constexpr size_t float_num_vecs() { + return 1; + } + static constexpr int int_num_vecs() { + return 1; + } + using float_vec_return_type = std::array, 1>; + using int_vec_return_type = std::array, 1>; + using value_type = c10::qint32::underlying; + using vec_internal_type = vint32; + using vec_internal_mask_type = vbool32; + C10_ALWAYS_INLINE Vec256(vint32 v) : _vec0{v}, _vec1{v} {} + C10_ALWAYS_INLINE Vec256(vbool32 vmask) : _vecb0{vmask}, _vecb1{vmask} {} + C10_ALWAYS_INLINE Vec256(vint32 v1, vint32 v2) : _vec0{v1}, _vec1{v2} {} + C10_ALWAYS_INLINE Vec256(vbool32 v1, vbool32 v2) : _vecb0{v1}, _vecb1{v2} {} + + Vec256(const c10::qint32& val) + : _vec0(vec_splats(val.val_)), _vec1(vec_splats(val.val_)) {} + + static Vec256 C10_ALWAYS_INLINE + loadu(const void* ptr, int count = size()) { + if (count == size()) { + return { + vec_vsx_ld(offset0, reinterpret_cast(ptr)), + vec_vsx_ld(offset16, reinterpret_cast(ptr))}; + } + + __at_align32__ value_type tmp_values[size()]; + std::memcpy(tmp_values, ptr, std::min(count, size()) * sizeof(value_type)); + + return {vec_vsx_ld(offset0, tmp_values), vec_vsx_ld(offset16, tmp_values)}; + } + void C10_ALWAYS_INLINE store(void* ptr, int count = size()) const { + if (count == size()) { + vec_vsx_st(_vec0, offset0, reinterpret_cast(ptr)); + vec_vsx_st(_vec1, offset16, reinterpret_cast(ptr)); + } else if (count > 0) { + __at_align32__ value_type tmp_values[size()]; + vec_vsx_st(_vec0, offset0, tmp_values); + vec_vsx_st(_vec1, offset16, tmp_values); + std::memcpy( + ptr, tmp_values, std::min(count, size()) * sizeof(value_type)); + } + 
} + + C10_ALWAYS_INLINE const vec_internal_type& vec0() const { + return _vec0; + } + C10_ALWAYS_INLINE const vec_internal_type& vec1() const { + return _vec1; + } + + float_vec_return_type dequantize( + Vec256 scale, + Vec256 zero_point, + Vec256 scale_zp_premul) const { + vfloat32 float_vals0 = vec_float(_vec0); + vfloat32 float_vals1 = vec_float(_vec1); + vfloat32 scale_vec0 = scale.vec0(); + vfloat32 scale_vec1 = scale.vec1(); + vfloat32 scale_zp_premul0 = scale_zp_premul.vec0(); + vfloat32 scale_zp_premul1 = scale_zp_premul.vec1(); + return {Vec256{ + vec_madd(scale_vec0, float_vals0, scale_zp_premul0), + vec_madd(scale_vec1, float_vals1, scale_zp_premul1)}}; + } + + static Vec256 quantize( + const float_vec_return_type& rhs, + float scale, + int32_t zero_point, + float inverse_scale) { + Vec256 retval; + + const vint32 vmin = vec_splats(std::numeric_limits::min()); + const vint32 vmax = vec_splats(std::numeric_limits::max()); + vfloat32 inverse_scale_v = vec_splats(inverse_scale); + vfloat32 vec_zero_point = vec_splats((float)(zero_point)); + Vec256 vf0 = rhs[0]; + + vfloat32 vecf0 = vf0.vec0(); + vfloat32 vecf1 = vf0.vec1(); + vecf0 = vec_mul(vecf0, inverse_scale_v); + vecf1 = vec_mul(vecf1, inverse_scale_v); + vecf0 = vec_add(vec_rint(vecf0), vec_zero_point); + vecf1 = vec_add(vec_rint(vecf1), vec_zero_point); + vint32 veci0 = vec_signed(vecf0); + vint32 veci1 = vec_signed(vecf1); + + veci0 = vec_max(veci0, vmin); + veci1 = vec_max(veci1, vmin); + veci0 = vec_min(veci0, vmax); + veci1 = vec_min(veci1, vmax); + + return {veci0, veci1}; + } + + Vec256 relu(Vec256 zero_point) const { + return {vec_max(_vec0, zero_point._vec0), vec_max(_vec1, zero_point._vec1)}; + } + + Vec256 relu6( + Vec256 zero_point, + Vec256 q_six) const { + vint32 max0 = vec_max(_vec0, zero_point._vec0); + vint32 max1 = vec_max(_vec1, zero_point._vec1); + return {vec_min(max0, q_six._vec0), vec_min(max1, q_six._vec1)}; + } + + int_vec_return_type widening_subtract(Vec256 b) const { + return {*this - b}; + } + + static Vec256 requantize_from_int( + const int_vec_return_type& inp, + float multiplier, + int32_t zero_point) { + const vint32 vmin = vec_splats(std::numeric_limits::min()); + const vint32 vmax = vec_splats(std::numeric_limits::max()); + vfloat32 vec_mult = vec_splats(multiplier); + vint32 vec_zero_point = vec_splats(zero_point); + Vec256 vi = inp[0]; + vfloat32 vecf0 = vec_float(vi.vec0()); + vfloat32 vecf1 = vec_float(vi.vec1()); + + vecf0 = vec_mul(vecf0, vec_mult); + vecf1 = vec_mul(vecf1, vec_mult); + + vecf0 = vec_rint(vecf0); + vecf1 = vec_rint(vecf1); + + vint32 veci0 = vec_add(vec_signed(vecf0),vec_zero_point); + vint32 veci1 = vec_add(vec_signed(vecf1),vec_zero_point); + + veci0 = vec_max(veci0, vmin); + veci1 = vec_max(veci1, vmin); + veci0 = vec_min(veci0, vmax); + veci1 = vec_min(veci1, vmax); + + return {veci0, veci1}; + } + + void dump() const { + std::cout << _vec0[0] << " "; + std::cout << _vec0[1] << " "; + std::cout << _vec0[2] << " "; + std::cout << _vec0[3] << " "; + std::cout << _vec1[0] << " "; + std::cout << _vec1[1] << " "; + std::cout << _vec1[2] << " "; + std::cout << _vec1[3] << " "; + std::cout << std::endl; + } + + DEFINE_MEMBER_OP(operator==, c10::qint32, vec_cmpeq) + DEFINE_MEMBER_OP(operator!=, c10::qint32, vec_cmpne) + DEFINE_MEMBER_OP(operator<, c10::qint32, vec_cmplt) + DEFINE_MEMBER_OP(operator<=, c10::qint32, vec_cmple) + DEFINE_MEMBER_OP(operator>, c10::qint32, vec_cmpgt) + DEFINE_MEMBER_OP(operator>=, c10::qint32, vec_cmpge) + DEFINE_MEMBER_OP(operator+, 
c10::qint32, vec_add) + DEFINE_MEMBER_OP(operator-, c10::qint32, vec_sub) + DEFINE_MEMBER_OP(operator*, c10::qint32, vec_mul) + DEFINE_MEMBER_EMULATE_BINARY_OP(operator/, c10::qint32, /) + DEFINE_MEMBER_OP(maximum, c10::qint32, vec_max) + DEFINE_MEMBER_OP(minimum, c10::qint32, vec_min) + DEFINE_MEMBER_OP(operator&, c10::qint32, vec_and) + DEFINE_MEMBER_OP(operator|, c10::qint32, vec_or) + DEFINE_MEMBER_OP(operator^, c10::qint32, vec_xor) +}; + +template <> +Vec256 inline maximum( + const Vec256& a, + const Vec256& b) { + return a.maximum(b); +} + +template <> +Vec256 inline minimum( + const Vec256& a, + const Vec256& b) { + return a.minimum(b); +} +} // namespace +} // namespace vec256 +} // namespace at diff --git a/aten/src/ATen/cpu/vec256/vsx/vec256_qint8_vsx.h b/aten/src/ATen/cpu/vec256/vsx/vec256_qint8_vsx.h new file mode 100644 index 000000000000..f8b6eced60ef --- /dev/null +++ b/aten/src/ATen/cpu/vec256/vsx/vec256_qint8_vsx.h @@ -0,0 +1,404 @@ +#pragma once + +#include +#include +#include +#include +#include + +// This file defines Vec256<> for the quantized types. +// +// +// Currently, we simply use these classes as efficient converters between +// the quantized types and Vec256, usually in bandwidth-bound cases +// where doing the arithmetic in full-precision is acceptable (e.g. +// elementwise operators). +// +// +// Conversions are as follows: +// Vec256 -> 4x Vec256 +// +// The size of the returned float vector is specified by the special +// constexpr function float_num_vecs. The type of the value returned +// from dequantize (and expected as an argument to quantize) is +// specified by float_vec_return_type. +// +// When writing kernels with these vectors, it is expected that floating- +// point operations will be carried out in a loop over Vec256::float_num_vecs +// iterations. 
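+//
+// A minimal sketch of that loop pattern (illustrative only; the kernel name,
+// its scalar arguments and the chosen pointwise op are hypothetical, the
+// Vec256 member functions are the ones defined below):
+//
+//   void scale_qint8_kernel(const c10::qint8* in, c10::qint8* out, int64_t n,
+//                           float scale, int32_t zero_point, float gain) {
+//     using QVec = Vec256<c10::qint8>;
+//     using FVec = Vec256<float>;
+//     FVec scale_v(scale);
+//     FVec zp_v(static_cast<float>(zero_point));
+//     FVec premul_v(-scale * static_cast<float>(zero_point));
+//     for (int64_t i = 0; i + QVec::size() <= n; i += QVec::size()) {
+//       QVec q = QVec::loadu(in + i);
+//       auto fvals = q.dequantize(scale_v, zp_v, premul_v);
+//       for (size_t k = 0; k < QVec::float_num_vecs(); ++k) {
+//         fvals[k] = fvals[k] * FVec(gain);  // full-precision arithmetic
+//       }
+//       QVec::quantize(fvals, scale, zero_point, 1.0f / scale).store(out + i);
+//     }
+//     // a real kernel would also handle the n % size() tail elements through
+//     // the count overloads of loadu()/store().
+//   }
+//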
+ +namespace at { +namespace vec256 { +namespace { + +template <> +struct Vec256 { + private: + union { + struct { + vint8 _vec0; + vint8 _vec1; + }; + struct { + vbool8 _vecb0; + vbool8 _vecb1; + }; + + } __attribute__((__may_alias__)); + + public: + Vec256() {} + static constexpr int size() { + return 32; + } + + static constexpr size_t float_num_vecs() { + return 4; + } + static constexpr int int_num_vecs() { + return 4; + } + using float_vec_return_type = std::array, 4>; + using int_vec_return_type = std::array, 4>; + using value_type = typename c10::qint8::underlying; + using vec_internal_type = vint8; + using vec_internal_mask_type = vbool8; + // Broadcast constructor + C10_ALWAYS_INLINE Vec256(const c10::qint8& val) + : _vec0{vec_splats(val.val_)}, _vec1{vec_splats(val.val_)} {} + + C10_ALWAYS_INLINE Vec256(const Vec256& other) + : _vec0{other._vec0}, _vec1(other._vec1) {} + + C10_ALWAYS_INLINE Vec256(vint8 v) : _vec0{v}, _vec1{v} {} + C10_ALWAYS_INLINE Vec256(vbool8 vmask) : _vecb0{vmask}, _vecb1{vmask} {} + C10_ALWAYS_INLINE Vec256(vint8 v1, vint8 v2) : _vec0{v1}, _vec1{v2} {} + C10_ALWAYS_INLINE Vec256(vbool8 v1, vbool8 v2) : _vecb0{v1}, _vecb1{v2} {} + + C10_ALWAYS_INLINE const vec_internal_type& vec0() const { + return _vec0; + } + C10_ALWAYS_INLINE const vec_internal_type& vec1() const { + return _vec1; + } + + static C10_ALWAYS_INLINE Vec256 loadu( + const void* ptr, + int count = size()) { + if (count == size()) { + return { + vec_vsx_ld(offset0, reinterpret_cast(ptr)), + vec_vsx_ld(offset16, reinterpret_cast(ptr))}; + } + __at_align32__ value_type tmp_values[size()]; + std::memcpy(tmp_values, ptr, std::min(count, size()) * sizeof(value_type)); + return {vec_vsx_ld(offset0, tmp_values), vec_vsx_ld(offset16, tmp_values)}; + } + void C10_ALWAYS_INLINE store(void* ptr, int count = size()) const { + if (count == size()) { + vec_vsx_st(_vec0, offset0, reinterpret_cast(ptr)); + vec_vsx_st(_vec1, offset16, reinterpret_cast(ptr)); + } else if (count > 0) { + __at_align32__ value_type tmp_values[size()]; + vec_vsx_st(_vec0, offset0, tmp_values); + vec_vsx_st(_vec1, offset16, tmp_values); + std::memcpy( + ptr, tmp_values, std::min(count, size()) * sizeof(value_type)); + } + } + + public: + float_vec_return_type C10_ALWAYS_INLINE dequantize( + Vec256 scale, + Vec256 zero_point, + Vec256 scale_zp_premul) const { + vint16 vecshi0 = vec_unpackh(_vec0); + vint16 vecshi1 = vec_unpackl(_vec0); + + vint16 vecshi2 = vec_unpackh(_vec1); + vint16 vecshi3 = vec_unpackl(_vec1); + + vint32 veci0 = vec_unpackh(vecshi0); + vint32 veci1 = vec_unpackl(vecshi0); + + vint32 veci2 = vec_unpackh(vecshi1); + vint32 veci3 = vec_unpackl(vecshi1); + + vint32 veci4 = vec_unpackh(vecshi2); + vint32 veci5 = vec_unpackl(vecshi2); + + vint32 veci6 = vec_unpackh(vecshi3); + vint32 veci7 = vec_unpackl(vecshi3); + + vfloat32 vecf0_0 = vec_float(veci0); + vfloat32 vecf1_0 = vec_float(veci1); + + vfloat32 vecf0_1 = vec_float(veci2); + vfloat32 vecf1_1 = vec_float(veci3); + + vfloat32 vecf0_2 = vec_float(veci4); + vfloat32 vecf1_2 = vec_float(veci5); + + vfloat32 vecf0_3 = vec_float(veci6); + vfloat32 vecf1_3 = vec_float(veci7); + vfloat32 scale_vec0 = scale.vec0(); + vfloat32 scale_vec1 = scale.vec1(); + vfloat32 scale_zp_premul0 = scale_zp_premul.vec0(); + vfloat32 scale_zp_premul1 = scale_zp_premul.vec1(); + return { + Vec256{ + vec_madd(scale_vec0, vecf0_0, scale_zp_premul0), + vec_madd(scale_vec1, vecf1_0, scale_zp_premul1)}, + Vec256{ + vec_madd(scale_vec0, vecf0_1, scale_zp_premul0), + vec_madd(scale_vec1, 
vecf1_1, scale_zp_premul1)}, + Vec256{ + vec_madd(scale_vec0, vecf0_2, scale_zp_premul0), + vec_madd(scale_vec1, vecf1_2, scale_zp_premul1)}, + Vec256{ + vec_madd(scale_vec0, vecf0_3, scale_zp_premul0), + vec_madd(scale_vec1, vecf1_3, scale_zp_premul1)}}; + } + + static Vec256 quantize( + const float_vec_return_type& rhs, + float scale, + int32_t zero_point, + float inverse_scale) { + // constexpr int32_t min_val = std::numeric_limits::min(); + // constexpr int32_t max_val = std::numeric_limits::max(); + + vfloat32 inverse_scale_v = vec_splats(inverse_scale); + vfloat32 vec_zero_point = vec_splats((float)zero_point); + // vint32 vmin = vec_splats(min_val); + // vint32 vmax = vec_splats(max_val); + + Vec256 vf0 = rhs[0]; + Vec256 vf1 = rhs[1]; + Vec256 vf2 = rhs[2]; + Vec256 vf3 = rhs[3]; + vfloat32 vecf0 = vf0.vec0(); + vfloat32 vecf1 = vf0.vec1(); + vfloat32 vecf2 = vf1.vec0(); + vfloat32 vecf3 = vf1.vec1(); + + vfloat32 vecf4 = vf2.vec0(); + vfloat32 vecf5 = vf2.vec1(); + vfloat32 vecf6 = vf3.vec0(); + vfloat32 vecf7 = vf3.vec1(); + + vecf0 = vec_mul(vecf0, inverse_scale_v); + vecf1 = vec_mul(vecf1, inverse_scale_v); + vecf2 = vec_mul(vecf2, inverse_scale_v); + vecf3 = vec_mul(vecf3, inverse_scale_v); + + vecf4 = vec_mul(vecf4, inverse_scale_v); + vecf5 = vec_mul(vecf5, inverse_scale_v); + vecf6 = vec_mul(vecf6, inverse_scale_v); + vecf7 = vec_mul(vecf7, inverse_scale_v); + + vecf0 = vec_add(vec_rint(vecf0), vec_zero_point); + vecf1 = vec_add(vec_rint(vecf1), vec_zero_point); + vecf2 = vec_add(vec_rint(vecf2), vec_zero_point); + vecf3 = vec_add(vec_rint(vecf3), vec_zero_point); + + vecf4 = vec_add(vec_rint(vecf4), vec_zero_point); + vecf5 = vec_add(vec_rint(vecf5), vec_zero_point); + vecf6 = vec_add(vec_rint(vecf6), vec_zero_point); + vecf7 = vec_add(vec_rint(vecf7), vec_zero_point); + + vint32 veci0 = vec_signed(vecf0); + vint32 veci1 = vec_signed(vecf1); + vint32 veci2 = vec_signed(vecf2); + vint32 veci3 = vec_signed(vecf3); + + vint32 veci4 = vec_signed(vecf4); + vint32 veci5 = vec_signed(vecf5); + vint32 veci6 = vec_signed(vecf6); + vint32 veci7 = vec_signed(vecf7); + + // veci0 = vec_min(vmax, vec_max( vmin, vecf0)) ; + // veci1 = vec_min(vmax, vec_max( vmin, vecf1)) ; + // veci2 = vec_min(vmax, vec_max( vmin, vecf2)) ; + // veci3 = vec_min(vmax, vec_max( vmin, vecf3)) ; + + // veci4 = vec_min(vmax, vec_max( vmin, vecf4)) ; + // veci5 = vec_min(vmax, vec_max( vmin, vecf5)) ; + // veci6 = vec_min(vmax, vec_max( vmin, vecf6)) ; + // veci7 = vec_min(vmax, vec_max( vmin, vecf7)) ; + // vec_packs CLAMP already + vint16 vecshi0 = vec_packs(veci0, veci1); + vint16 vecshi1 = vec_packs(veci2, veci3); + vint16 vecshi2 = vec_packs(veci4, veci5); + vint16 vecshi3 = vec_packs(veci6, veci7); + + vint8 vec0 = vec_packs(vecshi0, vecshi1); + vint8 vec1 = vec_packs(vecshi2, vecshi3); + + return {vec0, vec1}; + } + + Vec256 C10_ALWAYS_INLINE relu(Vec256 zero_point) const { + return {vec_max(_vec0, zero_point._vec0), vec_max(_vec1, zero_point._vec1)}; + } + + Vec256 C10_ALWAYS_INLINE + relu6(Vec256 zero_point, Vec256 q_six) const { + vint8 max0 = vec_max(_vec0, zero_point._vec0); + vint8 max1 = vec_max(_vec1, zero_point._vec1); + return {vec_min(max0, q_six._vec0), vec_min(max1, q_six._vec1)}; + } + + int_vec_return_type widening_subtract(Vec256 b) const { + vint16 vecshi0 = vec_unpackh(_vec0); + vint16 vecBshi0 = vec_unpackh(b._vec0); + vint16 vecshi1 = vec_unpackl(_vec0); + vint16 vecBshi1 = vec_unpackl(b._vec0); + + vint16 vecshi2 = vec_unpackh(_vec1); + vint16 vecBshi2 = vec_unpackh(b._vec1); + 
vint16 vecshi3 = vec_unpackl(_vec1); + vint16 vecBshi3 = vec_unpackl(b._vec1); + + vint32 veci0 = vec_unpackh(vecshi0); + vint32 vecBi0 = vec_unpackh(vecBshi0); + vint32 veci1 = vec_unpackl(vecshi0); + vint32 vecBi1 = vec_unpackl(vecBshi0); + + vint32 veci2 = vec_unpackh(vecshi1); + vint32 vecBi2 = vec_unpackh(vecBshi1); + vint32 veci3 = vec_unpackl(vecshi1); + vint32 vecBi3 = vec_unpackl(vecBshi1); + + vint32 veci4 = vec_unpackh(vecshi2); + vint32 vecBi4 = vec_unpackh(vecBshi2); + vint32 veci5 = vec_unpackl(vecshi2); + vint32 vecBi5 = vec_unpackl(vecBshi2); + + vint32 veci6 = vec_unpackh(vecshi3); + vint32 vecBi6 = vec_unpackh(vecBshi3); + vint32 veci7 = vec_unpackl(vecshi3); + vint32 vecBi7 = vec_unpackl(vecBshi3); + + return { + Vec256(veci0 - vecBi0, veci1 - vecBi1), + Vec256(veci2 - vecBi2, veci3 - vecBi3), + Vec256(veci4 - vecBi4, veci5 - vecBi5), + Vec256(veci6 - vecBi6, veci7 - vecBi7)}; + } + + static Vec256 requantize_from_int( + const int_vec_return_type& inp, + float multiplier, + int32_t zero_point) { + vfloat32 vec_multiplier = vec_splats(multiplier); + vint32 vec_zero_point = vec_splats(zero_point); + + Vec256 vi0 = inp[0]; + Vec256 vi1 = inp[1]; + Vec256 vi2 = inp[2]; + Vec256 vi3 = inp[3]; + + vfloat32 vecf0 = vec_float(vi0.vec0()); + vfloat32 vecf1 = vec_float(vi0.vec1()); + vfloat32 vecf2 = vec_float(vi1.vec0()); + vfloat32 vecf3 = vec_float(vi1.vec1()); + + vfloat32 vecf4 = vec_float(vi2.vec0()); + vfloat32 vecf5 = vec_float(vi2.vec1()); + vfloat32 vecf6 = vec_float(vi3.vec0()); + vfloat32 vecf7 = vec_float(vi3.vec1()); + + vecf0 = vec_mul(vecf0, vec_multiplier); + vecf1 = vec_mul(vecf1, vec_multiplier); + vecf2 = vec_mul(vecf2, vec_multiplier); + vecf3 = vec_mul(vecf3, vec_multiplier); + + vecf4 = vec_mul(vecf4, vec_multiplier); + vecf5 = vec_mul(vecf5, vec_multiplier); + vecf6 = vec_mul(vecf6, vec_multiplier); + vecf7 = vec_mul(vecf7, vec_multiplier); + + vecf0 = vec_rint(vecf0); + vecf1 = vec_rint(vecf1); + vecf2 = vec_rint(vecf2); + vecf3 = vec_rint(vecf3); + + vecf4 = vec_rint(vecf4); + vecf5 = vec_rint(vecf5); + vecf6 = vec_rint(vecf6); + vecf7 = vec_rint(vecf7); + + vint32 veci0 = vec_signed(vecf0); + vint32 veci1 = vec_signed(vecf1); + vint32 veci2 = vec_signed(vecf2); + vint32 veci3 = vec_signed(vecf3); + + vint32 veci4 = vec_signed(vecf4); + vint32 veci5 = vec_signed(vecf5); + vint32 veci6 = vec_signed(vecf6); + vint32 veci7 = vec_signed(vecf7); + + veci0 = vec_add(veci0, vec_zero_point); + veci1 = vec_add(veci1, vec_zero_point); + veci2 = vec_add(veci2, vec_zero_point); + veci3 = vec_add(veci3, vec_zero_point); + + veci4 = vec_add(veci4, vec_zero_point); + veci5 = vec_add(veci5, vec_zero_point); + veci6 = vec_add(veci6, vec_zero_point); + veci7 = vec_add(veci7, vec_zero_point); + + vint16 vecshi0 = vec_packs(veci0, veci1); + vint16 vecshi1 = vec_packs(veci2, veci3); + vint16 vecshi2 = vec_packs(veci4, veci5); + vint16 vecshi3 = vec_packs(veci6, veci7); + + vint8 vec0 = vec_packs(vecshi0, vecshi1); + vint8 vec1 = vec_packs(vecshi2, vecshi3); + + return {vec0, vec1}; + } + + void dump() const { + value_type vals[size()]; + store((void*)vals); + for (int i = 0; i < size(); ++i) { + std::cout << (int)(vals[i]) << " "; + } + std::cout << std::endl; + } + + DEFINE_MEMBER_OP(operator==, c10::qint8, vec_cmpeq) + DEFINE_MEMBER_OP(operator!=, c10::qint8, vec_cmpne) + DEFINE_MEMBER_OP(operator<, c10::qint8, vec_cmplt) + DEFINE_MEMBER_OP(operator<=, c10::qint8, vec_cmple) + DEFINE_MEMBER_OP(operator>, c10::qint8, vec_cmpgt) + DEFINE_MEMBER_OP(operator>=, c10::qint8, 
vec_cmpge) + DEFINE_MEMBER_OP(operator+, c10::qint8, vec_add) + DEFINE_MEMBER_OP(operator-, c10::qint8, vec_sub) + DEFINE_MEMBER_OP(operator*, c10::qint8, vec_mul) + DEFINE_MEMBER_EMULATE_BINARY_OP(operator/, c10::qint8, /) + DEFINE_MEMBER_OP(maximum, c10::qint8, vec_max) + DEFINE_MEMBER_OP(minimum, c10::qint8, vec_min) + DEFINE_MEMBER_OP(operator&, c10::qint8, vec_and) + DEFINE_MEMBER_OP(operator|, c10::qint8, vec_or) + DEFINE_MEMBER_OP(operator^, c10::qint8, vec_xor) +}; + +template <> +Vec256 inline maximum( + const Vec256& a, + const Vec256& b) { + return a.maximum(b); +} + +template <> +Vec256 inline minimum( + const Vec256& a, + const Vec256& b) { + return a.minimum(b); +} +} // namespace +} // namespace vec256 +} // namespace at diff --git a/aten/src/ATen/cpu/vec256/vsx/vec256_quint8_vsx.h b/aten/src/ATen/cpu/vec256/vsx/vec256_quint8_vsx.h new file mode 100644 index 000000000000..96809ce32593 --- /dev/null +++ b/aten/src/ATen/cpu/vec256/vsx/vec256_quint8_vsx.h @@ -0,0 +1,413 @@ +#pragma once + +#include +#include +#include +#include +#include + +// This file defines Vec256<> for the quantized types. +// +// +// Currently, we simply use these classes as efficient converters between +// the quantized types and Vec256, usually in bandwidth-bound cases +// where doing the arithmetic in full-precision is acceptable (e.g. +// elementwise operators). +// +// +// Conversions are as follows: +// Vec256 -> 4x Vec256 +// +// The size of the returned float vector is specified by the special +// constexpr function float_num_vecs. The type of the value returned +// from dequantize (and expected as an argument to quantize) is +// specified by float_vec_return_type. +// +// When writing kernels with these vectors, it is expected that floating- +// point operations will be carried out in a loop over Vec256::float_num_vecs +// iterations. 
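+//
+// A rough usage sketch (illustrative only; the pointer and scale variable
+// names below are made up and not part of this header):
+//
+//   auto qv  = Vec256<c10::quint8>::loadu(in_ptr);
+//   auto fvs = qv.dequantize(scale_v, zero_point_v, scale_zp_premul_v);
+//   for (auto& fv : fvs) {   // float_num_vecs() iterations
+//     fv = fv * fv;          // arbitrary full-precision arithmetic
+//   }
+//   Vec256<c10::quint8>::quantize(fvs, scale, zero_point, inverse_scale)
+//       .store(out_ptr);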
+ +namespace at { +namespace vec256 { +namespace { + +const vint16 mask_unsigned = vec_splats((short int)0xFF); +template <> +struct Vec256 { + private: + union { + struct { + vuint8 _vec0; + vuint8 _vec1; + }; + struct { + vbool8 _vecb0; + vbool8 _vecb1; + }; + + } __attribute__((__may_alias__)); + + public: + Vec256() {} + static constexpr int size() { + return 32; + } + + static constexpr size_t float_num_vecs() { + return 4; + } + static constexpr int int_num_vecs() { + return 4; + } + using float_vec_return_type = std::array, 4>; + using int_vec_return_type = std::array, 4>; + using value_type = typename c10::quint8::underlying; + using vec_internal_type = vuint8; + using vec_internal_mask_type = vbool8; + // Broadcast constructor + C10_ALWAYS_INLINE Vec256(const c10::quint8& val) + : _vec0(vec_splats(val.val_)), _vec1(vec_splats(val.val_)) {} + + C10_ALWAYS_INLINE Vec256(const Vec256& other) + : _vec0{other._vec0}, _vec1(other._vec1) {} + + C10_ALWAYS_INLINE Vec256(vuint8 v) : _vec0{v}, _vec1{v} {} + C10_ALWAYS_INLINE Vec256(vbool8 vmask) : _vecb0{vmask}, _vecb1{vmask} {} + C10_ALWAYS_INLINE Vec256(vuint8 v1, vuint8 v2) : _vec0{v1}, _vec1{v2} {} + C10_ALWAYS_INLINE Vec256(vbool8 v1, vbool8 v2) : _vecb0{v1}, _vecb1{v2} {} + + C10_ALWAYS_INLINE const vec_internal_type& vec0() const { + return _vec0; + } + C10_ALWAYS_INLINE const vec_internal_type& vec1() const { + return _vec1; + } + + static C10_ALWAYS_INLINE Vec256 loadu( + const void* ptr, + int count = size()) { + if (count == size()) { + return { + vec_vsx_ld(offset0, reinterpret_cast(ptr)), + vec_vsx_ld(offset16, reinterpret_cast(ptr))}; + } + __at_align32__ value_type tmp_values[size()]; + std::memcpy(tmp_values, ptr, std::min(count, size()) * sizeof(value_type)); + return {vec_vsx_ld(offset0, tmp_values), vec_vsx_ld(offset16, tmp_values)}; + } + void C10_ALWAYS_INLINE store(void* ptr, int count = size()) const { + if (count == size()) { + vec_vsx_st(_vec0, offset0, reinterpret_cast(ptr)); + vec_vsx_st(_vec1, offset16, reinterpret_cast(ptr)); + } else if (count > 0) { + __at_align32__ value_type tmp_values[size()]; + vec_vsx_st(_vec0, offset0, tmp_values); + vec_vsx_st(_vec1, offset16, tmp_values); + std::memcpy( + ptr, tmp_values, std::min(count, size()) * sizeof(value_type)); + } + } + + public: + float_vec_return_type C10_ALWAYS_INLINE dequantize( + Vec256 scale, + Vec256 zero_point, + Vec256 scale_zp_premul) const { + // unpacking unsigned as signed + vint16 vecshi0 = vec_unpackh((vint8)_vec0); + vint16 vecshi1 = vec_unpackl((vint8)_vec0); + + vint16 vecshi2 = vec_unpackh((vint8)_vec1); + vint16 vecshi3 = vec_unpackl((vint8)_vec1); + + // signed -> unsigned + vecshi0 = vec_and(vecshi0, mask_unsigned); + vecshi1 = vec_and(vecshi1, mask_unsigned); + + vecshi2 = vec_and(vecshi2, mask_unsigned); + vecshi3 = vec_and(vecshi3, mask_unsigned); + + vint32 veci0 = vec_unpackh(vecshi0); + vint32 veci1 = vec_unpackl(vecshi0); + + vint32 veci2 = vec_unpackh(vecshi1); + vint32 veci3 = vec_unpackl(vecshi1); + + vint32 veci4 = vec_unpackh(vecshi2); + vint32 veci5 = vec_unpackl(vecshi2); + + vint32 veci6 = vec_unpackh(vecshi3); + vint32 veci7 = vec_unpackl(vecshi3); + + vfloat32 vecf0_0 = vec_float(veci0); + vfloat32 vecf1_0 = vec_float(veci1); + + vfloat32 vecf0_1 = vec_float(veci2); + vfloat32 vecf1_1 = vec_float(veci3); + + vfloat32 vecf0_2 = vec_float(veci4); + vfloat32 vecf1_2 = vec_float(veci5); + + vfloat32 vecf0_3 = vec_float(veci6); + vfloat32 vecf1_3 = vec_float(veci7); + vfloat32 scale_vec0 = scale.vec0(); + vfloat32 scale_vec1 = 
scale.vec1(); + vfloat32 scale_zp_premul0 = scale_zp_premul.vec0(); + vfloat32 scale_zp_premul1 = scale_zp_premul.vec1(); + return { + Vec256{ + vec_madd(scale_vec0, vecf0_0, scale_zp_premul0), + vec_madd(scale_vec1, vecf1_0, scale_zp_premul1)}, + Vec256{ + vec_madd(scale_vec0, vecf0_1, scale_zp_premul0), + vec_madd(scale_vec1, vecf1_1, scale_zp_premul1)}, + Vec256{ + vec_madd(scale_vec0, vecf0_2, scale_zp_premul0), + vec_madd(scale_vec1, vecf1_2, scale_zp_premul1)}, + Vec256{ + vec_madd(scale_vec0, vecf0_3, scale_zp_premul0), + vec_madd(scale_vec1, vecf1_3, scale_zp_premul1)}}; + } + + static Vec256 quantize( + const float_vec_return_type& rhs, + float scale, + int32_t zero_point, + float inverse_scale) { + // constexpr int32_t min_val = std::numeric_limits::min(); + // constexpr int32_t max_val = std::numeric_limits::max(); + + vfloat32 vec_inverse = vec_splats(inverse_scale); + vfloat32 vec_zero_point = vec_splats((float)zero_point); + // vuint32 vmin = vec_splats(min_val); + // vuint32 vmax = vec_splats(max_val); + Vec256 vf0 = rhs[0]; + Vec256 vf1 = rhs[1]; + Vec256 vf2 = rhs[2]; + Vec256 vf3 = rhs[3]; + vfloat32 vecf0 = vf0.vec0(); + vfloat32 vecf1 = vf0.vec1(); + vfloat32 vecf2 = vf1.vec0(); + vfloat32 vecf3 = vf1.vec1(); + + vfloat32 vecf4 = vf2.vec0(); + vfloat32 vecf5 = vf2.vec1(); + vfloat32 vecf6 = vf3.vec0(); + vfloat32 vecf7 = vf3.vec1(); + + vecf0 = vec_mul(vecf0, vec_inverse); + vecf1 = vec_mul(vecf1, vec_inverse); + vecf2 = vec_mul(vecf2, vec_inverse); + vecf3 = vec_mul(vecf3, vec_inverse); + + vecf4 = vec_mul(vecf4, vec_inverse); + vecf5 = vec_mul(vecf5, vec_inverse); + vecf6 = vec_mul(vecf6, vec_inverse); + vecf7 = vec_mul(vecf7, vec_inverse); + + vecf0 = vec_add(vec_rint(vecf0), vec_zero_point); + vecf1 = vec_add(vec_rint(vecf1), vec_zero_point); + vecf2 = vec_add(vec_rint(vecf2), vec_zero_point); + vecf3 = vec_add(vec_rint(vecf3), vec_zero_point); + + vecf4 = vec_add(vec_rint(vecf4), vec_zero_point); + vecf5 = vec_add(vec_rint(vecf5), vec_zero_point); + vecf6 = vec_add(vec_rint(vecf6), vec_zero_point); + vecf7 = vec_add(vec_rint(vecf7), vec_zero_point); + + vint32 veci0 = vec_signed(vecf0); + vint32 veci1 = vec_signed(vecf1); + vint32 veci2 = vec_signed(vecf2); + vint32 veci3 = vec_signed(vecf3); + + vint32 veci4 = vec_signed(vecf4); + vint32 veci5 = vec_signed(vecf5); + vint32 veci6 = vec_signed(vecf6); + vint32 veci7 = vec_signed(vecf7); + + vint16 vecshi0 = vec_packs(veci0, veci1); + vint16 vecshi1 = vec_packs(veci2, veci3); + vint16 vecshi2 = vec_packs(veci4, veci5); + vint16 vecshi3 = vec_packs(veci6, veci7); + + vuint8 vec0 = vec_packsu(vecshi0, vecshi1); + vuint8 vec1 = vec_packsu(vecshi2, vecshi3); + + return {vec0, vec1}; + } + + Vec256 C10_ALWAYS_INLINE relu(Vec256 zero_point) const { + return {vec_max(_vec0, zero_point._vec0), vec_max(_vec1, zero_point._vec1)}; + } + + Vec256 C10_ALWAYS_INLINE + relu6(Vec256 zero_point, Vec256 q_six) const { + vuint8 max0 = vec_max(_vec0, zero_point._vec0); + vuint8 max1 = vec_max(_vec1, zero_point._vec1); + return {vec_min(max0, q_six._vec0), vec_min(max1, q_six._vec1)}; + } + + int_vec_return_type widening_subtract(Vec256 b) const { + vint16 vecshi0 = vec_unpackh((vint8)_vec0); + vint16 vecBshi0 = vec_unpackh((vint8)b._vec0); + vint16 vecshi1 = vec_unpackl((vint8)_vec0); + vint16 vecBshi1 = vec_unpackl((vint8)b._vec0); + + vint16 vecshi2 = vec_unpackh((vint8)_vec1); + vint16 vecBshi2 = vec_unpackh((vint8)b._vec1); + vint16 vecshi3 = vec_unpackl((vint8)_vec1); + vint16 vecBshi3 = vec_unpackl((vint8)b._vec1); + + vecshi0 = 
vec_and(vecshi0, mask_unsigned); + vecBshi0 = vec_and(vecBshi0, mask_unsigned); + vecshi1 = vec_and(vecshi1, mask_unsigned); + vecBshi1 = vec_and(vecBshi1, mask_unsigned); + + vecshi2 = vec_and(vecshi2, mask_unsigned); + vecBshi2 = vec_and(vecBshi2, mask_unsigned); + vecshi3 = vec_and(vecshi3, mask_unsigned); + vecBshi3 = vec_and(vecBshi3, mask_unsigned); + + vint32 veci0 = vec_unpackh(vecshi0); + vint32 vecBi0 = vec_unpackh(vecBshi0); + vint32 veci1 = vec_unpackl(vecshi0); + vint32 vecBi1 = vec_unpackl(vecBshi0); + + vint32 veci2 = vec_unpackh(vecshi1); + vint32 vecBi2 = vec_unpackh(vecBshi1); + vint32 veci3 = vec_unpackl(vecshi1); + vint32 vecBi3 = vec_unpackl(vecBshi1); + + vint32 veci4 = vec_unpackh(vecshi2); + vint32 vecBi4 = vec_unpackh(vecBshi2); + vint32 veci5 = vec_unpackl(vecshi2); + vint32 vecBi5 = vec_unpackl(vecBshi2); + + vint32 veci6 = vec_unpackh(vecshi3); + vint32 vecBi6 = vec_unpackh(vecBshi3); + vint32 veci7 = vec_unpackl(vecshi3); + vint32 vecBi7 = vec_unpackl(vecBshi3); + + return { + Vec256(veci0 - vecBi0, veci1 - vecBi1), + Vec256(veci2 - vecBi2, veci3 - vecBi3), + Vec256(veci4 - vecBi4, veci5 - vecBi5), + Vec256(veci6 - vecBi6, veci7 - vecBi7)}; + } + + static Vec256 requantize_from_int( + const int_vec_return_type& inp, + float multiplier, + int32_t zero_point) { + vfloat32 vec_multiplier = vec_splats(multiplier); + vint32 vec_zero_point = vec_splats(zero_point); + + Vec256 vi0 = inp[0]; + Vec256 vi1 = inp[1]; + Vec256 vi2 = inp[2]; + Vec256 vi3 = inp[3]; + + vfloat32 vecf0 = vec_float(vi0.vec0()); + vfloat32 vecf1 = vec_float(vi0.vec1()); + vfloat32 vecf2 = vec_float(vi1.vec0()); + vfloat32 vecf3 = vec_float(vi1.vec1()); + + vfloat32 vecf4 = vec_float(vi2.vec0()); + vfloat32 vecf5 = vec_float(vi2.vec1()); + vfloat32 vecf6 = vec_float(vi3.vec0()); + vfloat32 vecf7 = vec_float(vi3.vec1()); + + vecf0 = vec_mul(vecf0, vec_multiplier); + vecf1 = vec_mul(vecf1, vec_multiplier); + vecf2 = vec_mul(vecf2, vec_multiplier); + vecf3 = vec_mul(vecf3, vec_multiplier); + + vecf4 = vec_mul(vecf4, vec_multiplier); + vecf5 = vec_mul(vecf5, vec_multiplier); + vecf6 = vec_mul(vecf6, vec_multiplier); + vecf7 = vec_mul(vecf7, vec_multiplier); + + vecf0 = vec_rint(vecf0); + vecf1 = vec_rint(vecf1); + vecf2 = vec_rint(vecf2); + vecf3 = vec_rint(vecf3); + + vecf4 = vec_rint(vecf4); + vecf5 = vec_rint(vecf5); + vecf6 = vec_rint(vecf6); + vecf7 = vec_rint(vecf7); + + vint32 veci0 = vec_signed(vecf0); + vint32 veci1 = vec_signed(vecf1); + vint32 veci2 = vec_signed(vecf2); + vint32 veci3 = vec_signed(vecf3); + + vint32 veci4 = vec_signed(vecf4); + vint32 veci5 = vec_signed(vecf5); + vint32 veci6 = vec_signed(vecf6); + vint32 veci7 = vec_signed(vecf7); + + veci0 = vec_add(veci0, vec_zero_point); + veci1 = vec_add(veci1, vec_zero_point); + veci2 = vec_add(veci2, vec_zero_point); + veci3 = vec_add(veci3, vec_zero_point); + + veci4 = vec_add(veci4, vec_zero_point); + veci5 = vec_add(veci5, vec_zero_point); + veci6 = vec_add(veci6, vec_zero_point); + veci7 = vec_add(veci7, vec_zero_point); + + vint16 vecshi0 = vec_packs(veci0, veci1); + vint16 vecshi1 = vec_packs(veci2, veci3); + vint16 vecshi2 = vec_packs(veci4, veci5); + vint16 vecshi3 = vec_packs(veci6, veci7); + + vuint8 vec0 = vec_packsu(vecshi0, vecshi1); + vuint8 vec1 = vec_packsu(vecshi2, vecshi3); + + return {vec0, vec1}; + } + + void dump() const { + value_type vals[size()]; + store((void*)vals); + for (int i = 0; i < size(); ++i) { + std::cout << (int)(vals[i]) << " "; + } + std::cout << std::endl; + } + + DEFINE_MEMBER_OP(operator==, 
c10::quint8, vec_cmpeq) + DEFINE_MEMBER_OP(operator!=, c10::quint8, vec_cmpne) + DEFINE_MEMBER_OP(operator<, c10::quint8, vec_cmplt) + DEFINE_MEMBER_OP(operator<=, c10::quint8, vec_cmple) + DEFINE_MEMBER_OP(operator>, c10::quint8, vec_cmpgt) + DEFINE_MEMBER_OP(operator>=, c10::quint8, vec_cmpge) + DEFINE_MEMBER_OP(operator+, c10::quint8, vec_add) + DEFINE_MEMBER_OP(operator-, c10::quint8, vec_sub) + DEFINE_MEMBER_OP(operator*, c10::quint8, vec_mul) + DEFINE_MEMBER_EMULATE_BINARY_OP(operator/, c10::quint8, /) + DEFINE_MEMBER_OP(maximum, c10::quint8, vec_max) + DEFINE_MEMBER_OP(minimum, c10::quint8, vec_min) + DEFINE_MEMBER_OP(operator&, c10::quint8, vec_and) + DEFINE_MEMBER_OP(operator|, c10::quint8, vec_or) + DEFINE_MEMBER_OP(operator^, c10::quint8, vec_xor) +}; + +template <> +Vec256 inline maximum( + const Vec256& a, + const Vec256& b) { + return a.maximum(b); +} + +template <> +Vec256 inline minimum( + const Vec256& a, + const Vec256& b) { + return a.minimum(b); +} + +} // namespace +} // namespace vec256 +} // namespace at diff --git a/aten/src/ATen/cpu/vec256/vsx/vsx_helpers.h b/aten/src/ATen/cpu/vec256/vsx/vsx_helpers.h new file mode 100644 index 000000000000..40cb7ef7a66e --- /dev/null +++ b/aten/src/ATen/cpu/vec256/vsx/vsx_helpers.h @@ -0,0 +1,332 @@ +#pragma once +#include +#include +#include + +using vbool8 = __attribute__((altivec(vector__))) __attribute__((altivec(bool__))) char; +using vbool16 = __attribute__((altivec(vector__))) __attribute__((altivec(bool__))) short; +using vbool32 = __attribute__((altivec(vector__))) __attribute__((altivec(bool__))) int; +using vbool64 = __attribute__((altivec(vector__))) __attribute__((altivec(bool__))) long long; +using vint8 = __attribute__((altivec(vector__))) signed char; +using vint16 = __attribute__((altivec(vector__))) signed short; +using vint32 = __attribute__((altivec(vector__))) signed int; +using vint64 = __attribute__((altivec(vector__))) signed long long; +using vuint8 = __attribute__((altivec(vector__))) unsigned char; +using vuint16 = __attribute__((altivec(vector__))) unsigned short; +using vuint32 = __attribute__((altivec(vector__))) unsigned int; +using vuint64 = __attribute__((altivec(vector__))) unsigned long long; +using vfloat32 = __attribute__((altivec(vector__))) float; +using vfloat64 = __attribute__((altivec(vector__))) double; + +#if !defined(vec_float) +C10_ALWAYS_INLINE vfloat32 vec_float(const vint32& vec_in) { + vfloat32 vec_out; + __asm__("xvcvsxwsp %x0,%x1" : "=wf"(vec_out) : "wa"(vec_in)); + return vec_out; +} +#endif + +#define vec_not(a) vec_nor(a, a) + +#define DEFINE_MEMBER_UNARY_OP(op, op_type, func) \ + Vec256 C10_ALWAYS_INLINE op() const { \ + return Vec256{func(_vec0), func(_vec1)}; \ + } + +#define DEFINE_MEMBER_OP(op, op_type, func) \ + Vec256 C10_ALWAYS_INLINE op(const Vec256& other) const { \ + return Vec256{ \ + func(_vec0, other._vec0), func(_vec1, other._vec1)}; \ + } + +#define DEFINE_MEMBER_BITWISE_OP(op, op_type, func) \ + Vec256 C10_ALWAYS_INLINE op(const Vec256& other) const { \ + return Vec256{ \ + func(_vecb0, other._vecb0), func(_vecb1, other._vecb1)}; \ + } + +#define DEFINE_MEMBER_TERNARY_OP(op, op_type, func) \ + Vec256 C10_ALWAYS_INLINE op( \ + const Vec256& b, const Vec256& c) const { \ + return Vec256{ \ + func(_vec0, b._vec0, c._vec0), func(_vec1, b._vec1, c._vec1)}; \ + } + +#define DEFINE_MEMBER_EMULATE_BINARY_OP(op, op_type, binary_op) \ + Vec256 C10_ALWAYS_INLINE op(const Vec256& b) const { \ + Vec256::vec_internal_type ret_0; \ + Vec256::vec_internal_type ret_1; \ + for 
(int i = 0; i < Vec256::size() / 2; i++) { \ + ret_0[i] = _vec0[i] binary_op b._vec0[i]; \ + ret_1[i] = _vec1[i] binary_op b._vec1[i]; \ + } \ + return Vec256{ret_0, ret_1}; \ + } + + +#define DEFINE_MEMBER_OP_AND_ONE(op, op_type, func) \ + Vec256 C10_ALWAYS_INLINE op(const Vec256& other) const { \ + using vvtype = Vec256::vec_internal_type; \ + const vvtype v_one = vec_splats(static_cast(1.0)); \ + vvtype ret0 = (vvtype)func(_vec0, other._vec0); \ + vvtype ret1 = (vvtype)func(_vec1, other._vec1); \ + return Vec256{vec_and(ret0, v_one), vec_and(ret1, v_one)}; \ + } + +#define DEFINE_CLAMP_FUNCS(operand_type) \ + template <> \ + Vec256 C10_ALWAYS_INLINE clamp( \ + const Vec256& a, \ + const Vec256& min, \ + const Vec256& max) { \ + return Vec256{ \ + vec_min(max.vec0(), vec_max(a.vec0(), min.vec0())), \ + vec_min(max.vec1(), vec_max(a.vec1(), min.vec1()))}; \ + } \ + template <> \ + Vec256 C10_ALWAYS_INLINE clamp_min( \ + const Vec256& a, const Vec256& min) { \ + return Vec256{ \ + vec_max(a.vec0(), min.vec0()), vec_max(a.vec1(), min.vec1())}; \ + } \ + template <> \ + Vec256 C10_ALWAYS_INLINE clamp_max( \ + const Vec256& a, const Vec256& max) { \ + return Vec256{ \ + vec_min(a.vec0(), max.vec0()), vec_min(a.vec1(), max.vec1())}; \ + } + +#define DEFINE_REINTERPRET_CAST_FUNCS( \ + first_type, cast_type, cast_inner_vector_type) \ + template <> \ + C10_ALWAYS_INLINE Vec256 cast( \ + const Vec256& src) { \ + return Vec256{(cast_inner_vector_type)src.vec0(), \ + (cast_inner_vector_type)src.vec1()}; \ + } + +#define DEFINE_REINTERPRET_CAST_TO_ALL_FUNCS(first_type) \ + DEFINE_REINTERPRET_CAST_FUNCS(first_type, double, vfloat64) \ + DEFINE_REINTERPRET_CAST_FUNCS(first_type, float, vfloat32) \ + DEFINE_REINTERPRET_CAST_FUNCS(first_type, int64_t, vint64) \ + DEFINE_REINTERPRET_CAST_FUNCS(first_type, int32_t, vint32) \ + DEFINE_REINTERPRET_CAST_FUNCS(first_type, int16_t, vint16) + +// it can be used to emulate blend faster +constexpr int blendChoice(uint32_t mask, uint32_t half1 = 0xF, uint32_t half2 = 0xF0) { + uint32_t none = 0; + uint32_t both = half1 | half2; + // clamp it between 0 and both + mask = mask & both; + // return (a._vec0, a._vec1) + if (mask == none) return 0; + // return (b._vec0,b._vec1) + else if (mask == both) + return 1; + // return (b._vec0,a._vec1) + else if (mask == half1) + return 2; + // return (a._vec0,b._vec1) + else if (mask == half2) + return 3; + // return (*_vec0,a._vec1) + else if (mask > 0 && mask < half1) + return 4; + // return (*_vec0,b._vec1) + else if ((mask & half2) == half2) + return 5; + // return (a._vec0,*_vec1) + else if ((mask & half1) == 0 && mask > half1) + return 6; + // return (b._vec0,*_vec1) + else if ((mask & half1) == half1 && mask > half1) + return 7; + // return (*_vec0,*_vec1) + return 8; +} + +// it can be used to emulate blend faster +constexpr int blendChoiceDbl(uint32_t mask) { + // clamp it 0 and 0xF + return blendChoice(mask, 0x3, 0xC); +} + +constexpr vbool32 VsxMask1(uint32_t mask) { + uint32_t g0 = (mask & 1) * 0xffffffff; + uint32_t g1 = ((mask & 2) >> 1) * 0xffffffff; + uint32_t g2 = ((mask & 4) >> 2) * 0xffffffff; + uint32_t g3 = ((mask & 8) >> 3) * 0xffffffff; + return (vbool32){g0, g1, g2, g3}; +} + +constexpr vbool32 VsxMask2(uint32_t mask) { + uint32_t mask2 = (mask & 0xFF) >> 4; + return VsxMask1(mask2); +} + +constexpr vbool64 VsxDblMask1(uint32_t mask) { + uint64_t g0 = (mask & 1) * 0xffffffffffffffff; + uint64_t g1 = ((mask & 2) >> 1) * 0xffffffffffffffff; + return (vbool64){g0, g1}; +} + +constexpr vbool64 
VsxDblMask2(uint32_t mask) { + uint32_t mask2 = (mask & 0xF) >> 2; + return VsxDblMask1(mask2); +} + +constexpr int maskForComplex(uint32_t mask) { + mask = mask & 0xF; + int complex_mask = 0; + if (mask & 1) complex_mask |= 3; + if (mask & 2) complex_mask |= (3 << 2); + if (mask & 4) complex_mask |= (3 << 4); + if (mask & 8) complex_mask |= (3 << 6); + return complex_mask; +} + +constexpr int maskForComplexDbl(uint32_t mask) { + mask = mask & 0x3; + int complex_mask = 0; + if (mask & 1) complex_mask |= 3; + if (mask & 2) complex_mask |= (3 << 2); + return complex_mask; +} + +constexpr int blendChoiceComplex(uint32_t mask) { + return blendChoice(maskForComplex(mask)); +} + +constexpr int blendChoiceComplexDbl(uint32_t mask) { + return blendChoiceDbl(maskForComplexDbl(mask)); +} + +constexpr vbool32 VsxComplexMask1(uint32_t mask) { + return VsxMask1(maskForComplex(mask)); +} + +constexpr vbool32 VsxComplexMask2(uint32_t mask) { + uint32_t mask2 = (mask & 0xF) >> 2; + return VsxMask1(maskForComplex(mask2)); +} + +constexpr vbool64 VsxComplexDblMask1(uint32_t mask) { return VsxDblMask1(mask); } + +constexpr vbool64 VsxComplexDblMask2(uint32_t mask) { + uint32_t mask2 = (mask & 0xF) >> 2; + return VsxDblMask1(mask2); +} + +// constants +namespace at { +namespace vec256 { +// See Note [Acceptable use of anonymous namespace in header] +namespace { +// + constexpr int offset0 = 0; + constexpr int offset16 = 16; + +//#Constants +const vuint8 mask_zero_bits = vuint8{128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 96, 64, 32, 0}; + +const vuint8 swap_mask = + vuint8{4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}; + +const vint32 v0x7f = vec_splats(0x7f); +const vint32 vi_0 = vec_splats((int)(0)); +const vint32 vi_1 = vec_splats((int)1); +const vint32 vi_2 = vec_splats((int)2); +const vint32 vi_4 = vec_splats((int)4); +const vint32 vi_inv1 = vec_splats((int)~1); +const vuint32 vu_29 = vec_splats(29u); +const vuint32 vu_23 = vec_splats(23u); + +const vbool32 inv_mant_mask = (vbool32)vec_splats((unsigned int)~0xff800000); +const vbool32 sign_mask = (vbool32)vec_splats((int)0x80000000); +const vbool32 real_mask = vbool32{0xFFFFFFFF, 0x0, 0xFFFFFFFF, 0x0}; +const vbool32 imag_mask = vbool32{0x0, 0xFFFFFFFF, 0x0, 0xFFFFFFFF}; +const vbool32 isign_mask = vbool32{0x0, 0x80000000, 0x0, 0x80000000}; +const vbool32 rsign_mask = vbool32{0x80000000, 0x0, 0x80000000, 0x0}; + +const vbool64 vd_imag_mask = vbool64{0x0, 0xFFFFFFFFFFFFFFFF}; +const vbool64 vd_real_mask = vbool64{0xFFFFFFFFFFFFFFFF, 0x0}; +const vbool64 vd_isign_mask = vbool64{0x0, 0x8000000000000000}; +const vbool64 vd_rsign_mask = vbool64{0x8000000000000000, 0x0}; + +const vfloat32 zero = vec_splats(0.f); +const vfloat32 half = vec_splats(0.5f); +const vfloat32 one = vec_splats(1.f); +const vfloat32 two = vec_splats(2.0f); +const vfloat32 _4div_pi = vec_splats(1.27323954473516f); +const vfloat32 v_inf = (vfloat32)vec_splats(0x7f800000u); +const vfloat32 v_minus_inf = vfloat32{ 0xff800000u, 0xff800000u, 0xff800000u, 0xff800000u }; +const vfloat32 v_nan = (vfloat32)vec_splats(0x7fffffff); +const vfloat32 log10e_inv = vec_splats(0.43429448190325176f); +const vfloat32 log2e_inv = vec_splats(1.4426950408889634f); +const vfloat32 log2eB_inv = vec_splats(1.442695036924675f); +const vfloat32 cephes_SQRTHF = vec_splats(0.707106781186547524f); +const vfloat32 coscof_p0 = vec_splats(2.443315711809948E-005f); +const vfloat32 coscof_p1 = vec_splats(-1.388731625493765E-003f); +const vfloat32 coscof_p2 = vec_splats(4.166664568298827E-002f); 
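+// cephes-style minimax polynomial coefficients and range limits for the
+// exp/log/sin/cos/tanh approximations: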
+const vfloat32 exp_hi = vec_splats(104.f); +const vfloat32 exp_lo = vec_splats(-104.f); +const vfloat32 exp_p0 = vec_splats(0.000198527617612853646278381f); +const vfloat32 exp_p1 = vec_splats((0.00139304355252534151077271f)); +const vfloat32 exp_p2 = vec_splats(0.00833336077630519866943359f); +const vfloat32 exp_p3 = vec_splats(0.0416664853692054748535156f); +const vfloat32 exp_p4 = vec_splats(0.166666671633720397949219f); +const vfloat32 exp_p5 = vec_splats(0.5f); +const vfloat32 log_p0 = vec_splats(7.0376836292E-2f); +const vfloat32 log_p1 = vec_splats(-1.1514610310E-1f); +const vfloat32 log_p2 = vec_splats(1.1676998740E-1f); +const vfloat32 log_p3 = vec_splats(-1.2420140846E-1f); +const vfloat32 log_p4 = vec_splats(+1.4249322787E-1f); +const vfloat32 log_p5 = vec_splats(-1.6668057665E-1f); +const vfloat32 log_p6 = vec_splats(+2.0000714765E-1f); +const vfloat32 log_p7 = vec_splats(-2.4999993993E-1f); +const vfloat32 log_p8 = vec_splats(+3.3333331174E-1f); +const vfloat32 log_q1 = vec_splats(-2.12194440e-4f); +const vfloat32 log_q2 = vec_splats(0.693359375f); +const vfloat32 max_logf = vec_splats(88.02969187150841f); +const vfloat32 max_numf = vec_splats(1.7014117331926442990585209174225846272e38f); +const vfloat32 min_inf = (vfloat32)vec_splats(0xff800000u); +const vfloat32 min_norm_pos = (vfloat32)vec_splats(0x0800000u); +const vfloat32 minus_cephes_dp1 = vec_splats(-0.78515625f); +const vfloat32 minus_cephes_dp2 = vec_splats(-2.4187564849853515625e-4f); +const vfloat32 minus_cephes_dp3 = vec_splats(-3.77489497744594108e-8f); +const vfloat32 negln2f_hi = vec_splats(-0.693145751953125f); +const vfloat32 negln2f_lo = vec_splats(-1.428606765330187045e-06f); +const vfloat32 p0 = vec_splats(2.03721912945E-4f); +const vfloat32 p1 = vec_splats(8.33028376239E-3f); +const vfloat32 p2 = vec_splats(1.66667160211E-1f); +const vfloat32 sincof_p0 = vec_splats(-1.9515295891E-4f); +const vfloat32 sincof_p1 = vec_splats(8.3321608736E-3f); +const vfloat32 sincof_p2 = vec_splats(-1.6666654611E-1f); +const vfloat32 tanh_0p625 = vec_splats(0.625f); +const vfloat32 tanh_half_max = vec_splats(44.014845935754205f); +const vfloat32 tanh_p0 = vec_splats(-5.70498872745E-3f); +const vfloat32 tanh_p1 = vec_splats(2.06390887954E-2f); +const vfloat32 tanh_p2 = vec_splats(-5.37397155531E-2f); +const vfloat32 tanh_p3 = vec_splats(1.33314422036E-1f); +const vfloat32 tanh_p4 = vec_splats(-3.33332819422E-1f); +const vfloat32 vcheck = vec_splats((float)(1LL << 24)); +const vfloat32 imag_one = vfloat32{0.f, 1.f, 0.f, 1.f}; +const vfloat32 imag_half = vfloat32{0.f, 0.5f, 0.f, 0.5f}; +const vfloat32 sqrt2_2 = vfloat32{0.70710676908493042f, 0.70710676908493042, + 0.70710676908493042, 0.70710676908493042}; +const vfloat32 pi_2 = vfloat32{M_PI / 2, 0.0, M_PI / 2, 0.0}; +const vfloat32 vf_89 = vfloat32{89.f, 89.f, 89.f, 89.f}; +const vfloat64 vd_one = vec_splats(1.0); +const vfloat64 vd_zero = vec_splats(0.0); +const vfloat64 vd_log10e_inv = vec_splats(0.43429448190325176); +const vfloat64 vd_log2e_inv = vec_splats(1.4426950408889634); +const vfloat64 vd_imag_one = vfloat64{0.0, 1.0}; +const vfloat64 vd_imag_half = vfloat64{0.0, 0.5}; +const vfloat64 vd_sqrt2_2 = vfloat64{0.70710678118654757, 0.70710678118654757}; +const vfloat64 vd_pi_2 = vfloat64{M_PI / 2.0, 0.0}; + +} // namespace +} // namespace vec256 +} // namespace at + diff --git a/aten/src/ATen/native/DispatchStub.cpp b/aten/src/ATen/native/DispatchStub.cpp index d4c106477fe7..0c562f363731 100644 --- a/aten/src/ATen/native/DispatchStub.cpp +++ 
b/aten/src/ATen/native/DispatchStub.cpp @@ -11,12 +11,18 @@ namespace at { namespace native { static CPUCapability compute_cpu_capability() { auto envar = std::getenv("ATEN_CPU_CAPABILITY"); if (envar) { +#ifdef HAVE_VSX_CPU_DEFINITION + if (strcmp(envar, "vsx") == 0) { + return CPUCapability::VSX; + } +#else if (strcmp(envar, "avx2") == 0) { return CPUCapability::AVX2; } if (strcmp(envar, "avx") == 0) { return CPUCapability::AVX; } +#endif if (strcmp(envar, "default") == 0) { return CPUCapability::DEFAULT; } @@ -33,7 +39,11 @@ static CPUCapability compute_cpu_capability() { } } #endif +#ifdef HAVE_VSX_CPU_DEFINITION + return CPUCapability::VSX; +#else return CPUCapability::DEFAULT; +#endif } CPUCapability get_cpu_capability() { diff --git a/aten/src/ATen/native/DispatchStub.h b/aten/src/ATen/native/DispatchStub.h index 63e2462489be..0368fa9741e9 100644 --- a/aten/src/ATen/native/DispatchStub.h +++ b/aten/src/ATen/native/DispatchStub.h @@ -47,8 +47,12 @@ namespace at { namespace native { enum class CPUCapability { DEFAULT = 0, +#ifdef HAVE_VSX_CPU_DEFINITION + VSX = 1, +#else AVX = 1, AVX2 = 2, +#endif NUM_OPTIONS }; @@ -101,6 +105,12 @@ struct CAFFE2_API DispatchStub { AT_ASSERTM(AVX, "DispatchStub: missing AVX kernel"); return AVX; } +#endif +#ifdef HAVE_VSX_CPU_DEFINITION + if (capability >= static_cast(CPUCapability::VSX)) { + AT_ASSERTM(VSX, "DispatchStub: missing VSX kernel"); + return VSX; + } #endif AT_ASSERTM(DEFAULT, "DispatchStub: missing default kernel"); return DEFAULT; @@ -124,6 +134,9 @@ struct CAFFE2_API DispatchStub { #ifdef HAVE_AVX2_CPU_DEFINITION static FnPtr AVX2; #endif +#ifdef HAVE_VSX_CPU_DEFINITION + static FnPtr VSX; +#endif }; namespace { @@ -173,10 +186,17 @@ struct RegisterHIPDispatch { #define REGISTER_AVX2_DISPATCH(name, fn) #endif +#ifdef HAVE_VSX_CPU_DEFINITION +#define REGISTER_VSX_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, VSX, fn) +#else +#define REGISTER_VSX_DISPATCH(name, fn) +#endif + #define REGISTER_NO_CPU_DISPATCH(name, fn_type) \ REGISTER_ARCH_DISPATCH(name, DEFAULT, static_cast(nullptr)) \ REGISTER_AVX_DISPATCH(name, static_cast(nullptr)) \ - REGISTER_AVX2_DISPATCH(name, static_cast(nullptr)) + REGISTER_AVX2_DISPATCH(name, static_cast(nullptr)) \ + REGISTER_VSX_DISPATCH(name, static_cast(nullptr)) #define REGISTER_CUDA_DISPATCH(name, fn) \ static RegisterCUDADispatch name ## __register(name, fn); diff --git a/cmake/Codegen.cmake b/cmake/Codegen.cmake index db02f7a8fb16..a9d2e4f50e45 100644 --- a/cmake/Codegen.cmake +++ b/cmake/Codegen.cmake @@ -110,6 +110,12 @@ if(INTERN_BUILD_ATEN_OPS) endif(MSVC) endif(CXX_AVX2_FOUND) + if(CXX_VSX_FOUND) + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_VSX_CPU_DEFINITION") + LIST(APPEND CPU_CAPABILITY_NAMES "VSX") + LIST(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG} ${CXX_VSX_FLAGS}") + endif(CXX_VSX_FOUND) + list(LENGTH CPU_CAPABILITY_NAMES NUM_CPU_CAPABILITY_NAMES) math(EXPR NUM_CPU_CAPABILITY_NAMES "${NUM_CPU_CAPABILITY_NAMES}-1") diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 6c3bed552533..968456c40490 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1638,6 +1638,7 @@ if(NOT INTERN_BUILD_MOBILE) add_compile_options(-DUSE_GCC_GET_CPUID) endif() + find_package(VSX) # checks VSX find_package(AVX) # checks AVX and AVX2 # we don't set -mavx and -mavx2 flags globally, but only for specific files diff --git a/cmake/Modules/FindVSX.cmake b/cmake/Modules/FindVSX.cmake new file mode 100644 index 000000000000..74691f9240fb --- /dev/null +++ b/cmake/Modules/FindVSX.cmake 
@@ -0,0 +1,35 @@ + +IF(CMAKE_SYSTEM_NAME MATCHES "Linux") + message("-- ") + EXEC_PROGRAM(LD_SHOW_AUXV=1 ARGS "/bin/true" OUTPUT_VARIABLE bintrue) + if(bintrue MATCHES "AT_PLATFORM:[ \\t\\n\\r]*([a-zA-Z0-9_]+)[ \\t\\n\\r]*") + if(CMAKE_MATCH_COUNT GREATER 0) + string(TOLOWER ${CMAKE_MATCH_1} platform) + if(${platform} MATCHES "^power") + message("-- POWER Platform: ${platform}") + SET(POWER_COMP TRUE CACHE BOOL "power ") + SET(CXX_VSX_FLAGS "${CXX_VSX_FLAGS} -mcpu=${platform} -mtune=${platform}" ) + endif() + endif() + endif() + SET(VSX_CODE " #include + int main() { + float __attribute__((aligned(16))) vptr_y[8] = { 1.0f,2.f,3.f,4.f,4.f,3.f,2.f,1.f }; + __vector float v_result = vec_add(vec_vsx_ld(0, vptr_y), vec_vsx_ld(16, vptr_y)); + return 0; + }") + #check_cxx_compiler_flag(-mvsx vsx_flag) + SET(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS}) + SET(CMAKE_REQUIRED_FLAGS "-mvsx") + CHECK_C_SOURCE_COMPILES("${VSX_CODE}" C_VSX_FOUND) + CHECK_CXX_SOURCE_COMPILES("${VSX_CODE}" CXX_VSX_FOUND) + SET(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE}) + if(CXX_VSX_FOUND) + message("-- VSX flag was set.") + SET(CXX_VSX_FLAGS "${CXX_VSX_FLAGS} -mvsx" ) + elseif(POWER_COMP) + message(WARNING "-- VSX flag was not set.") + endif() + message("-- ") +endif() + From f4226b5c90c1c7a4260b1575dbd06925f16c23f8 Mon Sep 17 00:00:00 2001 From: Bram Wasti Date: Thu, 10 Dec 2020 14:01:36 -0800 Subject: [PATCH 147/250] [static runtime] add static subgraph fusion pass (#49185) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49185 This diff adds a fusion feature that will let us use static runtime for *parts* of the graph. This will prove useful in cases where fully eliminating control flow is hard etc. TODO: [x] factor out into separate fusion file [x] add python test case [x] add graph that isn't fully lowered test case [x] add graph that has weird list/tuple outputs test case the loop example looks quite good: ``` graph(%a.1 : Tensor, %b.1 : Tensor, %iters.1 : int): %12 : bool = prim::Constant[value=1]() # /data/users/bwasti/fbsource/fbcode/buck-out/dev/gen/caffe2/test/static_runtime#binary,link-tree/test_static_runtime.py:110:4 %c.2 : Tensor = prim::StaticSubgraph_0(%a.1, %b.1) %c : Tensor = prim::Loop(%iters.1, %12, %c.2) # /data/users/bwasti/fbsource/fbcode/buck-out/dev/gen/caffe2/test/static_runtime#binary,link-tree/test_static_runtime.py:110:4 block0(%i : int, %c.12 : Tensor): %c.10 : Tensor = prim::StaticSubgraph_1(%a.1, %c.12, %b.1) -> (%12, %c.10) return (%c) with prim::StaticSubgraph_0 = graph(%0 : Tensor, %4 : Tensor): %5 : int = prim::Constant[value=2]() %6 : Tensor = aten::mul(%4, %5) # /data/users/bwasti/fbsource/fbcode/buck-out/dev/gen/caffe2/test/static_runtime#binary,link-tree/test_static_runtime.py:109:12 %2 : int = prim::Constant[value=1]() %c.2 : Tensor = aten::add(%0, %6, %2) # /data/users/bwasti/fbsource/fbcode/buck-out/dev/gen/caffe2/test/static_runtime#binary,link-tree/test_static_runtime.py:109:8 return (%c.2) with prim::StaticSubgraph_1 = graph(%1 : Tensor, %7 : Tensor, %8 : Tensor): %9 : int = prim::Constant[value=1]() %c.4 : Tensor = aten::add(%7, %8, %9) # /data/users/bwasti/fbsource/fbcode/buck-out/dev/gen/caffe2/test/static_runtime#binary,link-tree/test_static_runtime.py:111:12 %5 : int = prim::Constant[value=2]() %c.7 : Tensor = aten::mul_(%c.4, %5) # /data/users/bwasti/fbsource/fbcode/buck-out/dev/gen/caffe2/test/static_runtime#binary,link-tree/test_static_runtime.py:112:8 %2 : int = prim::Constant[value=1]() %c.10 : Tensor = aten::sub_(%c.7, %1, %2) # 
/data/users/bwasti/fbsource/fbcode/buck-out/dev/gen/caffe2/test/static_runtime#binary,link-tree/test_static_runtime.py:113:8 return (%c.10) ``` (Note: this ignores all push blocking failures!) Test Plan: buck test mode/no-gpu //caffe2/benchmarks/static_runtime:static_runtime_cpptest buck test mode/no-gpu caffe2/test:static_runtime Reviewed By: bertmaher Differential Revision: D25385702 fbshipit-source-id: 2f24af4f11d92a959167facd03fbd24f464a6098 --- aten/src/ATen/core/interned_strings.h | 1 + .../static_runtime/test_static_runtime.cc | 32 +++ test/test_static_runtime.py | 73 +++++ tools/build_variables.bzl | 1 + torch/csrc/jit/ir/alias_analysis.cpp | 1 + torch/csrc/jit/runtime/interpreter.cpp | 1 + torch/csrc/jit/runtime/operator.cpp | 2 + torch/csrc/jit/runtime/static/fusion.cpp | 254 ++++++++++++++++++ torch/csrc/jit/runtime/static/fusion.h | 11 + torch/csrc/jit/runtime/static/impl.cpp | 10 +- torch/csrc/jit/runtime/static/impl.h | 3 + torch/csrc/jit/runtime/static/init.cpp | 21 +- 12 files changed, 406 insertions(+), 4 deletions(-) create mode 100644 torch/csrc/jit/runtime/static/fusion.cpp create mode 100644 torch/csrc/jit/runtime/static/fusion.h diff --git a/aten/src/ATen/core/interned_strings.h b/aten/src/ATen/core/interned_strings.h index 7a74ec3b1736..70247924c736 100644 --- a/aten/src/ATen/core/interned_strings.h +++ b/aten/src/ATen/core/interned_strings.h @@ -39,6 +39,7 @@ namespace c10 { _(prim, FunctionalGraph) \ _(prim, DifferentiableGraph) \ _(prim, TensorExprGroup) \ + _(prim, StaticSubgraph) \ _(prim, If) \ _(prim, Jump) /* debug */ \ _(prim, JumpNZ) /* debug */ \ diff --git a/benchmarks/static_runtime/test_static_runtime.cc b/benchmarks/static_runtime/test_static_runtime.cc index 07f9ac253b9f..251e2654b013 100644 --- a/benchmarks/static_runtime/test_static_runtime.cc +++ b/benchmarks/static_runtime/test_static_runtime.cc @@ -1,4 +1,5 @@ #include +#include #include #include "deep_wide_pt.h" #include "test_scripts.h" @@ -249,3 +250,34 @@ TEST(StaticRuntime, CleanUpMemory) { } } } + +TEST(StaticRuntime, FusionPass) { + const int embedding_size = 32; + const int num_features = 50; + for (int batch_size : {1, 8, 32}) { + for (int i = 0; i < 2; ++i) { + torch::jit::Module module = getDeepAndWideSciptModel(); + auto ad_emb_packed = torch::randn({batch_size, 1, embedding_size}); + auto user_emb = torch::randn({batch_size, 1, embedding_size}); + auto wide = torch::randn({batch_size, num_features}); + + // run jit graph executor + std::vector inputs({ad_emb_packed, user_emb, wide}); + auto output_1 = getTensor(module.forward(inputs)); + + Method method = module.get_method("forward"); + auto graph = method.graph(); + fuseStaticSubgraphs(graph); + bool hit = false; + for (const auto& n : module.get_method("forward").graph()->nodes()) { + if (n->kind() == torch::jit::prim::StaticSubgraph) { + hit = true; + } + } + EXPECT_TRUE(hit); + auto output_2 = getTensor(module.forward(inputs)); + EXPECT_TRUE(output_1.equal(output_2)); + } + } +} + diff --git a/test/test_static_runtime.py b/test/test_static_runtime.py index 7a622b0e90c6..5c6c4714a518 100644 --- a/test/test_static_runtime.py +++ b/test/test_static_runtime.py @@ -105,6 +105,21 @@ def trivial_graph(a, b, c): s = torch.tensor([[3, 3], [3, 3]]) return a + b * c + s +def loop_graph(a, b, iters : int): + c = a + b * 2 + for i in range(iters): + c = c + b + c *= 2 + c -= a + return c + +def output_graph(a, b, c, iters : int): + s = torch.tensor([[3, 3], [3, 3]]) + k = a + b * c + s + d : Dict[int, Tensor] = {} + for i in range(iters): + 
d[i] = k + i + return d class TestStaticRuntime(TestCase): def test_multihead_attention_layer(self): @@ -203,5 +218,63 @@ def test_leaky_relu(self): o_test = tg_a(s)[0] torch.testing.assert_allclose(o_ref, o_test) + def test_fusion_trivial_graph(self): + s = torch.full((2, 2), 2) + tg = torch.jit.script(trivial_graph) + o_ref = tg(s, s, s) + torch._C._fuse_to_static_runtime(tg.graph) + assert "StaticSubgraph" in str(tg.graph) + o_test = tg(s, s, s) + torch.testing.assert_allclose(o_ref, o_test) + + def test_fusion_multihead_attention_layer(self): + HID_DIM = 256 + QUERY_LEN = 8 + BATCH_SIZE = 128 + LAYERS = 3 + HEADS = 8 + DROPOUT = 0.1 + device = torch.device("cpu") + attention = MultiHeadAttentionLayer(HID_DIM, HEADS, DROPOUT, device).to(device) + with torch.no_grad(): + src = torch.randn(BATCH_SIZE, QUERY_LEN, HID_DIM).to(device) + src_mask = (src > 0)[:, :, 0].unsqueeze(1).unsqueeze(2).to(device) + + attention.eval() + attention = torch.jit.script(attention) + attention.eval() + o_ref = attention(src, src, src, src_mask) + + torch._C._fuse_to_static_runtime(attention._c) + o_test = attention(src, src, src, src_mask) + + for a, b in zip(o_ref, o_test): + torch.testing.assert_allclose(a, b) + + def test_fusion_loop(self): + a = torch.randn(5, 5) + b = torch.randn(5, 5) + c = 4 + lg = torch.jit.script(loop_graph) + o_ref = lg(a, b, c) + torch._C._fuse_to_static_runtime(lg.graph) + assert "StaticSubgraph" in str(lg.graph) + o_test = lg(a, b, c) + torch.testing.assert_allclose(o_ref, o_test) + + def test_fusion_outputs(self): + a = torch.randn(2, 2) + b = torch.randn(2, 2) + c = 4 + og = torch.jit.script(output_graph) + o_ref = og(a, b, b, c) + torch._C._fuse_to_static_runtime(og.graph) + assert "StaticSubgraph" in str(og.graph) + o_test = og(a, b, b, c) + for i in o_ref.keys(): + torch.testing.assert_allclose(o_ref[i], o_test[i]) + + + if __name__ == "__main__": run_tests() diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index 146abca386eb..9121b7c84537 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -266,6 +266,7 @@ core_sources_full_mobile = [ ] core_sources_full = core_sources_full_mobile + [ + "torch/csrc/jit/runtime/static/fusion.cpp", "torch/csrc/jit/runtime/static/impl.cpp", "torch/csrc/jit/runtime/static/ops.cpp", "torch/csrc/jit/runtime/static/passes.cpp", diff --git a/torch/csrc/jit/ir/alias_analysis.cpp b/torch/csrc/jit/ir/alias_analysis.cpp index 67dbe193f11c..000bea53e0fc 100644 --- a/torch/csrc/jit/ir/alias_analysis.cpp +++ b/torch/csrc/jit/ir/alias_analysis.cpp @@ -486,6 +486,7 @@ void AliasDb::analyzeImpl(Node* node) { return analyzeGradOf(node); // TODO: think more about TensorExpr alias correctness case prim::TensorExprGroup: + case prim::StaticSubgraph: case prim::Constant: case prim::AutogradZero: case prim::AutogradAdd: diff --git a/torch/csrc/jit/runtime/interpreter.cpp b/torch/csrc/jit/runtime/interpreter.cpp index f8f35d1aa818..3a028175d9c3 100644 --- a/torch/csrc/jit/runtime/interpreter.cpp +++ b/torch/csrc/jit/runtime/interpreter.cpp @@ -320,6 +320,7 @@ struct CanEmitInline { // by the later BailOut in createBailoutBlock and its jf_index // will become invalid. 
v->node()->kind() != prim::TensorExprGroup && + v->node()->kind() != prim::StaticSubgraph && v->node()->kind() != prim::CudaFusionGroup && v->node()->kind() != prim::FusionGroup && v->node()->kind() != prim::BailOut && v->uses().size() == 1 && diff --git a/torch/csrc/jit/runtime/operator.cpp b/torch/csrc/jit/runtime/operator.cpp index b9e0f5fbd3fe..1964679fda19 100644 --- a/torch/csrc/jit/runtime/operator.cpp +++ b/torch/csrc/jit/runtime/operator.cpp @@ -239,6 +239,7 @@ bool printerHasSpecialCaseFor(Symbol sym) { prim::CudaFusionGroup, // optimization pass adds it prim::CudaFusionGuard, // optimization pass adds it prim::TensorExprGroup, // optimization pass adds it + prim::StaticSubgraph, // optimization pass adds it prim::Load, // used in interpreter only prim::MMTreeReduce, // used as an optimization prim::MMBatchSide, // used as an optimization @@ -276,6 +277,7 @@ bool aliasAnalysisHasSpecialCaseFor(Symbol symbol) { prim::CudaFusionGroup, prim::DifferentiableGraph, prim::TensorExprGroup, + prim::StaticSubgraph, prim::FunctionalGraph, prim::Constant, prim::Uninitialized, diff --git a/torch/csrc/jit/runtime/static/fusion.cpp b/torch/csrc/jit/runtime/static/fusion.cpp new file mode 100644 index 000000000000..fc8defe2dcfb --- /dev/null +++ b/torch/csrc/jit/runtime/static/fusion.cpp @@ -0,0 +1,254 @@ +#include +#include +#include +#include +#include +#include +#include + +namespace torch { +namespace jit { + +void createFusionGroups(Block* block, AliasDb* aliasDb); + +void fuseStaticSubgraphs(std::shared_ptr graph) { + PrepareGraphForStaticRuntime(graph); + auto aliasDb = torch::make_unique(graph); + createFusionGroups(graph->block(), aliasDb.get()); + torch::jit::EliminateDeadCode(graph); +} + +Operation createStaticSubgraphRuntime(const Node* node) { + auto g = torch::jit::PrepareForStaticRuntime(node->g(attr::Subgraph)); + auto runtime = std::make_shared(g); + auto num_inputs = runtime->get_inference_module()->input_regs.size(); + return [runtime, num_inputs](Stack* stack) { + RECORD_FUNCTION("Static Runtime", std::vector()); + auto inps = torch::jit::last(stack, num_inputs); + std::vector t_inputs; + t_inputs.reserve(num_inputs); + for (const auto& inp : inps) { + t_inputs.emplace_back(inp.toTensor()); + } + torch::jit::drop(stack, num_inputs); + auto outputs = runtime->run(t_inputs); + for (auto& o : outputs) { + push_one(*stack, std::move(o)); + } + return 0; + }; +} + +RegisterOperators StaticSubgraphOps({torch::jit::Operator( + prim::StaticSubgraph, + createStaticSubgraphRuntime, + AliasAnalysisKind::INTERNAL_SPECIAL_CASE)}); + +#define REQ(cond) \ + if (!(cond)) { \ + GRAPH_DEBUG("Failed cond " #cond "\n"); \ + return false; \ + } + +bool canHandle(Node* node) { + for (Value* input : node->inputs()) { + // TODO checks + } + + auto kind = node->kind(); + if (kind.is_prim()) { + REQ(kind == prim::TupleConstruct || kind == prim::ListConstruct || + kind == prim::StaticSubgraph); + return true; + } + const Operator& op = node->getOperator(); + auto analysis = op.aliasAnalysisKind(); + if (AliasAnalysisKind::PURE_FUNCTION == analysis || + (AliasAnalysisKind::FROM_SCHEMA == analysis && + !node->schema().is_mutable())) { + return true; + } + return false; +} + +bool canMerge(Node* consumer, Node* producer, AliasDb* aliasDb) { + // Only fuse within a block + REQ(consumer->owningBlock() == producer->owningBlock()); + + // Symbolic checks + REQ(canHandle(producer) || producer->kind() == prim::StaticSubgraph); + TORCH_INTERNAL_ASSERT( + consumer->kind() == prim::StaticSubgraph || 
canHandle(consumer)); + + // Alias checks + REQ(aliasDb->couldMoveBeforeTopologically(producer, consumer)); + + // Ops that return aliases can only be folded if this is the only use. + if (producer->kind() == aten::slice || producer->kind() == aten::unsqueeze || + producer->kind() == prim::ConstantChunk) { + for (auto& use : producer->output(0)->uses()) { + REQ(use.user == consumer); + } + } + + return true; +} + +Node* getOrCreateStaticSubgraph(Node* n, AliasDb* aliasDb) { + if (n->hasAttribute(attr::Subgraph) && n->kind() == prim::StaticSubgraph) { + return n; + } + GRAPH_UPDATE("Creating a static subgraph::Group node from: ", *n); + return SubgraphUtils::createSingletonSubgraphAndUpdateAliasing( + n, prim::StaticSubgraph, *aliasDb); +} + +value_list sortReverseTopological(ArrayRef inputs, Block* b) { + value_list result; + for (auto i : inputs) { + if (i->node()->owningBlock() == b) { + result.push_back(i); + } + } + // Sort in reverse topological order + std::sort(result.begin(), result.end(), [&](Value* a, Value* b) { + return a->node()->isAfter(b->node()); + }); + return result; +} + +static void debugDumpFusionGroup(const std::string& msg, Node* n) { + GRAPH_DEBUG(msg, *n); + if (n->kind() == prim::StaticSubgraph) { + GRAPH_DEBUG(*n->g(attr::Subgraph)); + } +} + +c10::optional tryMerge( + Node* fusion_group, + Node* to_merge, + AliasDb* aliasDb) { + if (!canMerge(fusion_group, to_merge, aliasDb)) { + return c10::nullopt; + } + + std::vector nodes_to_merge = {to_merge}; + + if (to_merge->kind() == aten::cat) { + Node* listconstruct = to_merge->input(0)->node(); + nodes_to_merge.push_back(listconstruct); + } + + // First, try to move all the nodes we want to fuse next to the fusion + // group. + Node* move_point = fusion_group; + for (auto n : nodes_to_merge) { + GRAPH_UPDATE("Trying to move node next to fusion group: ", getHeader(n)); + if (!aliasDb->moveBeforeTopologicallyValid(n, move_point)) { + GRAPH_UPDATE("Failed to move because of AliasDb checks!"); + return c10::nullopt; + } + move_point = n; + } + + // Now all the nodes that we're going to fuse are moved next to the fusion + // group, so we can safely merge them into the fusion group subgraph. + fusion_group = getOrCreateStaticSubgraph(fusion_group, aliasDb); + + for (auto n : nodes_to_merge) { + GRAPH_UPDATE("Merging ", getHeader(n)); + SubgraphUtils::mergeNodeIntoSubgraphAndUpdateAliasing( + n, fusion_group, *aliasDb); + } + return fusion_group; +} + +std::pair createFusionGroup( + Node* fusion_node, + AliasDb* aliasDb) { + fusion_node = getOrCreateStaticSubgraph(fusion_node, aliasDb); + + GRAPH_DEBUG("Iteratively pull input nodes into the fusion group...\n"); + auto inputs = + sortReverseTopological(fusion_node->inputs(), fusion_node->owningBlock()); + for (auto input : inputs) { + debugDumpFusionGroup("Current fusion group: ", fusion_node); + GRAPH_DEBUG("Trying to merge: ", *input->node()); + if (auto maybe_fusion_group = + tryMerge(fusion_node, input->node(), aliasDb)) { + // we successfully merged, so the new group's `inputs` may have + // changed. So rescan the new group for more merging opportunities. 
+ return std::make_pair( + maybe_fusion_group.value()->reverseIterator(), true); + } + } + + return std::make_pair(++fusion_node->reverseIterator(), false); +} + +std::pair scanNode(Node* n, AliasDb* aliasDb) { + GRAPH_DEBUG("Considering node:", *n); + + if (!canHandle(n)) { + return std::make_pair(++n->reverseIterator(), false); + } + + return createFusionGroup(n, aliasDb); +} + +void createFusionGroups(Block* block, AliasDb* aliasDb) { + bool any_changed = true; + while (any_changed) { + any_changed = false; + for (auto it = block->nodes().rbegin(); it != block->nodes().rend();) { + bool changed; + std::tie(it, changed) = scanNode(*it, aliasDb); + any_changed |= changed; + } + } + + for (Node* n : block->nodes()) { + for (Block* b : n->blocks()) { + createFusionGroups(b, aliasDb); + } + } + + // Try to merge adjacent fusion groups together. Because we have only merged + // by looking at graph inputs, without this we would not attempt to merge + // adjacent fusion groups that don't have a depdency on each other + + std::vector initial_fusion_groups; + for (Node* n : block->nodes()) { + if (n->kind() == prim::StaticSubgraph) { + initial_fusion_groups.push_back(n); + } + } + + Node* prev_fusion_group = + initial_fusion_groups.size() ? initial_fusion_groups[0] : nullptr; + + for (size_t i = 1; i < initial_fusion_groups.size(); ++i) { + // Try merging the just created fusion group into the previous one. + // If it did not work, then put the previous fusion group into + // fusion_groups vector - we will not touch it anymore in this loop. + // If merging suceeded, save the merged group as the "previous" fusion + // group so that we can try to merge the next one into it. + + Node* fusion_group = initial_fusion_groups[i]; + debugDumpFusionGroup( + "Trying to merge into the previous fusion group: ", prev_fusion_group); + if (auto merged_fusion_group = + tryMerge(prev_fusion_group, fusion_group, aliasDb)) { + prev_fusion_group = *merged_fusion_group; + debugDumpFusionGroup( + "Successfully merged into the previous fusion group: ", + prev_fusion_group); + } else { + GRAPH_DEBUG("Cannot merge into the previous fusion group"); + prev_fusion_group = fusion_group; + } + } +} + +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/runtime/static/fusion.h b/torch/csrc/jit/runtime/static/fusion.h new file mode 100644 index 000000000000..5f0e30b8505b --- /dev/null +++ b/torch/csrc/jit/runtime/static/fusion.h @@ -0,0 +1,11 @@ +#pragma once + +#include + +namespace torch { +namespace jit { + +TORCH_API void fuseStaticSubgraphs(std::shared_ptr graph); + +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/runtime/static/impl.cpp b/torch/csrc/jit/runtime/static/impl.cpp index 35657b4ba791..b95263f71e81 100644 --- a/torch/csrc/jit/runtime/static/impl.cpp +++ b/torch/csrc/jit/runtime/static/impl.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -14,14 +15,19 @@ namespace torch { namespace jit { -namespace { -void OptimizeGraph(std::shared_ptr& graph) { +void PrepareGraphForStaticRuntime(std::shared_ptr graph) { Inline(*graph); ConstantPropagation(graph); Canonicalize(graph); ConstantPropagation(graph); RemoveTensorMutation(graph); ConstantPropagation(graph); + EliminateDeadCode(graph); +} + +namespace { +void OptimizeGraph(std::shared_ptr& graph) { + PrepareGraphForStaticRuntime(graph); FuseInferenceOpsForSparseNN(graph); ConstantPropagation(graph); } diff --git a/torch/csrc/jit/runtime/static/impl.h b/torch/csrc/jit/runtime/static/impl.h index 
2eef530e778b..21ce26bf488d 100644 --- a/torch/csrc/jit/runtime/static/impl.h +++ b/torch/csrc/jit/runtime/static/impl.h @@ -83,6 +83,9 @@ struct TORCH_API InferenceModule { void init(); }; +TORCH_API void PrepareGraphForStaticRuntime( + std::shared_ptr g); + inline TORCH_API std::shared_ptr PrepareForStaticRuntime( const torch::jit::Module& m, InferenceModuleOptions opts = InferenceModuleOptions()) { diff --git a/torch/csrc/jit/runtime/static/init.cpp b/torch/csrc/jit/runtime/static/init.cpp index 3088e3bc5f36..4799b5bff974 100644 --- a/torch/csrc/jit/runtime/static/init.cpp +++ b/torch/csrc/jit/runtime/static/init.cpp @@ -1,4 +1,6 @@ #include +#include +#include #include namespace torch { @@ -68,8 +70,23 @@ void initStaticRuntimeBindings(PyObject* module) { [](std::shared_ptr g) { return StaticRuntime(PrepareForStaticRuntime(g)); }) - .def("_jit_to_static_runtime", [](const torch::jit::Module& m) { - return StaticRuntime(PrepareForStaticRuntime(m)); + .def( + "_jit_to_static_runtime", + [](const torch::jit::Module& m) { + return StaticRuntime(PrepareForStaticRuntime(m)); + }) + .def( + "_fuse_to_static_runtime", + [](torch::jit::Module& module) { + module.eval(); + module = freeze_module(module); + + Method method = module.get_method("forward"); + auto graph = method.graph(); + fuseStaticSubgraphs(graph); + }) + .def("_fuse_to_static_runtime", [](std::shared_ptr g) { + fuseStaticSubgraphs(g); }); } From 84fce6d29ad7146ccb198c47aad17a267261da46 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Thu, 10 Dec 2020 15:17:30 -0800 Subject: [PATCH 148/250] [AARCH64] Fix HAS_VST1 check if compiled by clang (#49182) Summary: Use `UL` suffix supported by all C99 compatible compilers instead of `__AARCH64_UINT64_C`, which is a gcc specific extension Before the change this check would have failed as follows with a bug-free clang compiler with the following errors: ``` $ clang has_vst1.c has_vst1.c:5:41: warning: implicit declaration of function '__AARCH64_UINT64_C' is invalid in C99 [-Wimplicit-function-declaration] v.val[0] = vcombine_f32 (vcreate_f32 (__AARCH64_UINT64_C (0)), vcreate_f32 (__AARCH64_UINT64_C (0))); ^ has_vst1.c:5:79: warning: implicit declaration of function '__AARCH64_UINT64_C' is invalid in C99 [-Wimplicit-function-declaration] v.val[0] = vcombine_f32 (vcreate_f32 (__AARCH64_UINT64_C (0)), vcreate_f32 (__AARCH64_UINT64_C (0))); ^ has_vst1.c:6:41: warning: implicit declaration of function '__AARCH64_UINT64_C' is invalid in C99 [-Wimplicit-function-declaration] v.val[1] = vcombine_f32 (vcreate_f32 (__AARCH64_UINT64_C (0)), vcreate_f32 (__AARCH64_UINT64_C (0))); ^ has_vst1.c:6:79: warning: implicit declaration of function '__AARCH64_UINT64_C' is invalid in C99 [-Wimplicit-function-declaration] v.val[1] = vcombine_f32 (vcreate_f32 (__AARCH64_UINT64_C (0)), vcreate_f32 (__AARCH64_UINT64_C (0))); ^ 4 warnings generated. 
/tmp/has_vst1-b1e162.o: In function `main': has_vst1.c:(.text+0x30): undefined reference to `__AARCH64_UINT64_C' ``` Fixes #{issue number} Pull Request resolved: https://github.com/pytorch/pytorch/pull/49182 Reviewed By: walterddr Differential Revision: D25471994 Pulled By: malfet fbshipit-source-id: 0129a6f7aabc46aa117ef719d3a211449cb410f1 --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 62ea0a64d6c0..ba862b5a4d5f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -684,8 +684,8 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") int main() { float a[] = {1.0, 1.0}; float32x4x2_t v; - v.val[0] = vcombine_f32 (vcreate_f32 (__AARCH64_UINT64_C (0)), vcreate_f32 (__AARCH64_UINT64_C (0))); - v.val[1] = vcombine_f32 (vcreate_f32 (__AARCH64_UINT64_C (0)), vcreate_f32 (__AARCH64_UINT64_C (0))); + v.val[0] = vcombine_f32 (vcreate_f32 (0UL), vcreate_f32 (0UL)); + v.val[1] = vcombine_f32 (vcreate_f32 (0UL), vcreate_f32 (0UL)); vst1q_f32_x2(a, v); return 0; }" HAS_VST1) From 69522410fa1032d5af18105e03d9769021a69a6c Mon Sep 17 00:00:00 2001 From: Rong Rong Date: Thu, 10 Dec 2020 15:19:57 -0800 Subject: [PATCH 149/250] add user vs internal msg support in common_utils.TestCase (#48935) Summary: should fix https://github.com/pytorch/pytorch/issues/48879. To test the effect of the messages: make a test break, for example by adding `self.assertEqual(1, 2, "user_msg")` to any test * Before: ``` AssertionError: False is not true : user_msg ``` * After ``` AssertionError: False is not true : Scalars failed to compare as equal! Comparing 1 and 2 gives a difference of 1, but the allowed difference with rtol=0 and atol=0 is only 0! user_msg; ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/48935 Reviewed By: samestep Differential Revision: D25382153 Pulled By: walterddr fbshipit-source-id: 95633a9f664f4b05a28801786b12a10bd21ff431 --- test/test_testing.py | 6 ++ torch/testing/_internal/common_utils.py | 126 ++++++++++++++---------- 2 files changed, 78 insertions(+), 54 deletions(-) diff --git a/test/test_testing.py b/test/test_testing.py index 4a1b33831a44..b87345186cb3 100644 --- a/test/test_testing.py +++ b/test/test_testing.py @@ -432,6 +432,12 @@ def test_isclose_atol_rtol_greater_than_zero(self, device, dtype): with self.assertRaises(RuntimeError): torch.isclose(t, t, atol=-1, rtol=-1) + def test_assert_messages(self, device): + self.assertIsNone(self._get_assert_msg(msg=None)) + self.assertEqual("\nno_debug_msg", self._get_assert_msg("no_debug_msg")) + self.assertEqual("no_user_msg", self._get_assert_msg(msg=None, debug_msg="no_user_msg")) + self.assertEqual("debug_msg\nuser_msg", self._get_assert_msg(msg="user_msg", debug_msg="debug_msg")) + instantiate_device_type_tests(TestTesting, globals()) if __name__ == '__main__': diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py index 80041a1c69de..f8280f9fb57d 100644 --- a/torch/testing/_internal/common_utils.py +++ b/torch/testing/_internal/common_utils.py @@ -1039,6 +1039,13 @@ def _compareScalars(self, a, b, *, return _compare_scalars_internal(a, b, rtol=rtol, atol=atol, equal_nan=equal_nan) + # Construct assert messages based on internal debug message and user-provided message.
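+    # For example (see test_assert_messages in test/test_testing.py):
+    #   _get_assert_msg(msg="user_msg", debug_msg="debug_msg") == "debug_msg\nuser_msg"
+    #   _get_assert_msg(msg=None, debug_msg="no_user_msg") == "no_user_msg"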
+ def _get_assert_msg(self, msg, debug_msg=None): + if msg is None: + return debug_msg + else: + return f"\n{msg}" if debug_msg is None else f"{debug_msg}\n{msg}" + def assertEqualIgnoreType(self, *args, **kwargs) -> None: # If you are seeing this function used, that means test is written wrongly # and deserves detailed investigation @@ -1049,7 +1056,8 @@ def assertEqualIgnoreType(self, *args, **kwargs) -> None: def assertEqual(self, x, y, msg: Optional[str] = None, *, atol: Optional[float] = None, rtol: Optional[float] = None, equal_nan=True, exact_dtype=True, exact_device=False) -> None: - assert (atol is None) == (rtol is None), "If one of atol or rtol is specified the other must be, too" + assert (atol is None) == (rtol is None), "If one of atol or rtol is specified, then the other must be too" + debug_msg: Optional[str] = None # Tensor x Number and Number x Tensor comparisons if isinstance(x, torch.Tensor) and isinstance(y, Number): @@ -1065,39 +1073,42 @@ def assertEqual(self, x, y, msg: Optional[str] = None, *, elif isinstance(y, torch.Tensor) and isinstance(x, np.bool_): self.assertEqual(x, y.item(), atol=atol, rtol=rtol, msg=msg, exact_dtype=exact_dtype, exact_device=exact_device) + # Tensor x Tensor elif isinstance(x, torch.Tensor) and isinstance(y, torch.Tensor): - super().assertEqual(x.is_sparse, y.is_sparse, msg=msg) - super().assertEqual(x.is_quantized, y.is_quantized, msg=msg) + debug_msg = ("Attempted to compare with different is_sparse settings: " + f"Expected: {x.is_sparse}; Actual: {y.is_sparse}.") + super().assertEqual(x.is_sparse, y.is_sparse, msg=self._get_assert_msg(msg=msg, debug_msg=debug_msg)) + debug_msg = ("Attempted to compare with different is_quantized settings: " + f"Expected: {x.is_quantized}; Actual: {y.is_quantized}.") + super().assertEqual(x.is_quantized, y.is_quantized, msg=self._get_assert_msg(msg=msg, debug_msg=debug_msg)) if x.is_sparse: if x.size() != y.size(): - debug_msg_sparse = ("Attempted to compare equality of tensors with different sizes. " - f"Got sizes {x.size()} and {y.size()}.") - if msg is None: - msg = debug_msg_sparse - self.assertTrue(False, msg=msg) + debug_msg_sparse = ("Attempted to compare equality of tensors with different sizes: " + f"Expected: {x.size()}; Actual: {y.size()}.") + super().assertTrue(False, msg=self._get_assert_msg(msg=msg, debug_msg=debug_msg_sparse)) x = x.coalesce() y = y.coalesce() - indices_result, debug_msg = self._compareTensors(x._indices(), y._indices(), - rtol=rtol, atol=atol, - equal_nan=equal_nan, exact_dtype=exact_dtype, - exact_device=exact_device) - - if not indices_result and msg is None: - assert debug_msg is not None - msg = "Sparse tensor indices failed to compare as equal! " + debug_msg - self.assertTrue(indices_result, msg=msg) - - values_result, debug_msg = self._compareTensors(x._values(), y._values(), - rtol=rtol, atol=atol, - equal_nan=equal_nan, exact_dtype=exact_dtype, - exact_device=exact_device) - - if not values_result and msg is None: - assert debug_msg is not None - msg = "Sparse tensor values failed to compare as equal! " + debug_msg - self.assertTrue(values_result, msg=msg) + indices_result, debug_msg_indices = self._compareTensors(x._indices(), y._indices(), + rtol=rtol, atol=atol, + equal_nan=equal_nan, exact_dtype=exact_dtype, + exact_device=exact_device) + + if not indices_result: + assert debug_msg_indices is not None + debug_msg = "Sparse tensor indices failed to compare as equal! 
" + debug_msg_indices + super().assertTrue(indices_result, msg=self._get_assert_msg(msg, debug_msg=debug_msg)) + + values_result, debug_msg_values = self._compareTensors(x._values(), y._values(), + rtol=rtol, atol=atol, + equal_nan=equal_nan, exact_dtype=exact_dtype, + exact_device=exact_device) + + if not values_result: + assert debug_msg_values is not None + debug_msg = "Sparse tensor values failed to compare as equal! " + debug_msg_values + super().assertTrue(values_result, msg=self._get_assert_msg(msg, debug_msg=debug_msg)) elif x.is_quantized and y.is_quantized: self.assertEqual(x.qscheme(), y.qscheme(), atol=atol, rtol=rtol, msg=msg, exact_dtype=exact_dtype, @@ -1121,30 +1132,33 @@ def assertEqual(self, x, y, msg: Optional[str] = None, *, atol=atol, rtol=rtol, msg=msg, exact_dtype=exact_dtype, exact_device=exact_device) - result, debug_msg = self._compareTensors(x.int_repr().to(torch.int32), - y.int_repr().to(torch.int32), - atol=atol, rtol=rtol, - exact_dtype=exact_dtype, - exact_device=exact_device) + result, debug_msg_compare = self._compareTensors(x.int_repr().to(torch.int32), + y.int_repr().to(torch.int32), + atol=atol, rtol=rtol, + exact_dtype=exact_dtype, + exact_device=exact_device) - if not result and msg is None: - assert debug_msg is not None - msg = "Quantized representations failed to compare as equal! " + debug_msg - self.assertTrue(result, msg=msg) + if not result: + assert debug_msg_compare is not None + debug_msg = "Quantized representations failed to compare as equal! " + debug_msg_compare + super().assertTrue(result, msg=self._get_assert_msg(msg, debug_msg=debug_msg)) else: - result, debug_msg = self._compareTensors(x, y, rtol=rtol, atol=atol, - equal_nan=equal_nan, exact_dtype=exact_dtype, - exact_device=exact_device) + result, debug_msg_generic = self._compareTensors(x, y, rtol=rtol, atol=atol, + equal_nan=equal_nan, exact_dtype=exact_dtype, + exact_device=exact_device) if not result: - assert debug_msg is not None - msg = msg or "Tensors failed to compare as equal!" - msg = f'{msg}\n{debug_msg}' - self.assertTrue(result, msg=msg) + assert debug_msg_generic is not None + debug_msg = "Tensors failed to compare as equal!" 
+ debug_msg_generic + super().assertTrue(result, msg=self._get_assert_msg(msg, debug_msg=debug_msg)) elif isinstance(x, string_classes) and isinstance(y, string_classes): - super().assertEqual(x, y, msg=msg) + debug_msg = ("Attempted to compare [string] types: " + f"Expected: {repr(x)}; Actual: {repr(y)}.") + super().assertEqual(x, y, msg=self._get_assert_msg(msg, debug_msg=debug_msg)) elif type(x) == set and type(y) == set: - super().assertEqual(x, y, msg=msg) + debug_msg = ("Attempted to compare [set] types: " + f"Expected: {x}; Actual: {y}.") + super().assertEqual(x, y, msg=self._get_assert_msg(msg, debug_msg=debug_msg)) elif isinstance(x, dict) and isinstance(y, dict): if isinstance(x, OrderedDict) and isinstance(y, OrderedDict): self.assertEqual(x.items(), y.items(), atol=atol, rtol=rtol, @@ -1161,23 +1175,27 @@ def assertEqual(self, x, y, msg: Optional[str] = None, *, exact_dtype=exact_dtype, exact_device=exact_device) elif isinstance(x, type) and isinstance(y, type): # See TestTorch.test_assert_equal_generic_meta - super().assertEqual(x, y, msg=msg) + debug_msg = ("Attempted to compare [type] types: " + f"Expected: {x}; Actual: {y}.") + super().assertEqual(x, y, msg=self._get_assert_msg(msg, debug_msg=debug_msg)) elif is_iterable(x) and is_iterable(y): - super().assertEqual(len(x), len(y), msg=msg) + debug_msg = ("Attempted to compare the lengths of [iterable] types: " + f"Expected: {len(x)}; Actual: {len(y)}.") + super().assertEqual(len(x), len(y), msg=self._get_assert_msg(msg, debug_msg=debug_msg)) for x_, y_ in zip(x, y): self.assertEqual(x_, y_, atol=atol, rtol=rtol, msg=msg, exact_dtype=exact_dtype, exact_device=exact_device) elif isinstance(x, bool) and isinstance(y, bool): - self.assertTrue(x == y, msg=msg) + super().assertTrue(x == y, msg=msg) # Scalar x Scalar elif isinstance(x, Number) and isinstance(y, Number): - result, debug_msg = self._compareScalars(x, y, rtol=rtol, atol=atol, - equal_nan=equal_nan) - if not result and msg is None: - assert debug_msg is not None - msg = "Scalars failed to compare as equal! " + debug_msg - self.assertTrue(result, msg=msg) + result, debug_msg_scalars = self._compareScalars(x, y, rtol=rtol, atol=atol, + equal_nan=equal_nan) + if not result: + assert debug_msg_scalars is not None + debug_msg = "Scalars failed to compare as equal! 
" + debug_msg_scalars + super().assertTrue(result, msg=self._get_assert_msg(msg, debug_msg=debug_msg)) # Tensor x Numpy array elif isinstance(x, torch.Tensor) and isinstance(y, np.ndarray): self.assertEqual(x, torch.from_numpy(y), atol=atol, rtol=rtol, msg=msg, From 80f7510d9202bc46fa3faeb798ba0d7f6db2f67a Mon Sep 17 00:00:00 2001 From: James Reed Date: Thu, 10 Dec 2020 15:28:15 -0800 Subject: [PATCH 150/250] [FX] Fix create_arg for NamedTuple (#48986) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48986 Test Plan: Imported from OSS Reviewed By: zdevito Differential Revision: D25387156 Pulled By: jamesr66a fbshipit-source-id: 0d38c43e02088fb7afb671683c88b6e463fe7c76 --- test/test_fx.py | 6 ++++++ torch/fx/node.py | 7 +++++-- torch/fx/proxy.py | 8 +++++++- 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/test/test_fx.py b/test/test_fx.py index af11f9615cb6..221fae3c518a 100644 --- a/test/test_fx.py +++ b/test/test_fx.py @@ -1154,6 +1154,12 @@ def forward(self): m = FooBar1234() self.checkGraphModule(m, ()) + def test_namedtuple_return_trace(self): + class NamedTupReturn(torch.nn.Module): + def forward(self, x): + return Pair(x, x) + + traced = symbolic_trace(NamedTupReturn()) if __name__ == '__main__': run_tests() diff --git a/torch/fx/node.py b/torch/fx/node.py index fd8a4bc1377c..d304a4c0a472 100644 --- a/torch/fx/node.py +++ b/torch/fx/node.py @@ -248,9 +248,12 @@ def maybe_replace_node(n : Node) -> Node: def map_arg(a: Argument, fn: Callable[[Node], Argument]) -> Argument: """ Apply fn to each Node appearing arg. arg may be a list, tuple, slice, or dict with string keys. """ - if isinstance(a, tuple): + if isinstance(a, tuple) and hasattr(a, '_fields'): + elements = tuple(map_arg(elem, fn) for elem in a) + return type(a)(*elements) # type: ignore + elif isinstance(a, tuple): return tuple(map_arg(elem, fn) for elem in a) - if isinstance(a, list): + elif isinstance(a, list): return immutable_list(map_arg(elem, fn) for elem in a) elif isinstance(a, dict): return immutable_dict((k, map_arg(v, fn)) for k, v in a.items()) diff --git a/torch/fx/proxy.py b/torch/fx/proxy.py index f8c4aa8d8366..ce406aa787ee 100644 --- a/torch/fx/proxy.py +++ b/torch/fx/proxy.py @@ -50,7 +50,13 @@ def create_arg(self, a: Any) -> Argument: Can be override to support more trace-specific types. 
""" # aggregates - if isinstance(a, (tuple, list)): + if isinstance(a, tuple) and hasattr(a, '_fields'): + # NamedTuple constructors don't seem to like getting a generator + # expression as an argument to their constructor, so build this + # intermediate tuple and unpack it into the NamedTuple constructor + args = tuple(self.create_arg(elem) for elem in a) + return type(a)(*args) # type: ignore + elif isinstance(a, (tuple, list)): return type(a)(self.create_arg(elem) for elem in a) elif isinstance(a, dict): r = {} From 57145c910f75c94c4af9ae2645bcbb97bda75021 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Thu, 10 Dec 2020 16:24:35 -0800 Subject: [PATCH 151/250] Revert D24711613: [pytorch][PR] Preserve submodule with __set_state__ in freezing Test Plan: revert-hammer Differential Revision: D24711613 (https://github.com/pytorch/pytorch/commit/a3e1bd1fb9ad1dbc8506068d206969ca08872404) Original commit changeset: 22e51417454a fbshipit-source-id: c2090b15fdba2d6c9dc1fbd987d32229dd898608 --- test/jit/test_freezing.py | 94 ------------------------- torch/csrc/jit/passes/freeze_module.cpp | 20 +----- 2 files changed, 2 insertions(+), 112 deletions(-) diff --git a/test/jit/test_freezing.py b/test/jit/test_freezing.py index 6066835a5eaa..7b7490e80ec5 100644 --- a/test/jit/test_freezing.py +++ b/test/jit/test_freezing.py @@ -140,100 +140,6 @@ def forward(self, x): output_f = mf.forward(input) self.assertEqual(output_s, output_f) - def test_freeze_module_with_setstate(self): - class M(torch.nn.Module): - def __init__(self): - super(M, self).__init__() - self.tensor = torch.randn(2, 2) - - @torch.jit.export - def __getstate__(self): - return (self.tensor, self.training) - - @torch.jit.export - def __setstate__(self, state): - self.tensor = 2 * state[0] - self.training = state[1] - - def forward(self, x): - return x + self.tensor - - m = torch.jit.script(M()) - m.eval() - with self.assertRaisesRegex(RuntimeError, "cannot freeze a module that has __set_state__"): - mf = torch.jit.freeze(m) - - def test_freeze_module_with_submodule_setstate(self): - class M(torch.nn.Module): - def __init__(self): - super(M, self).__init__() - self.tensor = torch.randn(2, 2) - - @torch.jit.export - def __getstate__(self): - return (self.tensor, self.training) - - @torch.jit.export - def __setstate__(self, state): - self.tensor = 2 * state[0] - self.training = state[1] - - def forward(self, x): - return x + self.tensor - - class TestModule(nn.Module): - def __init__(self): - super(TestModule, self).__init__() - self.sub = M() - self.a = torch.randn(2, 2) - self.b = 4 - - def forward(self, x): - return self.sub(x) + self.a - - m = torch.jit.script(TestModule()) - m.eval() - input = torch.randn(2, 2) - output_s = m.forward(input) - mf = torch.jit.freeze(m) - - output_f = mf.forward(input) - buffer = io.BytesIO() - torch.jit.save(mf._c, buffer) - buffer.seek(0) - loaded = torch.jit.load(buffer) - output_l = loaded.forward(input) - - # Check if frozen module looks as below: - # module m { - # attributes { - # sub = ... - # } - # ... - # submodule { - # module m { - # attributes { - # training = - # tensor = ... - # } - # ... 
- # } - # } - # } - mf = mf._c - self.assertFalse(mf.hasattr('a')) - self.assertTrue(mf.hasattr('sub')) - self.assertTrue(mf.sub.hasattr('tensor')) - self.assertTrue(mf.sub.hasattr('training')) - - # __setstate__ is executed cloning the module for freezing - self.assertEqual(mf.sub.tensor, 2 * m.sub.tensor) - self.assertEqual(output_s + m.sub.tensor , output_f) - - # __setstate__ is execuded loading frozen module - self.assertEqual(loaded.sub.tensor, 2 * mf.sub.tensor) - self.assertEqual(output_l, mf.sub.tensor + output_f) - def test_freeze_module_with_fork(self): class SubModule(nn.Module): def __init__(self): diff --git a/torch/csrc/jit/passes/freeze_module.cpp b/torch/csrc/jit/passes/freeze_module.cpp index 76ca595c3283..2778c7712f23 100644 --- a/torch/csrc/jit/passes/freeze_module.cpp +++ b/torch/csrc/jit/passes/freeze_module.cpp @@ -241,20 +241,11 @@ class AttributePropagator { } auto attr = attrModule.attr(name); - auto mptr = attrModule._ivalue(); if (n->kind() == prim::GetAttr) { auto type = n->output()->type(); // Do not record submodules. Their attributes are tracked // individually. - if (attr.isObject()) { - auto submodule = attr.toModule(); - if (submodule.find_method("__setstate__")) { - insertMutableAttr(name, attr, mptr); - } - continue; - } - - if (!AliasDb::isMutableType(attr.type())) { + if (attr.isObject() || !AliasDb::isMutableType(attr.type())) { continue; } usedAttrs_.insert(attr); @@ -265,6 +256,7 @@ class AttributePropagator { n->kind() == prim::GetAttr ? "attribute: " + name + " in %" + n->output()->debugName() + " has inplace writer" : "attribute: " + name + " is set"); + auto mptr = attrModule._ivalue(); insertMutableAttr(name, attr, mptr); } } else if (n->kind() == prim::fork) { @@ -533,11 +525,6 @@ class AttributePropagator { return true; } } - - if (subModule.find_method("__setstate__")) { - return true; - } - return preservedSubModule_.count(subModule._ivalue()); } @@ -764,9 +751,6 @@ Module freeze_module( std::vector preservedAttrs, bool freezeInterfaces, bool preserveParameters) { - TORCH_CHECK( - !module.find_method("__setstate__"), - "cannot freeze a module that has __set_state__"); Method method = module.get_method("forward"); // Check that module does not return itself. 
for (auto& output : method.graph()->outputs()) { From 5e8cfec332652d72957f34d74e7c962d2b5afa8d Mon Sep 17 00:00:00 2001 From: Michael Suo Date: Thu, 10 Dec 2020 17:01:54 -0800 Subject: [PATCH 152/250] Add a newline before dependency graph output (#49127) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49127 Small change, but useful: it means that double-clicking the line lets you copy the url easily Test Plan: Imported from OSS Reviewed By: zdevito Differential Revision: D25450408 Pulled By: suo fbshipit-source-id: 8b13b971b444187a8de59c89cc8f60206035b2ad --- torch/package/exporter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torch/package/exporter.py b/torch/package/exporter.py index 7395ec96ccd3..b0d1f5472e16 100644 --- a/torch/package/exporter.py +++ b/torch/package/exporter.py @@ -192,7 +192,7 @@ def _get_source_of_module(self, module: types.ModuleType) -> str: if result is None: extra = '' if self.verbose: - extra = f' See the dependency graph for more info: {self._write_dep_graph(module.__name__)}' + extra = f' See the dependency graph for more info: \n{self._write_dep_graph(module.__name__)}' raise ValueError(f'cannot save source for module "{module.__name__}" because ' f'its source file "{filename}" could not be found.{extra}') return ''.join(result) @@ -411,7 +411,7 @@ def close(self): ... """ if self.verbose: - print(f"Dependency graph for exported package: {self._write_dep_graph()}") + print(f"Dependency graph for exported package: \n{self._write_dep_graph()}") # Write each tensor to a file named tensor/the_tensor_key in the zip archive for key in sorted(self.serialized_storages.keys()): From 7feec06dfe2a8ea53bdbc04858d05b9a00975e86 Mon Sep 17 00:00:00 2001 From: Ailing Zhang Date: Thu, 10 Dec 2020 17:38:18 -0800 Subject: [PATCH 153/250] Only 1 TensorImpl allocation in differentiable views. (#48896) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48896 Test Plan: Imported from OSS Reviewed By: pbelevich Differential Revision: D25380895 Pulled By: ailzhang fbshipit-source-id: 4d565e6312e860a2ff185a3f8b552005ddd29695 --- torch/csrc/autograd/VariableTypeUtils.h | 12 +++++----- torch/csrc/autograd/variable.h | 32 ++++++++++++++++++------- 2 files changed, 29 insertions(+), 15 deletions(-) diff --git a/torch/csrc/autograd/VariableTypeUtils.h b/torch/csrc/autograd/VariableTypeUtils.h index a4b89ee92639..e67815e5609a 100644 --- a/torch/csrc/autograd/VariableTypeUtils.h +++ b/torch/csrc/autograd/VariableTypeUtils.h @@ -134,7 +134,7 @@ template inline variable_list flatten_tensor_args(Args&&... ar } // See NOTE [ Autograd View Variables ] for details. 
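// (Editorial note, not part of the original patch.) The signature change below passes
// `tensor` by const reference instead of by value so that make_variable_differentiable_view
// can attach autograd metadata to the freshly produced TensorImpl in place when it is
// uniquely owned, instead of always shallow-copying it; this is what keeps differentiable
// view ops down to a single TensorImpl allocation.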
-inline Tensor as_view(const Tensor & base, Tensor tensor, bool is_differentiable, +inline Tensor as_view(const Tensor & base, const Tensor& tensor, bool is_differentiable, c10::optional> view_func=c10::nullopt, CreationMeta creation_meta=CreationMeta::DEFAULT) { auto base_var = Variable(base); @@ -194,16 +194,16 @@ inline Tensor as_view(const Tensor & base, Tensor tensor, bool is_differentiable base_var = base_var._base(); } if (is_differentiable) { - return make_variable_differentiable_view(std::move(base_var), std::move(tensor), creation_meta, std::move(view_func)); + return make_variable_differentiable_view(std::move(base_var), tensor, creation_meta, std::move(view_func)); } else { TORCH_CHECK(creation_meta == CreationMeta::DEFAULT, "Non-differentiable views must have creation_meta=CreationMeta::DEFAULT"); - return make_variable_non_differentiable_view(std::move(base_var), std::move(tensor)); + return make_variable_non_differentiable_view(std::move(base_var), tensor); } } // See NOTE [ Autograd View Variables ] for details. -inline std::vector as_view(const Tensor & base, std::vector tensors, bool is_differentiable, +inline std::vector as_view(const Tensor & base, std::vector& tensors, bool is_differentiable, CreationMeta creation_meta=CreationMeta::DEFAULT) { auto base_var = Variable(base); if (base_var.is_view()) { @@ -211,11 +211,11 @@ inline std::vector as_view(const Tensor & base, std::vector tens } for(Tensor &tensor : tensors) { if (is_differentiable) { - tensor = make_variable_differentiable_view(base_var, std::move(tensor), creation_meta); + tensor = make_variable_differentiable_view(base_var, tensor, creation_meta); } else { TORCH_CHECK(creation_meta == CreationMeta::DEFAULT, "Non-differentiable views must have creation_meta=CreationMeta::DEFAULT"); - tensor = make_variable_non_differentiable_view(base_var, std::move(tensor)); + tensor = make_variable_non_differentiable_view(base_var, tensor); } } return tensors; diff --git a/torch/csrc/autograd/variable.h b/torch/csrc/autograd/variable.h index 352e315de7ad..cb8a763f246b 100644 --- a/torch/csrc/autograd/variable.h +++ b/torch/csrc/autograd/variable.h @@ -449,17 +449,28 @@ struct TORCH_API DifferentiableViewMeta : public AutogradMeta { // Differentiable view. Track history with DifferentiableViewMeta. inline Variable make_variable_differentiable_view( Variable base, - at::Tensor data, + const at::Tensor& data, CreationMeta creation_meta, c10::optional> view_func = c10::nullopt) { if (data.defined()) { - auto data_impl_copy = data.getIntrusivePtr()->shallow_copy_and_detach( - /*version_counter=*/0, - /*allow_tensor_metadata_change=*/true); - data_impl_copy->set_autograd_meta(std::make_unique( - data_impl_copy.get(), std::move(base), std::move(view_func), - creation_meta)); - return Variable(data_impl_copy); + // If we already did a TensorImpl allocation for data, just reuse it. + // Otherwise(e.g tensor.swapdim(0, 0) when we return the same tensor as input), + // we have to use shallow_copy_and_detach to create a new TensorImpl to avoid + // moving leaf node into graph interior. This guarantees only 1 TensorImpl + // allocation happens in view ops. 
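    // (Editorial note, not part of the original patch.) The in-place path below is taken
    // only when this is the sole reference to the TensorImpl and its version counter is
    // unshared; every other case falls through to shallow_copy_and_detach, which allocates
    // a new TensorImpl as before.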
+ if (data.getIntrusivePtr().unique() && data.getIntrusivePtr()->unique_version()) { + at::TensorImpl* data_impl = data.unsafeGetTensorImpl(); + data_impl->set_autograd_meta(std::make_unique( + data_impl, std::move(base), std::move(view_func), creation_meta)); + return data; + } else { + c10::intrusive_ptr data_impl_copy = data.getIntrusivePtr()->shallow_copy_and_detach( + /*version_counter=*/0, + /*allow_tensor_metadata_change=*/true); + data_impl_copy->set_autograd_meta(std::make_unique( + data_impl_copy.get(), std::move(base), std::move(view_func), creation_meta)); + return Variable(data_impl_copy); + } } return Variable(); } @@ -468,9 +479,12 @@ inline Variable make_variable_differentiable_view( // Non-differentiable view. Just share version counter. inline Variable make_variable_non_differentiable_view( Variable base, - at::Tensor data, + const at::Tensor& data, bool allow_tensor_metadata_change = true) { if (data.defined()) { + // Currently all of non-differentiable view ops(detach/_indices/_values) + // share the same TensorImpl as their base Tensor. Thus a new TensorImpl + // allocation here is required. auto data_impl_copy = data.getIntrusivePtr()->shallow_copy_and_detach( /*version_counter=*/impl::version_counter(base), /*allow_tensor_metadata_change=*/allow_tensor_metadata_change); From a47a087a4325696ddec345f2d631812329a69f0d Mon Sep 17 00:00:00 2001 From: Peng Wu Date: Thu, 10 Dec 2020 17:45:31 -0800 Subject: [PATCH 154/250] [NNC] Add missing data type support for abs and frac (#48679) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48679 This addresses the remaining problem reported in issue #48053 Data type supports for aten kernels in SimpleIREvaluator are not consistent w/ aten::native library implementation. In SimpleIREvaluator, - only float/double are supported on aten::abs (integral types and half are missing) - only float/double are supported on aten::frac (half are missing) It is also not clear from kernel.cpp source code what are the expected input data types for an aten kernel, leading to potential missing data type issues down the road. This commit addresses both issues in a limited way by - Added type promotion ops from half/integral input types to float - Added a skeleton support for some type checking for aten kernels, currently, only check for valid data types for frac and abs to limit the scope of the change; but the utility function can be used for consistently adding type checking for all aten functions Known limitations: - abs support for integral types can be made more effective by invoking std::abs for integral tensors (currently kFabs maps to std::fabs). Since that change is a bit more involved (e.g., changing IntrinsicsOp kFabs to kAbs and other code generators accordingly), will leave it to another issue - other aten kernels may need similar type checking and some scrutiny on the use of promoteToFloat to detect invalid data types early on. 
That is also left for another issue Test Plan: test_jit_fuser_te.test_unary_ops Imported from OSS Reviewed By: asuhan Differential Revision: D25344839 fbshipit-source-id: 95aca04c99b947dc20f11e4b3bae002f0ae37044 --- test/test_jit_fuser_te.py | 1 - torch/csrc/jit/tensorexpr/kernel.cpp | 131 ++++++++++++++++++++------- torch/csrc/jit/tensorexpr/kernel.h | 19 +++- 3 files changed, 117 insertions(+), 34 deletions(-) diff --git a/test/test_jit_fuser_te.py b/test/test_jit_fuser_te.py index f0552f0f7a36..e125f473f6d8 100644 --- a/test/test_jit_fuser_te.py +++ b/test/test_jit_fuser_te.py @@ -1211,7 +1211,6 @@ def data_for(self, dtype, device="cuda", size=None): else: return v.to(dtype) - @unittest.skipIf(not LLVM_ENABLED, "TODO: bugs in ir eval") def test_unary_ops(self): def apply(fn): return lambda x: fn(x) diff --git a/torch/csrc/jit/tensorexpr/kernel.cpp b/torch/csrc/jit/tensorexpr/kernel.cpp index f42983d9499c..c4228ae955b6 100644 --- a/torch/csrc/jit/tensorexpr/kernel.cpp +++ b/torch/csrc/jit/tensorexpr/kernel.cpp @@ -436,17 +436,68 @@ ExprHandle TensorExprKernel::constant(const torch::jit::Value* v) { return scalars_.at(v->unique()); } -ExprHandle promoteIntegerToFloat(const ExprHandle& e) { +ExprHandle promoteIntegerToDefaultType(const ExprHandle& e) { auto scalarType = static_cast(e.dtype().scalar_type()); if (!c10::isIntegralType(scalarType, /*includeBool*/ true)) { return e; } - auto defaultType = static_cast( - c10::typeMetaToScalarType(c10::get_default_dtype())); - return Cast::make(Dtype(defaultType, e.dtype().lanes()), e); + + auto defaultType = c10::typeMetaToScalarType(c10::get_default_dtype()); + + // We intend to promote Integers to floating-point types + TORCH_INTERNAL_ASSERT( + !c10::isIntegralType(defaultType, /*includeBool*/ true)); + + return Cast::make( + Dtype( + static_cast(defaultType), e.dtype().lanes()), + e); } -void TensorExprKernel::promoteInputs(std::vector& inputs) { +ExprHandle promoteHalfToFloat(const ExprHandle& e) { + auto scalarType = static_cast(e.dtype().scalar_type()); + auto floatType = static_cast(tensorexpr::ScalarType::Float); + if (c10::isFloatingType(scalarType) && + (c10::elementSize(scalarType) < c10::elementSize(floatType))) { + return Cast::make( + Dtype(tensorexpr::ScalarType::Float, e.dtype().lanes()), e); + } else { + return e; + } +} + +ExprHandle promoteHalfToFloatAndIntegerToDefaultType(const ExprHandle& e) { + auto scalarType = static_cast(e.dtype().scalar_type()); + if (c10::isIntegralType(scalarType, /*includeBool*/ true)) { + return promoteIntegerToDefaultType(e); + } else { + return promoteHalfToFloat(e); + } +} + +bool TensorExprKernel::checkTypes( + const ScalarType highType, + const int typeConstraints) { + if (typeConstraints == kAllTypes) { + return true; + } + + if (is_integral(highType)) { + return (typeConstraints & kIntegralTypes) != 0; + } else if (is_floating_point(highType)) { + return (typeConstraints & kFloatingPointTypes) != 0; + } else if (highType == ScalarType::Bool) { + return (typeConstraints & kBoolType) != 0; + } + + // assume JIT not supporting complex and qint yet + TORCH_INTERNAL_ASSERT((typeConstraints & (kQintTypes | kComplexTypes)) == 0); + return false; +} + +void TensorExprKernel::promoteInputs( + std::vector& inputs, + const int typeConstraints) { if (inputs.empty()) { return; } @@ -457,6 +508,10 @@ void TensorExprKernel::promoteInputs(std::vector& inputs) { highType = promoteTypes(highType, input.dtype().scalar_type()); } + if (!checkTypes(highType, typeConstraints)) { + throw unsupported_dtype(); + } + 
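  // (Editorial note, not part of the original patch.) At this point highType is the dtype
  // promoted across all inputs and has passed the constraint check above, so each input is
  // cast to it here before the op's inner expression is built.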
for (ExprHandle& e : inputs) { e = promoteToDtype(e, highType); } @@ -563,19 +618,20 @@ std::vector TensorExprKernel::valueShape( Tensor* TensorExprKernel::computeOneOperand( const std::string& name, const torch::jit::Value* v, - const std::function& innerExpr) { + const std::function& innerExpr, + const int checkParamTypes) { auto const& n = v->node(); auto const& shape = valueShape(n->inputs()[0]); return Compute( name, c10::fmap(shape), - [this, v, innerExpr](const std::vector& axes) { + [this, v, innerExpr, checkParamTypes]( + const std::vector& axes) { auto const& n = v->node(); std::vector indices(axes.begin(), axes.end()); std::vector inputs = { tensorOrConstant(n->inputs()[0], indices)}; - - promoteInputs(inputs); + promoteInputs(inputs, checkParamTypes); ExprHandle compute = innerExpr(inputs[0]); return demoteOutput(compute, n->output()); }); @@ -791,7 +847,8 @@ Tensor* TensorExprKernel::computeValue(const torch::jit::Value* v) { case aten::div: { return computeTwoOperand( "aten_div", v, [](const ExprHandle& lhs, const ExprHandle& rhs) { - return promoteIntegerToFloat(lhs) / promoteIntegerToFloat(rhs); + return promoteIntegerToDefaultType(lhs) / + promoteIntegerToDefaultType(rhs); }); } break; @@ -938,7 +995,7 @@ Tensor* TensorExprKernel::computeValue(const torch::jit::Value* v) { case aten::sigmoid: { return computeOneOperand("aten_sigmoid", v, [](const ExprHandle& a) { - return sigmoid(promoteIntegerToFloat(a)); + return sigmoid(promoteIntegerToDefaultType(a)); }); } break; @@ -963,25 +1020,25 @@ Tensor* TensorExprKernel::computeValue(const torch::jit::Value* v) { case aten::log: { return computeOneOperand("aten_log", v, [](const ExprHandle& a) { - return log(promoteIntegerToFloat(a)); + return log(promoteIntegerToDefaultType(a)); }); } break; case aten::log10: { return computeOneOperand("aten_log10", v, [](const ExprHandle& a) { - return log10(promoteIntegerToFloat(a)); + return log10(promoteIntegerToDefaultType(a)); }); } break; case aten::log1p: { return computeOneOperand("aten_log1p", v, [](const ExprHandle& a) { - return log1p(promoteIntegerToFloat(a)); + return log1p(promoteIntegerToDefaultType(a)); }); } break; case aten::log2: { return computeOneOperand("aten_log2", v, [](const ExprHandle& a) { - return log2(promoteIntegerToFloat(a)); + return log2(promoteIntegerToDefaultType(a)); }); } break; @@ -992,37 +1049,37 @@ Tensor* TensorExprKernel::computeValue(const torch::jit::Value* v) { case aten::expm1: { return computeOneOperand("aten_expm1", v, [](const ExprHandle& a) { - return expm1(promoteIntegerToFloat(a)); + return expm1(promoteIntegerToDefaultType(a)); }); } break; case aten::erf: { return computeOneOperand("aten_erf", v, [](const ExprHandle& a) { - return erf(promoteIntegerToFloat(a)); + return erf(promoteIntegerToDefaultType(a)); }); } break; case aten::erfc: { return computeOneOperand("aten_erfc", v, [](const ExprHandle& a) { - return erfc(promoteIntegerToFloat(a)); + return erfc(promoteIntegerToDefaultType(a)); }); } break; case aten::cos: { return computeOneOperand("aten_cos", v, [](const ExprHandle& a) { - return cos(promoteIntegerToFloat(a)); + return cos(promoteIntegerToDefaultType(a)); }); } break; case aten::sin: { return computeOneOperand("aten_sin", v, [](const ExprHandle& a) { - return sin(promoteIntegerToFloat(a)); + return sin(promoteIntegerToDefaultType(a)); }); } break; case aten::tan: { return computeOneOperand("aten_tan", v, [](const ExprHandle& a) { - return tan(promoteIntegerToFloat(a)); + return tan(promoteIntegerToDefaultType(a)); }); } 
break; @@ -1135,31 +1192,31 @@ Tensor* TensorExprKernel::computeValue(const torch::jit::Value* v) { case aten::acos: { return computeOneOperand("aten_acos", v, [](const ExprHandle& a) { - return acos(promoteIntegerToFloat(a)); + return acos(promoteIntegerToDefaultType(a)); }); } break; case aten::asin: { return computeOneOperand("aten_asin", v, [](const ExprHandle& a) { - return asin(promoteIntegerToFloat(a)); + return asin(promoteIntegerToDefaultType(a)); }); } break; case aten::cosh: { return computeOneOperand("aten_cosh", v, [](const ExprHandle& a) { - return cosh(promoteIntegerToFloat(a)); + return cosh(promoteIntegerToDefaultType(a)); }); } break; case aten::sinh: { return computeOneOperand("aten_sinh", v, [](const ExprHandle& a) { - return sinh(promoteIntegerToFloat(a)); + return sinh(promoteIntegerToDefaultType(a)); }); } break; case aten::atan: { return computeOneOperand("aten_atan", v, [](const ExprHandle& a) { - return atan(promoteIntegerToFloat(a)); + return atan(promoteIntegerToDefaultType(a)); }); } break; @@ -1167,19 +1224,20 @@ Tensor* TensorExprKernel::computeValue(const torch::jit::Value* v) { return computeTwoOperand( "aten_atan2", v, [](const ExprHandle& lhs, const ExprHandle& rhs) { return atan2( - promoteIntegerToFloat(lhs), promoteIntegerToFloat(rhs)); + promoteIntegerToDefaultType(lhs), + promoteIntegerToDefaultType(rhs)); }); } break; case aten::tanh: { return computeOneOperand("aten_tanh", v, [](const ExprHandle& a) { - return tanh(promoteIntegerToFloat(a)); + return tanh(promoteIntegerToDefaultType(a)); }); } break; case aten::sqrt: { return computeOneOperand("aten_sqrt", v, [](const ExprHandle& a) { - return sqrt(promoteIntegerToFloat(a)); + return tensorexpr::sqrt(promoteIntegerToDefaultType(a)); }); } break; @@ -1190,7 +1248,12 @@ Tensor* TensorExprKernel::computeValue(const torch::jit::Value* v) { case aten::abs: { return computeOneOperand( - "aten_abs", v, [](const ExprHandle& a) { return fabs(a); }); + "aten_abs", + v, + [](const ExprHandle& a) { + return fabs(promoteHalfToFloatAndIntegerToDefaultType(a)); + }, + kIntegralTypes | kFloatingPointTypes | kBoolType); } break; case aten::ceil: { @@ -1235,7 +1298,13 @@ Tensor* TensorExprKernel::computeValue(const torch::jit::Value* v) { case aten::frac: { return computeOneOperand( - "aten_frac", v, [](const ExprHandle& a) { return a - floor(a); }); + "aten_frac", + v, + [](const ExprHandle& a) { + auto aa = promoteHalfToFloat(a); + return aa - floor(aa); + }, + kFloatingPointTypes); } break; case aten::lgamma: { diff --git a/torch/csrc/jit/tensorexpr/kernel.h b/torch/csrc/jit/tensorexpr/kernel.h index e3a7e9e32ca6..8fcce23717d3 100644 --- a/torch/csrc/jit/tensorexpr/kernel.h +++ b/torch/csrc/jit/tensorexpr/kernel.h @@ -36,6 +36,16 @@ class TORCH_API TensorExprKernel { } private: + enum ElementType { + kAllTypes = 0, + kIntegralTypes = 1 << 0, + kFloatingPointTypes = 1 << 1, + kBoolType = 1 << 2, + kComplexTypes = 1 << 3, + kQintTypes = 1 << 4, + kNonComplexOrQintTypes = kIntegralTypes | kBoolType | kFloatingPointTypes, + }; + enum BackendType { kUninitialized, kSimpleIREval, @@ -71,7 +81,11 @@ class TORCH_API TensorExprKernel { std::vector valueShape(const torch::jit::Value* v); - void promoteInputs(std::vector& inputs); + bool checkTypes(const ScalarType highType, const int typeConstraints); + + void promoteInputs( + std::vector& inputs, + int typeConstraints = kAllTypes); ExprHandle demoteOutput(const ExprHandle& e, const torch::jit::Value* v); @@ -82,7 +96,8 @@ class TORCH_API TensorExprKernel { Tensor* 
computeOneOperand( const std::string& name, const torch::jit::Value* v, - const std::function& innerExpr); + const std::function& innerExpr, + const int checkParamTypes = kAllTypes); Tensor* computeTwoOperand( const std::string& name, From 882eb0f646e5042cb3e352fa3bfa45f8e23fc8e4 Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Thu, 10 Dec 2020 18:10:10 -0800 Subject: [PATCH 155/250] [quant][graphmode][fx] Add support for dynamic quant for RNN and RNNCell (#49126) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49126 Test Plan: python test/test_quantization.py TestQuantizeFxOps.test_rnn python test/test_quantization.py TestQuantizeFxOps.test_rnn_cell Imported from OSS Reviewed By: raghuramank100 Differential Revision: D25449047 fbshipit-source-id: 532bf9ad2839958dde8c6f2d9399fac96b2b8bd4 --- test/quantization/test_quantize_fx.py | 47 +++++++++++++++++++ .../quantization/fx/quantization_patterns.py | 45 +++++++++++++++++- torch/quantization/quantization_mappings.py | 13 +++++ 3 files changed, 104 insertions(+), 1 deletion(-) diff --git a/test/quantization/test_quantize_fx.py b/test/quantization/test_quantize_fx.py index f5f243a1e649..7c6c548f2594 100644 --- a/test/quantization/test_quantize_fx.py +++ b/test/quantization/test_quantize_fx.py @@ -28,6 +28,7 @@ default_qconfig, default_dynamic_qconfig, default_qat_qconfig, + per_channel_dynamic_qconfig, float16_dynamic_qconfig, float_qparams_weight_only_qconfig, get_default_qconfig, @@ -36,6 +37,7 @@ prepare, prepare_qat, convert, + quantize_dynamic, default_placeholder_observer, PerChannelMinMaxObserver, QConfigDynamic, @@ -57,6 +59,8 @@ from torch.testing._internal.common_quantization import ( LinearModelWithSubmodule, ResNetBase, + RNNDynamicModel, + RNNCellDynamicModel, ) from torch.testing._internal.common_quantized import ( @@ -2107,6 +2111,49 @@ def forward(self, indices, offsets): # make sure it runs m(*inputs) + def _test_rnn_impl(self, qconfigs, M, module_type_strs, module_types, sample_input): + options = itertools.product(qconfigs, module_type_strs) + for qconfig, module_type_str in options: + model_eager = M(module_type_str).eval() + model_graph = copy.deepcopy(model_eager) + if torch.backends.quantized.engine == 'qnnpack' and \ + qconfig is float16_dynamic_qconfig: + continue + # fp16 dynamic quant is not supported for qnnpack + + eager_qconfig_dict = {x : qconfig for x in module_types} + model_eager = quantize_dynamic(model_eager, qconfig_spec=eager_qconfig_dict) + + graph_qconfig_dict = { + "object_type": [ + (x, qconfig) for x in module_types + ] + } + model_graph = prepare_fx(model_graph, graph_qconfig_dict) + model_graph = convert_fx(model_graph) + self.assertEqual(model_eager(sample_input), model_graph(sample_input)) + self.checkScriptable(model_graph, [[sample_input]], True) + + def test_rnn_cell(self): + qconfigs = [per_channel_dynamic_qconfig, default_dynamic_qconfig, float16_dynamic_qconfig] + module_type_strs = ['LSTMCell', 'GRUCell', 'RNNTanh', 'RNNReLU'] + module_types = [torch.nn.LSTMCell, torch.nn.GRUCell, torch.nn.RNNCell] + sample_input = torch.tensor([[100, -155], + [-155, 100], + [100, -155]], dtype=torch.float) + self._test_rnn_impl(qconfigs, RNNCellDynamicModel, module_type_strs, module_types, sample_input) + + def test_rnn(self): + qconfigs = [per_channel_dynamic_qconfig, default_dynamic_qconfig, float16_dynamic_qconfig] + module_type_strs = ['LSTM'] + module_types = [torch.nn.LSTM] + niter = 10 + sample_input = torch.tensor([[100, -155], + [-155, 100], + [100, -155]], 
dtype=torch.float).unsqueeze(0).repeat(niter, 1, 1) + self._test_rnn_impl(qconfigs, RNNDynamicModel, module_type_strs, module_types, sample_input) + + class TestQuantizeFxModels(QuantizationTestCase): def _test_model_impl( self, mode, name, model, eager_quantizable_model, diff --git a/torch/quantization/fx/quantization_patterns.py b/torch/quantization/fx/quantization_patterns.py index 176cd7603286..73590ad60904 100644 --- a/torch/quantization/fx/quantization_patterns.py +++ b/torch/quantization/fx/quantization_patterns.py @@ -11,6 +11,7 @@ from ..quantization_mappings import ( get_static_quant_module_class, + get_dynamic_quant_module_class, get_quantized_operator, ) from ..utils import ( @@ -471,7 +472,6 @@ def convert(self, quantizer: QuantizerCls, node: Node, load_arg: Callable, ] assert node.op == 'call_module' emb_node = node - emb = quantizer.modules[emb_node.target] qconfig = quantizer.qconfig_map[node.name] dtypes = get_qconfig_dtypes(qconfig) if dtypes not in supported_dtypes: @@ -481,6 +481,7 @@ def convert(self, quantizer: QuantizerCls, node: Node, load_arg: Callable, "supported dtype combinations are: {}".format(dtypes, supported_dtypes)) return quantizer.quantized_graph.node_copy(node, load_arg(quantized=None)) + emb = quantizer.modules[emb_node.target] qemb = get_static_quant_module_class(type(emb)) quantized = qemb.from_float(emb) parent_name, name = _parent_name(emb_node.target) @@ -491,6 +492,48 @@ def convert(self, quantizer: QuantizerCls, node: Node, load_arg: Callable, load_arg(quantized=False)(emb_node.args), load_arg(quantized=False)(emb_node.kwargs)) +# TODO (maybe): merge with embedding quantize handler +@register_quant_pattern(torch.nn.GRUCell) +@register_quant_pattern(torch.nn.LSTMCell) +@register_quant_pattern(torch.nn.RNNCell) +@register_quant_pattern(torch.nn.LSTM) +@mark_input_output_not_observed() +class RNNDynamic(QuantizeHandler): + def __init__(self, quantizer: QuantizerCls, node: Node): + super().__init__(quantizer, node) + + def convert(self, quantizer: QuantizerCls, node: Node, load_arg: Callable, + debug: bool = False, + convert_custom_config_dict: Dict[str, Any] = None) -> Node: + # Supported combinations are: + # quant_type | activation | weight | activation_compute_type + # dynamic | float32 | qint8 | quint8 + # dynamic | float16 | float16 | None + # tuple (activation_dtype, weight_dtype, compute_dtype) + supported_dtypes = [ + (torch.float32, torch.qint8, torch.quint8), + (torch.float16, torch.float16, None), + ] + assert node.op == 'call_module' + qconfig = quantizer.qconfig_map[node.name] + dtypes = get_qconfig_dtypes(qconfig) + if dtypes not in supported_dtypes: + warnings.warn( + "dtype combination: {} is not " + "supported by Embedding/EmbeddingBag, " + "supported dtype combinations are: {}".format(dtypes, supported_dtypes)) + return quantizer.quantized_graph.node_copy(node, load_arg(quantized=None)) + + module = quantizer.modules[node.target] + qmodule_cls = get_dynamic_quant_module_class(type(module)) + qmodule = qmodule_cls.from_float(module) + parent_name, name = _parent_name(node.target) + setattr(quantizer.modules[parent_name], name, qmodule) + return quantizer.quantized_graph.create_node( + 'call_module', + node.target, + load_arg(quantized=False)(node.args), + load_arg(quantized=False)(node.kwargs)) ARGS_TO_SKIP = { torch._ops.ops.quantized.hardswish: ['inplace'], diff --git a/torch/quantization/quantization_mappings.py b/torch/quantization/quantization_mappings.py index 88d264b1ccf3..c965de07deb7 100644 --- 
a/torch/quantization/quantization_mappings.py +++ b/torch/quantization/quantization_mappings.py @@ -124,6 +124,19 @@ def get_static_quant_module_class(float_module_class, additional_static_quant_ma " does not have a corresponding quantized module class" return static_quant_module_class +def get_dynamic_quant_module_class(float_module_class, additional_dynamic_quant_mapping=None): + r"""n Get the dynamically quantized module class corresponding to + the floating point module class + """ + if additional_dynamic_quant_mapping is None: + additional_dynamic_quant_mapping = {} + all_mappings = get_combined_dict(DEFAULT_DYNAMIC_QUANT_MODULE_MAPPINGS, additional_dynamic_quant_mapping) + dynamic_quant_module_class = all_mappings.get(float_module_class, None) + assert dynamic_quant_module_class is not None, \ + "Floating point module class {}".format(str(float_module_class)) + \ + " does not have a corresponding quantized module class" + return dynamic_quant_module_class + def get_default_qat_module_mappings(): ''' Get default module mapping for quantization aware training ''' From 0dea76ecdadc36ce950b53d4404fabe11c25b173 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Thu, 10 Dec 2020 18:12:21 -0800 Subject: [PATCH 156/250] Delete some dead functions from tools.codegen.api.meta (#49041) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49041 Signed-off-by: Edward Z. Yang Test Plan: Imported from OSS Reviewed By: H-Huang Differential Revision: D25455886 Pulled By: ezyang fbshipit-source-id: 5d7834d52f7032820ac2c73358bda77187c17224 --- tools/codegen/api/meta.py | 27 --------------------------- 1 file changed, 27 deletions(-) diff --git a/tools/codegen/api/meta.py b/tools/codegen/api/meta.py index a447850de38e..8e6ff19630e1 100644 --- a/tools/codegen/api/meta.py +++ b/tools/codegen/api/meta.py @@ -1,7 +1,6 @@ from tools.codegen.model import * from tools.codegen.api.types import MetaArgument -import tools.codegen.api.cpp as cpp import tools.codegen.api.dispatcher as dispatcher from typing import Sequence @@ -21,32 +20,6 @@ def argument_type(a: Argument) -> str: assert not a.is_write return dispatcher.argumenttype_type(a.type, mutable=False) -def returntype_type(t: Type) -> str: - r = cpp.valuetype_type(t) - if r is not None: - return r - - if isinstance(t, BaseType): - if t.name == BaseTy.Tensor: - return 'TensorMeta' - elif isinstance(t, ListType): - raise NotImplementedError("list returns not supported yet") - - raise AssertionError(f"unrecognized return type {t}") - -def return_type(r: Return) -> str: - assert not r.is_write - return returntype_type(r.type) - -def returns_type(rs: Sequence[Return]) -> str: - if len(rs) == 0: - return 'void' - elif len(rs) == 1: - return return_type(rs[0]) - else: - args = ','.join(map(return_type, rs)) - return f'std::tuple<{args}>' - def argument(a: Argument) -> MetaArgument: return MetaArgument( type=argument_type(a), From 267641a245ce85094fc02e14812d0fe4025c4129 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Thu, 10 Dec 2020 18:12:21 -0800 Subject: [PATCH 157/250] Rename positional and kwarg_only to have flat prefix (#49042) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49042 I want the names positional and kwarg_only to give the unflat representation (e.g., preserving TensorOptionsArguments in the returned Union). So I regret my original naming choice when I moved grouping to model. 
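(Editorial aside, not part of the original commit message: a rough sketch of the flattened
accessors this diff introduces, assuming a checkout of the repository at this point in the
series so that `tools.codegen` is importable.)
```
# Minimal sketch; run from the repository root.
from tools.codegen.model import FunctionSchema

func = FunctionSchema.parse(
    "add.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)")

# The flat_* accessors return plain Argument objects, with SelfArgument and
# TensorOptionsArguments unpacked.
print([a.name for a in func.arguments.flat_positional])  # ['self', 'other']
print([a.name for a in func.arguments.flat_kwarg_only])  # ['alpha']
print([a.name for a in func.arguments.flat_non_out])     # ['self', 'other', 'alpha']
print([a.name for a in func.arguments.out])              # ['out']
```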
This renames them to have flat_ prefix and also adds a flat_non_out argument for cases where you just want to look at non-out arguments. Signed-off-by: Edward Z. Yang Test Plan: Imported from OSS Reviewed By: H-Huang Differential Revision: D25455884 Pulled By: ezyang fbshipit-source-id: f923f8881267a3e3e8e9521519412f7cc25034fc --- tools/autograd/gen_annotated_fn_args.py | 2 +- tools/autograd/gen_trace_type.py | 3 +-- tools/codegen/api/dispatcher.py | 6 +++++- tools/codegen/api/meta.py | 3 +-- tools/codegen/api/python.py | 8 +++----- tools/codegen/gen.py | 13 ++++++++----- tools/codegen/model.py | 25 ++++++++++++++++++------- 7 files changed, 37 insertions(+), 23 deletions(-) diff --git a/tools/autograd/gen_annotated_fn_args.py b/tools/autograd/gen_annotated_fn_args.py index c393c905c73f..943d9adab4a0 100644 --- a/tools/autograd/gen_annotated_fn_args.py +++ b/tools/autograd/gen_annotated_fn_args.py @@ -52,7 +52,7 @@ def gen_annotated(native_yaml_path: str, out: str, autograd_dir: str) -> None: @with_native_function def gen_annotated_args(f: NativeFunction) -> str: out_args: List[Dict[str, Any]] = [] - for arg in f.func.arguments.positional: + for arg in f.func.arguments.flat_positional: if arg.default is not None: continue out_arg: Dict[str, Any] = {} diff --git a/tools/autograd/gen_trace_type.py b/tools/autograd/gen_trace_type.py index bd478b2de8d3..6bc83b9716e6 100644 --- a/tools/autograd/gen_trace_type.py +++ b/tools/autograd/gen_trace_type.py @@ -150,8 +150,7 @@ def dispatch_trace_input(arg: Union[Argument, TensorOptionsArguments]) -> Sequen # Factories are a bit special because their out-of-place overloads # take an extra TensorOptions argument, which is missing in the _out function has_tensor_return = any(r.type.is_tensor_like() for r in f.func.returns) - has_tensor_input_arg = any(a.type.is_tensor_like() - for a in itertools.chain(f.func.arguments.positional, f.func.arguments.kwarg_only)) + has_tensor_input_arg = any(a.type.is_tensor_like() for a in f.func.arguments.flat_non_out) is_factory_method = f.category_override == 'factory' or (has_tensor_return and not has_tensor_input_arg) # HACK: preserve old codegen behavior - the old codegen set the `is_factory_method` diff --git a/tools/codegen/api/dispatcher.py b/tools/codegen/api/dispatcher.py index b95803ca4e81..1193d0511281 100644 --- a/tools/codegen/api/dispatcher.py +++ b/tools/codegen/api/dispatcher.py @@ -68,7 +68,11 @@ def name(func: FunctionSchema) -> str: def arguments(func: FunctionSchema) -> Tuple[DispatcherArgument, ...]: if local.use_c10_dispatcher().dispatcher_uses_new_style(): - return tuple(map(argument, itertools.chain(func.arguments.positional, func.arguments.kwarg_only, func.arguments.out))) + return tuple(map(argument, itertools.chain( + func.arguments.flat_positional, + func.arguments.flat_kwarg_only, + func.arguments.out + ))) else: return tuple( DispatcherArgument(type=la.type, name=la.name, argument=la.argument) diff --git a/tools/codegen/api/meta.py b/tools/codegen/api/meta.py index 8e6ff19630e1..6beee3eaefbb 100644 --- a/tools/codegen/api/meta.py +++ b/tools/codegen/api/meta.py @@ -4,7 +4,6 @@ import tools.codegen.api.dispatcher as dispatcher from typing import Sequence -import itertools # Follows dispatcher calling convention, but: # - Mutable arguments not allowed. 
Meta functions are always @@ -29,4 +28,4 @@ def argument(a: Argument) -> MetaArgument: def arguments(func: FunctionSchema) -> Sequence[MetaArgument]: assert not func.arguments.out - return list(map(argument, itertools.chain(func.arguments.positional, func.arguments.kwarg_only))) + return list(map(argument, func.arguments.flat_non_out)) diff --git a/tools/codegen/api/python.py b/tools/codegen/api/python.py index e7383d7cf76b..2dbf17d913ef 100644 --- a/tools/codegen/api/python.py +++ b/tools/codegen/api/python.py @@ -1,4 +1,3 @@ -import itertools from dataclasses import dataclass from typing import Optional, Union, Sequence, Set, List, Tuple, Dict @@ -734,8 +733,8 @@ def signature(f: NativeFunction, *, method: bool = False, pyi: bool = False) -> cpp_args = cpp.group_arguments(f.func, method=method, faithful=True) args = tuple(a for a in cpp_args if isinstance(a, Argument)) - input_arg_set = set(a.name for a in f.func.arguments.positional) - kwarg_only_set = set(a.name for a in f.func.arguments.kwarg_only) + input_arg_set = set(a.name for a in f.func.arguments.flat_positional) + kwarg_only_set = set(a.name for a in f.func.arguments.flat_kwarg_only) out_arg_set = set(a.name for a in f.func.arguments.out) input_args = tuple(map(argument, filter(lambda a: a.name in input_arg_set, args))) @@ -750,8 +749,7 @@ def signature(f: NativeFunction, *, method: bool = False, pyi: bool = False) -> # to the original versions in the yaml, this recreation is a potential # source of drift between eager and JIT. Pull this logic out to a shared place. - has_tensor_input_arg = any(a.type.is_tensor_like() - for a in itertools.chain(f.func.arguments.positional, f.func.arguments.kwarg_only)) + has_tensor_input_arg = any(a.type.is_tensor_like() for a in f.func.arguments.flat_non_out) if any(a.name == 'requires_grad' for a in f.func.schema_order_arguments()): raise ValueError('argument named requires_grad is reserved, should not explicitly add it in the schema') diff --git a/tools/codegen/gen.py b/tools/codegen/gen.py index 36f3041fd617..87cde48b896b 100644 --- a/tools/codegen/gen.py +++ b/tools/codegen/gen.py @@ -486,11 +486,15 @@ def gen_unstructured(self, f: NativeFunction) -> Optional[str]: cuda_guard = "" if is_generic_dispatch_key(self.dispatch_key) or is_cuda_dispatch_key(self.dispatch_key): - self_args = (a for a in f.func.arguments.positional if a.name == "self") + self_arg = [f.func.arguments.self_arg.argument] if f.func.arguments.self_arg is not None else [] # There is precedence for which argument we use to do # device guard. This describes the precedence order. 
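            # (Editorial note, not part of the original patch.) Concretely: the self argument
            # (when the operator has one) is considered first, then the out arguments, then the
            # positional arguments; the first tensor-like candidate found supplies the device
            # used for the guard.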
- candidate_args = itertools.chain(self_args, f.func.arguments.out, f.func.arguments.positional) + candidate_args = itertools.chain( + self_arg, + f.func.arguments.out, + f.func.arguments.flat_positional + ) # Only tensor like arguments are eligible device_of = next((f'{a.name}' for a in candidate_args if a.type.is_tensor_like()), None) @@ -619,8 +623,7 @@ def __call__(self, f: NativeFunction) -> Optional[str]: return None assert not f.func.is_out_fn() - assert len(f.func.arguments.positional) > 0 - assert sum(a.name == 'self' for a in f.func.arguments.positional) == 1 + assert f.func.arguments.self_arg is not None name = cpp.name(f.func) @@ -992,7 +995,7 @@ def compute_declaration_yaml(f: NativeFunction) -> object: # These sets are used to conveniently test if an argument is a # kwarg-only or out argument - kwarg_only_set = set(a.name for a in f.func.arguments.kwarg_only) + kwarg_only_set = set(a.name for a in f.func.arguments.flat_kwarg_only) out_arg_set = set(a.name for a in f.func.arguments.out) sig_group = CppSignatureGroup.from_schema(f.func, method=False) diff --git a/tools/codegen/model.py b/tools/codegen/model.py index 8b60dfb4806c..c976df117e15 100644 --- a/tools/codegen/model.py +++ b/tools/codegen/model.py @@ -398,7 +398,11 @@ class FunctionSchema: returns: Tuple['Return', ...] def schema_order_arguments(self) -> Iterator['Argument']: - return itertools.chain(self.arguments.positional, self.arguments.kwarg_only, self.arguments.out) + return itertools.chain( + self.arguments.flat_positional, + self.arguments.flat_kwarg_only, + self.arguments.out + ) @staticmethod def parse(func: str) -> 'FunctionSchema': @@ -428,7 +432,7 @@ def __post_init__(self) -> None: # This means that all mutable returns should be aliased to a keyword argument # (except for "self", which we explicitly don't treat as an out argument because of its use in methods) # See Note [is_out_fn] - out_and_self = list(self.arguments.out) + [arg for arg in self.arguments.positional if arg.name == "self"] + out_and_self = list(self.arguments.out) + [arg for arg in self.arguments.flat_positional if arg.name == "self"] mutable_returns = [ret for ret in self.returns if ret.annotation is not None and ret.annotation.is_write] for ret in mutable_returns: assert any([ret.annotation == arg.annotation for arg in out_and_self]), \ @@ -899,7 +903,14 @@ class Arguments: out: Tuple[Argument, ...] 
# these are also kwarg-only @property - def positional(self) -> Sequence[Argument]: + def flat_non_out(self) -> Sequence[Argument]: + ret: List[Argument] = [] + ret.extend(self.flat_positional) + ret.extend(self.flat_kwarg_only) + return ret + + @property + def flat_positional(self) -> Sequence[Argument]: ret: List[Argument] = [] ret.extend(self.pre_self_positional) if self.self_arg is not None: @@ -909,7 +920,7 @@ def positional(self) -> Sequence[Argument]: # NB: doesn't contain out arguments @property - def kwarg_only(self) -> Sequence[Argument]: + def flat_kwarg_only(self) -> Sequence[Argument]: ret: List[Argument] = [] ret.extend(self.pre_tensor_options_kwarg_only) if self.tensor_options is not None: @@ -1056,10 +1067,10 @@ def pred(name: str, ty: Type) -> Callable[[Argument], bool]: def __str__(self) -> str: all_arguments: List[str] = [] - all_arguments.extend(map(str, self.positional)) - if self.kwarg_only or self.out: + all_arguments.extend(map(str, self.flat_positional)) + if self.flat_kwarg_only or self.out: all_arguments.append('*') - all_arguments.extend(map(str, self.kwarg_only)) + all_arguments.extend(map(str, self.flat_kwarg_only)) all_arguments.extend(map(str, self.out)) return ', '.join(all_arguments) From 9b0ffb9fb3a3b1b47c2d39270adf4f170f6a2141 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Thu, 10 Dec 2020 18:12:21 -0800 Subject: [PATCH 158/250] Delete cpp.group_arguments (#49043) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49043 Previously, this function had nontrivial algorithmic content, but after #48195, this was just a swiss army knife for pasting together arguments while maintaining structure. I added some more properties for Arguments for convenient access in this way, and then inlined the implementation of group_arguments into all of its call sites, simplifying whenever contextual. This might be controversial, but I think the resulting code is easier to understand. You may notice that there is some modest code duplication between dispatcher.cpparguments_exprs and CppSignature.argument_packs. This is a known problem and I will be attempting to fix it in a follow up PR. Confirmed to be byte-for-byte compatible. Signed-off-by: Edward Z. 
Yang Test Plan: Imported from OSS Reviewed By: H-Huang Differential Revision: D25455885 Pulled By: ezyang fbshipit-source-id: 8fbe066e8c3cb7ee8adb5b87296ec5bd7b49e01f --- tools/codegen/api/cpp.py | 32 +++++++++----------------------- tools/codegen/api/dispatcher.py | 17 ++++++++++++----- tools/codegen/api/native.py | 10 ++++++++-- tools/codegen/api/python.py | 16 ++++++++++------ tools/codegen/api/types.py | 18 +++++++++++------- tools/codegen/gen.py | 3 ++- tools/codegen/model.py | 27 ++++++++++++++++++++++++++- 7 files changed, 78 insertions(+), 45 deletions(-) diff --git a/tools/codegen/api/cpp.py b/tools/codegen/api/cpp.py index ea7179fdc599..f2f6edb88983 100644 --- a/tools/codegen/api/cpp.py +++ b/tools/codegen/api/cpp.py @@ -256,14 +256,21 @@ def argument_not_this( def argument( a: Union[Argument, TensorOptionsArguments, SelfArgument], + *, + method: bool, ) -> Union[CppSingleArgumentPack, CppThisArgumentPack]: if isinstance(a, SelfArgument): - return CppThisArgumentPack(argument=a, type=argument_type(a.argument)) + if method: + return CppThisArgumentPack(argument=a, type=argument_type(a.argument)) + else: + return CppSingleArgumentPack(argument_not_this(a.argument)) else: return CppSingleArgumentPack(argument_not_this(a)) def argument_faithful( a: Union[Argument, TensorOptionsArguments, SelfArgument], + *, + method: bool, ) -> CppArgumentPack: if isinstance(a, TensorOptionsArguments): return CppTensorOptionsArgumentPack( @@ -274,25 +281,4 @@ def argument_faithful( pin_memory=argument_not_this(a.pin_memory), ) else: - return argument(a) - -def group_arguments( - func: FunctionSchema, *, method: bool, faithful: bool, -) -> Sequence[Union[Argument, TensorOptionsArguments, SelfArgument]]: - args: List[Union[Argument, SelfArgument, TensorOptionsArguments]] = [] - if not faithful: - args.extend(func.arguments.out) - args.extend(func.arguments.pre_self_positional) - if func.arguments.self_arg is not None: - if method: - args.append(func.arguments.self_arg) - else: - args.append(func.arguments.self_arg.argument) - args.extend(func.arguments.post_self_positional) - args.extend(func.arguments.pre_tensor_options_kwarg_only) - if func.arguments.tensor_options is not None: - args.append(func.arguments.tensor_options) - args.extend(func.arguments.post_tensor_options_kwarg_only) - if faithful: - args.extend(func.arguments.out) - return args + return argument(a, method=method) diff --git a/tools/codegen/api/dispatcher.py b/tools/codegen/api/dispatcher.py index 1193d0511281..165b68e3a830 100644 --- a/tools/codegen/api/dispatcher.py +++ b/tools/codegen/api/dispatcher.py @@ -6,7 +6,7 @@ import tools.codegen.local as local import itertools -from typing import Sequence, Optional, Tuple +from typing import Sequence, Optional, Tuple, List, Union # This file describes the translation of JIT schema to the dispatcher # API, the *unboxed* calling convention by which invocations through @@ -142,16 +142,23 @@ def cppargument_exprs( assert_never(a) def cpparguments_exprs(func: FunctionSchema, * , method: bool, api_is_faithful: bool) -> Sequence[DispatcherExpr]: - dispatcher_calling_convention_is_faithful = local.use_c10_dispatcher().dispatcher_uses_new_style() - arguments = cpp.group_arguments(func, method=method, faithful=dispatcher_calling_convention_is_faithful) + dispatcher_is_faithful = local.use_c10_dispatcher().dispatcher_uses_new_style() + + arguments: List[Union[Argument, TensorOptionsArguments, SelfArgument]] = [] + if dispatcher_is_faithful: + arguments.extend(func.arguments.non_out) + 
arguments.extend(func.arguments.out) + else: + arguments.extend(func.arguments.out) + arguments.extend(func.arguments.non_out) if api_is_faithful: argument_packs = tuple( - cpp.argument_faithful(a) for a in arguments + cpp.argument_faithful(a, method=method) for a in arguments ) else: argument_packs = tuple( - cpp.argument(a) for a in arguments + cpp.argument(a, method=method) for a in arguments ) return _cpparguments_exprs(argument_packs) diff --git a/tools/codegen/api/native.py b/tools/codegen/api/native.py index 620e1c8cbf8c..7ae0325ec324 100644 --- a/tools/codegen/api/native.py +++ b/tools/codegen/api/native.py @@ -4,7 +4,7 @@ import tools.codegen.api.cpp as cpp from tools.codegen import local -from typing import Union, Sequence, Tuple +from typing import Union, Sequence, Tuple, List # This file describes the translation of JIT schema to the native functions API. # This looks a lot like the C++ API (which makes historical sense, because the @@ -105,5 +105,11 @@ def argument(a: Union[Argument, SelfArgument, TensorOptionsArguments]) -> Sequen assert_never(a) def arguments(func: FunctionSchema) -> Tuple[NativeArgument, ...]: - args = cpp.group_arguments(func, method=False, faithful=local.use_c10_dispatcher() is UseC10Dispatcher.full) + args: List[Union[Argument, TensorOptionsArguments, SelfArgument]] = [] + if local.use_c10_dispatcher() is UseC10Dispatcher.full: + args.extend(func.arguments.non_out) + args.extend(func.arguments.out) + else: + args.extend(func.arguments.out) + args.extend(func.arguments.non_out) return tuple(i for arg in args for i in argument(arg)) diff --git a/tools/codegen/api/python.py b/tools/codegen/api/python.py index 2dbf17d913ef..45fa1685a5cf 100644 --- a/tools/codegen/api/python.py +++ b/tools/codegen/api/python.py @@ -581,8 +581,7 @@ def _cpp_signature(f: NativeFunction, *, method: bool = False) -> CppSignature: return CppSignatureGroup.from_schema(f.func, method=method).signature def has_tensor_options(f: NativeFunction) -> bool: - return any(filter(lambda a: isinstance(a, TensorOptionsArguments), - cpp.group_arguments(f.func, method=False, faithful=True))) + return f.func.arguments.tensor_options is not None # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # # @@ -726,12 +725,17 @@ def argument_type_str_pyi(t: Type, *, pyi_out_arg: bool = False) -> str: # Generates a PythonSignature that can be used for either .pyi or PythonArgParser codegen def signature(f: NativeFunction, *, method: bool = False, pyi: bool = False) -> PythonSignature: - # Use cpp api to gather TensorOptions fields from kwargs. + args: List[Argument] = [] + args.extend(f.func.arguments.pre_self_positional) # Skip SelfArgument if this is method. - # Skip TensorOptionsArguments in C++ signature. Python side TensorOptions + if not method and f.func.arguments.self_arg is not None: + args.append(f.func.arguments.self_arg.argument) + args.extend(f.func.arguments.post_self_positional) + args.extend(f.func.arguments.pre_tensor_options_kwarg_only) + # Skip TensorOptionsArguments. Python side TensorOptions # arguments are created based on different rules - see below. 
- cpp_args = cpp.group_arguments(f.func, method=method, faithful=True) - args = tuple(a for a in cpp_args if isinstance(a, Argument)) + args.extend(f.func.arguments.post_tensor_options_kwarg_only) + args.extend(f.func.arguments.out) input_arg_set = set(a.name for a in f.func.arguments.flat_positional) kwarg_only_set = set(a.name for a in f.func.arguments.flat_kwarg_only) diff --git a/tools/codegen/api/types.py b/tools/codegen/api/types.py index 6ccc470e74b6..347c4876a73a 100644 --- a/tools/codegen/api/types.py +++ b/tools/codegen/api/types.py @@ -1,6 +1,6 @@ from tools.codegen.model import * from dataclasses import dataclass -from typing import Optional, Union, Sequence, Tuple, TypeVar +from typing import Optional, Union, Sequence, Tuple, TypeVar, List _T = TypeVar('_T') @@ -167,21 +167,25 @@ def arguments(self) -> Sequence[CppArgument]: # high-level structure of the arguments so you may find it easier to do # translations working with this representation. def argument_packs(self) -> Sequence[CppArgumentPack]: - grouped_args = cpp.group_arguments(self.func, method=self.method, faithful=self.faithful) + arguments: List[Union[Argument, TensorOptionsArguments, SelfArgument]] = [] + if self.faithful: + arguments.extend(self.func.arguments.non_out) + arguments.extend(self.func.arguments.out) + else: + arguments.extend(self.func.arguments.out) + arguments.extend(self.func.arguments.non_out) + if self.faithful: - # Faithful signatures will ungroup arguments into argument - # packs. - # # After this, manually do overload disambiguation, by # dropping defaults from the faithful signature. In # principle, we should be able to do this at some later # point in time with other overload disambiguation argument_packs = tuple( - cpp.argument_faithful(a).no_default() for a in grouped_args + cpp.argument_faithful(a, method=self.method).no_default() for a in arguments ) else: argument_packs = tuple( - cpp.argument(a) for a in grouped_args + cpp.argument(a, method=self.method) for a in arguments ) return argument_packs diff --git a/tools/codegen/gen.py b/tools/codegen/gen.py index 87cde48b896b..14bbadce1570 100644 --- a/tools/codegen/gen.py +++ b/tools/codegen/gen.py @@ -1017,7 +1017,8 @@ def compute_declaration_yaml(f: NativeFunction) -> object: ] cpp_schema_order_types = [ - cpp.argument(a).type for a in schema_order_jit_arguments + # NB: method here doesn't matter + cpp.argument(a, method=False).type for a in schema_order_jit_arguments ] cpp_returns = cpp.returns_type(f.func.returns) diff --git a/tools/codegen/model.py b/tools/codegen/model.py index c976df117e15..c0fc94570d94 100644 --- a/tools/codegen/model.py +++ b/tools/codegen/model.py @@ -1,7 +1,7 @@ import re from dataclasses import dataclass -from typing import List, Dict, Optional, Iterator, Tuple, Set, NoReturn, Sequence, Callable +from typing import List, Dict, Optional, Iterator, Tuple, Set, NoReturn, Sequence, Callable, Union from enum import Enum import itertools @@ -928,6 +928,31 @@ def flat_kwarg_only(self) -> Sequence[Argument]: ret.extend(self.post_tensor_options_kwarg_only) return ret + @property + def non_out(self) -> Sequence[Union[Argument, SelfArgument, TensorOptionsArguments]]: + ret: List[Union[Argument, SelfArgument, TensorOptionsArguments]] = [] + ret.extend(self.positional) + ret.extend(self.kwarg_only) + return ret + + @property + def positional(self) -> Sequence[Union[Argument, SelfArgument]]: + ret: List[Union[Argument, SelfArgument]] = [] + ret.extend(self.pre_self_positional) + if self.self_arg is not None: + 
ret.append(self.self_arg) + ret.extend(self.post_self_positional) + return ret + + @property + def kwarg_only(self) -> Sequence[Union[Argument, TensorOptionsArguments]]: + ret: List[Union[Argument, TensorOptionsArguments]] = [] + ret.extend(self.pre_tensor_options_kwarg_only) + if self.tensor_options is not None: + ret.append(self.tensor_options) + ret.extend(self.post_tensor_options_kwarg_only) + return ret + def signature(self) -> 'Arguments': # dataclasses.replace could be used here, but it is less # type safe so for now I've opted to type everything out From 5469aa5e7f2740ccd2bd2cb6bc30b9e38731be08 Mon Sep 17 00:00:00 2001 From: Nick Gibson Date: Thu, 10 Dec 2020 19:42:12 -0800 Subject: [PATCH 159/250] [NNC] Add a non functional Tensor kind (#48750) Summary: Adds the CompoundTensor, a specialisation of the NNC Tensor which allows arbitrary production statements. This will allow lowering of aten ops into specific NNC IR patterns (which don't need to be functional) - allowing us to shortcut to the optimized form of common patterns. This is part 1 of trying to clean up the lowering of aten::cat so it is easier to optimize. Pull Request resolved: https://github.com/pytorch/pytorch/pull/48750 Reviewed By: tugsbayasgalan Differential Revision: D25433517 Pulled By: nickgg fbshipit-source-id: de13c4719f8f87619ab254e5f324f13b5be1c9da --- test/cpp/tensorexpr/test_loopnest.cpp | 86 ++++++++++++++++++++++++ torch/csrc/jit/tensorexpr/codegen.h | 2 +- torch/csrc/jit/tensorexpr/ir_printer.cpp | 5 ++ torch/csrc/jit/tensorexpr/loopnest.cpp | 11 ++- torch/csrc/jit/tensorexpr/tensor.cpp | 2 +- torch/csrc/jit/tensorexpr/tensor.h | 28 ++++++-- 6 files changed, 127 insertions(+), 7 deletions(-) diff --git a/test/cpp/tensorexpr/test_loopnest.cpp b/test/cpp/tensorexpr/test_loopnest.cpp index 19b466dfb604..aa44da858abf 100644 --- a/test/cpp/tensorexpr/test_loopnest.cpp +++ b/test/cpp/tensorexpr/test_loopnest.cpp @@ -3649,5 +3649,91 @@ TEST(LoopNest, DeadStoreEliminationWithIntermediates) { torch::jit::testing::FileCheck().run(expected_ir2, oss.str()); } +TEST(LoopNest, CompoundTensorSimple) { + KernelScope kernel_scope; + + BufHandle a_buf("A", {10, 5}, kInt); + VarHandle i("i", kInt); + VarHandle j("j", kInt); + VarHandle x("x", kInt); + VarHandle y("y", kInt); + auto for_body1 = Block::make({Store::make(a_buf, {i, j}, i * j, 1)}); + auto inner_for1 = For::make(j, 0, 5, for_body1); + auto outer_for1 = For::make(i, 0, 10, inner_for1); + auto for_body2 = Block::make( + {Store::make(a_buf, {x, y}, Load::make(a_buf, {x, y}, 1) + x + y, 1)}); + auto inner_for2 = For::make(y, 0, 5, for_body2); + auto outer_for2 = For::make(x, 0, 10, inner_for2); + Block* body = Block::make({outer_for1, outer_for2}); + + Tensor* A = new CompoundTensor(a_buf.node(), {i.node(), j.node()}, body); + + LoopNest l({A}); + l.prepareForCodegen(); + + std::vector a_data(50, 0); + + Stmt* s = IRSimplifier::simplify(l.root_stmt()); + SimpleIREvaluator cg(s, {A}); + + std::vector a_ref(50, 0); + + for (int i = 0; i < 10; ++i) { + for (int j = 0; j < 5; ++j) { + a_ref[i * 5 + j] = (i * j) + i + j; + } + } + cg.call({a_data}); + + assertAllEqual(a_data, a_ref); +} + +TEST(LoopNest, CompoundTensorUsed) { + KernelScope kernel_scope; + + BufHandle a_buf("A", {10, 5}, kInt); + VarHandle i("i", kInt); + VarHandle j("j", kInt); + VarHandle x("x", kInt); + VarHandle y("y", kInt); + auto for_body1 = Block::make({Store::make(a_buf, {i, j}, i * j, 1)}); + auto inner_for1 = For::make(j, 0, 5, for_body1); + auto outer_for1 = For::make(i, 0, 10, inner_for1); 
+ auto for_body2 = Block::make( + {Store::make(a_buf, {x, y}, Load::make(a_buf, {x, y}, 1) + x + y, 1)}); + auto inner_for2 = For::make(y, 0, 5, for_body2); + auto outer_for2 = For::make(x, 0, 10, inner_for2); + Block* body = Block::make({outer_for1, outer_for2}); + + Tensor* A = new CompoundTensor(a_buf.node(), {i.node(), j.node()}, body); + Tensor* B = Compute( + "B", {{10, "i"}, {3, "j"}}, [&](const VarHandle& i, const VarHandle& j) { + return A->call(i, j + 1) + A->call(i, j + 2); + }); + + LoopNest l({B}); + ASSERT_FALSE(l.computeInline(A->buf())); + l.prepareForCodegen(); + + std::vector a_data(50, 0); + std::vector b_data(50, 0); + + Stmt* s = IRSimplifier::simplify(l.root_stmt()); + std::cout << *s << "\n "; + SimpleIREvaluator cg(s, {B}); + + std::vector b_ref(50, 0); + + auto AT = [](int i, int j) { return i * j + i + j; }; + for (int i = 0; i < 10; ++i) { + for (int j = 0; j < 3; ++j) { + b_ref[i * 3 + j] = AT(i, j + 1) + AT(i, j + 2); + } + } + cg.call({b_data}); + + assertAllEqual(b_data, b_ref); +} + } // namespace jit } // namespace torch diff --git a/torch/csrc/jit/tensorexpr/codegen.h b/torch/csrc/jit/tensorexpr/codegen.h index 6bf3456e3b85..e16a9e2c5d31 100644 --- a/torch/csrc/jit/tensorexpr/codegen.h +++ b/torch/csrc/jit/tensorexpr/codegen.h @@ -82,7 +82,7 @@ class CodeGen::BufferArg { BufferArg(const Placeholder& buffer) : var_(buffer.data()->base_handle()), dtype_(buffer.dtype()) {} BufferArg(Tensor* tensor) - : var_(tensor->buf()->base_handle()), dtype_(tensor->body()->dtype()) {} + : var_(tensor->buf()->base_handle()), dtype_(tensor->buf()->dtype()) {} BufferArg(const VarHandle& var) : var_(var.node()), dtype_(var.dtype()), isVar_(true) {} diff --git a/torch/csrc/jit/tensorexpr/ir_printer.cpp b/torch/csrc/jit/tensorexpr/ir_printer.cpp index 848bd70cf5c7..1df2f96671df 100644 --- a/torch/csrc/jit/tensorexpr/ir_printer.cpp +++ b/torch/csrc/jit/tensorexpr/ir_printer.cpp @@ -596,6 +596,11 @@ std::string to_string(const Tensor* t) { return "(null tensor)\n"; } std::ostringstream oss; + if (!t->body()) { + oss << "Tensor " << t->buf()->name_hint() << " = " << *t->ElementStmt() + << "\n"; + return oss.str(); + } oss << "Tensor " << t->buf()->name_hint() << "("; for (size_t i = 0; i < t->ndim(); i++) { if (i != 0) { diff --git a/torch/csrc/jit/tensorexpr/loopnest.cpp b/torch/csrc/jit/tensorexpr/loopnest.cpp index 96df28625bec..0bff2dbf75c7 100644 --- a/torch/csrc/jit/tensorexpr/loopnest.cpp +++ b/torch/csrc/jit/tensorexpr/loopnest.cpp @@ -398,7 +398,11 @@ class DepTracker : public IRVisitor { public: std::vector findUsedTensors(Tensor* tensor) { used_tensors.clear(); - tensor->body()->accept(this); + if (tensor->body()) { + tensor->body()->accept(this); + } else { + tensor->ElementStmt()->accept(this); + } return used_tensors; } @@ -505,6 +509,11 @@ LoopNest::LoopNest(const std::vector& output_tensors) { Stmt* LoopNest::lowerToStmt(Tensor* t) { Stmt* body = t->ElementStmt(); + // If this Tensor has no functional body, it already has its axes expanded. 
+ if (nullptr == t->body()) { + return body; + } + if (t->ndim() == 0 && t->reduce_ndim() == 0) { return body; } diff --git a/torch/csrc/jit/tensorexpr/tensor.cpp b/torch/csrc/jit/tensorexpr/tensor.cpp index 4afc1ffeefb5..d12f6999c8d5 100644 --- a/torch/csrc/jit/tensorexpr/tensor.cpp +++ b/torch/csrc/jit/tensorexpr/tensor.cpp @@ -86,7 +86,7 @@ Tensor* Compute( return new Tensor(func_name, dims, args_nodes, body); } -Stmt* Tensor::ElementStmt() { +Stmt* Tensor::ElementStmt() const { std::vector indices; for (size_t i = 0; i < buf_->ndim(); i++) { indices.push_back(args_[i]); diff --git a/torch/csrc/jit/tensorexpr/tensor.h b/torch/csrc/jit/tensorexpr/tensor.h index d37f14c3a606..e5e399db348b 100644 --- a/torch/csrc/jit/tensorexpr/tensor.h +++ b/torch/csrc/jit/tensorexpr/tensor.h @@ -12,7 +12,7 @@ namespace torch { namespace jit { namespace tensorexpr { -class Tensor : KernelScopedObject { +class TORCH_API Tensor : KernelScopedObject { public: Tensor( const std::string& name, @@ -27,7 +27,7 @@ class Tensor : KernelScopedObject { : buf_(buf), args_(args), body_(body) {} Tensor( - Buf* buf, + const Buf* buf, const std::vector& args, const std::vector& reduce_dims, const std::vector& reduce_args, @@ -38,6 +38,8 @@ class Tensor : KernelScopedObject { reduce_dims_(reduce_dims), reduce_args_(reduce_args) {} + virtual ~Tensor() {} + // Wrappers over accessors to fields of the underlying function const Expr* body() const { return body_; @@ -94,7 +96,7 @@ class Tensor : KernelScopedObject { const Expr* initializer() const { return initializer_; } - Stmt* ElementStmt(); + virtual Stmt* ElementStmt() const; template inline ExprHandle operator()(const Ts&... ts); @@ -113,6 +115,24 @@ class Tensor : KernelScopedObject { const Expr* initializer_{nullptr}; }; +class TORCH_API CompoundTensor : public Tensor { + public: + CompoundTensor( + const Buf* buf, + const std::vector& args, + Stmt* stmt) + : Tensor(buf, args, {}, {}, nullptr), stmt_(stmt) {} + + virtual ~CompoundTensor() {} + + Stmt* ElementStmt() const override { + return stmt_; + } + + private: + Stmt* stmt_; +}; + class Placeholder { public: Placeholder(const BufHandle& data) : data_(data.node()) { @@ -306,7 +326,7 @@ class FunctionCall : public CallNode { } FunctionCall(Tensor* tensor, const std::vector& params) - : BaseClass(tensor->body()->dtype(), kFunctionCall, params), + : BaseClass(tensor->buf()->dtype(), kFunctionCall, params), tensor_(tensor) {} private: From 159f258415364f73a20ecd2b15b9f2e52ef1b922 Mon Sep 17 00:00:00 2001 From: Ilia Cherniavskii Date: Thu, 10 Dec 2020 19:47:52 -0800 Subject: [PATCH 160/250] Update Kineto revision (#49200) Summary: Updating to a newer revision Pull Request resolved: https://github.com/pytorch/pytorch/pull/49200 Test Plan: USE_KINETO=1 TORCH_CUDA_ARCH_LIST="6.0;7.0" USE_CUDA=1 USE_MKLDNN=1 BUILD_BINARY=1 python setup.py develop install --cmake python test/test_profiler.py python test/test_autograd.py -k test_profile python test/test_autograd.py -k test_record Fixes #{issue number} Reviewed By: ngimel Differential Revision: D25480439 Pulled By: ilia-cher fbshipit-source-id: bca1f708f5e4a052028304b918a3adae9324318f --- third_party/kineto | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/kineto b/third_party/kineto index bf384310eafa..e9198dd3066e 160000 --- a/third_party/kineto +++ b/third_party/kineto @@ -1 +1 @@ -Subproject commit bf384310eafa674a1cc83be71f52ecb320ccdf84 +Subproject commit e9198dd3066ee6e5e20201d6ae6f86f092bb7123 From cc3b59f6dfe67a65d7d8e50575fac5e070556094 Mon 
Sep 17 00:00:00 2001 From: Zachary DeVito Date: Thu, 10 Dec 2020 20:59:56 -0800 Subject: [PATCH 161/250] [package] use bazel-style glob matching for mock/extern (#49066) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49066 This PR tweaks mock_module and extern_module. They are now renamed mock and extern, and now only edit the package when a module matching the pattern specified is required through dependency analysis. save_extern_module and save_mock_module are added to explicitly modify the package, but should not be needed by most users of the API unless they are overriding require_package. mock and extern now use bazel-style glob matching rules (https://docs.bazel.build/versions/master/be/functions.html#glob). i.e. `torch.**` matches `torch` and `torch.bar` but not `torchvision`. mock and extern also now take an exclude list to filter out packages that should not apply to the action. Test Plan: Imported from OSS Reviewed By: suo Differential Revision: D25413935 Pulled By: zdevito fbshipit-source-id: 5c06b417bee94ac8e72c13985b5ec42fcbe00817 --- test/test_package.py | 37 +++++---- torch/package/exporter.py | 154 ++++++++++++++++++++++---------------- 2 files changed, 110 insertions(+), 81 deletions(-) diff --git a/test/test_package.py b/test/test_package.py index 894ec8783f1b..ee9140d661b9 100644 --- a/test/test_package.py +++ b/test/test_package.py @@ -119,7 +119,9 @@ def test_resources(self): def test_extern(self): filename = self.temp() with PackageExporter(filename, verbose=False) as he: - he.extern_modules(['package_a.subpackage', 'module_a']) + he.extern(['package_a.subpackage', 'module_a']) + he.require_module('package_a.subpackage') + he.require_module('module_a') he.save_module('package_a') hi = PackageImporter(filename) import package_a.subpackage @@ -136,7 +138,7 @@ def test_extern(self): def test_extern_glob(self): filename = self.temp() with PackageExporter(filename, verbose=False) as he: - he.extern_modules(['package_a.*', 'module_*']) + he.extern(['package_a.*', 'module_*']) he.save_module('package_a') he.save_source_string('test_module', """\ import package_a.subpackage @@ -158,8 +160,10 @@ def test_extern_glob(self): def test_mock(self): filename = self.temp() with PackageExporter(filename, verbose=False) as he: - he.mock_modules(['package_a.subpackage', 'module_a']) + he.mock(['package_a.subpackage', 'module_a']) he.save_module('package_a') + he.require_module('package_a.subpackage') + he.require_module('module_a') hi = PackageImporter(filename) import package_a.subpackage _ = package_a.subpackage @@ -175,7 +179,7 @@ def test_mock(self): def test_mock_glob(self): filename = self.temp() with PackageExporter(filename, verbose=False) as he: - he.mock_modules(['package_a.*', 'module*']) + he.mock(['package_a.*', 'module*']) he.save_module('package_a') he.save_source_string('test_module', """\ import package_a.subpackage @@ -199,7 +203,7 @@ def test_custom_requires(self): class Custom(PackageExporter): def require_module(self, name, dependencies): if name == 'module_a': - self.mock_module('module_a') + self.save_mock_module('module_a') elif name == 'package_a': self.save_source_string('package_a', 'import module_a\nresult = 5\n') else: @@ -354,19 +358,22 @@ def load(): self.assertTrue(torch.allclose(*results)) def test_module_glob(self): - from torch.package.exporter import _module_glob_to_re + from torch.package.exporter import _GlobGroup - def check(pattern, should_match, should_not_match): - x = _module_glob_to_re(pattern) + def check(include, 
exclude, should_match, should_not_match): + x = _GlobGroup(include, exclude) for e in should_match: - self.assertTrue(x.fullmatch(e)) + self.assertTrue(x.matches(e)) for e in should_not_match: - self.assertFalse(x.fullmatch(e)) - - check('torch.*', ['torch.foo', 'torch.bar'], ['tor.foo', 'torch.foo.bar', 'torch']) - check('torch.**', ['torch.foo', 'torch.bar', 'torch.foo.bar'], ['tor.foo', 'torch']) - check('torch.*.foo', ['torch.w.foo'], ['torch.hi.bar.baz']) - check('torch.**.foo', ['torch.w.foo', 'torch.hi.bar.foo'], ['torch.f.foo.z']) + self.assertFalse(x.matches(e)) + + check('torch.*', [], ['torch.foo', 'torch.bar'], ['tor.foo', 'torch.foo.bar', 'torch']) + check('torch.**', [], ['torch.foo', 'torch.bar', 'torch.foo.bar', 'torch'], ['what.torch', 'torchvision']) + check('torch.*.foo', [], ['torch.w.foo'], ['torch.hi.bar.baz']) + check('torch.**.foo', [], ['torch.w.foo', 'torch.hi.bar.foo'], ['torch.f.foo.z']) + check('torch*', [], ['torch', 'torchvision'], ['torch.f']) + check('torch.**', ['torch.**.foo'], ['torch', 'torch.bar', 'torch.barfoo'], ['torch.foo', 'torch.some.foo']) + check('**.torch', [], ['torch', 'bar.torch'], ['visiontorch']) if __name__ == '__main__': main() diff --git a/torch/package/exporter.py b/torch/package/exporter.py index b0d1f5472e16..11404039d546 100644 --- a/torch/package/exporter.py +++ b/torch/package/exporter.py @@ -1,5 +1,5 @@ import torch -from torch.serialization import normalize_storage_type, location_tag, _should_read_directly +from torch.serialization import normalize_storage_type, location_tag import io import pickletools from .find_file_dependencies import find_files_source_depends_on @@ -7,7 +7,7 @@ from ._importlib import _normalize_path import types import importlib -from typing import List, Any, Callable, Dict, Tuple +from typing import List, Any, Callable, Dict, Tuple, Union, Iterable from distutils.sysconfig import get_python_lib from pathlib import Path import linecache @@ -211,7 +211,7 @@ def require_module(self, module_name: str, dependencies=True): of modules""" for pattern, action in self.patterns: - if pattern.fullmatch(module_name): + if pattern.matches(module_name): action(module_name) return @@ -220,7 +220,7 @@ def require_module(self, module_name: str, dependencies=True): if self.verbose: print(f'implicitly adding {root_name} to external modules ' f'since it is part of the standard library and is a dependency.') - self.extern_module(root_name) + self.save_extern_module(root_name) return self.save_module(module_name, dependencies) @@ -303,70 +303,64 @@ def save_binary(self, package, resource, binary: bytes): filename = self._filename(package, resource) self._write(filename, binary) - def extern_module(self, module_name: str): + def mock(self, include: 'GlobPattern', *, exclude: 'GlobPattern' = ()): + """Replace some required modules with a mock implementation. Mocked modules will return a fake + object for any attribute accessed from it. Because we copy file-by-file, the dependency resolution will sometimes + find files that are imported by model files but whose functionality is never used + (e.g. custom serialization code or training helpers). + Use this function to mock this functionality out without having to modify the original code. + + Args: + include (Union[List[str], str]): A string e.g. "my_package.my_subpackage", or list of strings + for the names of the modules to be mocked out. Strings can also be a glob-style pattern + string that may match multiple modules. 
Any required dependencies that match this pattern + string will be mocked out automatically. + + Examples: + 'torch.**' -- matches torch and all submodules of torch, e.g. 'torch.nn' and torch.nn.functional' + 'torch.*' -- matches 'torch.nn' or 'torch.functional', but not 'torch.nn.functional' + + exclude (Union[List[str], str]): An optional pattern that excludes some patterns that match the include string. + e.g. include='torch.**', exclude='torch.foo' will mock all torch packages except 'torch.foo' Default: [] + + """ + self.patterns.append((_GlobGroup(include, exclude), self.save_mock_module)) + + def extern(self, include: 'GlobPattern', *, exclude: 'GlobPattern' = ()): """Include `module` in the list of external modules the package can import. This will prevent dependency discover from saving it in the package. The importer will load an external module directly from the standard import system. Code for extern modules must also exist in the process loading the package. Args: - module_name (str): e.g. "my_package.my_subpackage" the name of the external module. - This can also be a glob-style pattern, as described in :meth:`mock_module` - """ - if self._add_if_pattern(module_name, self.extern_module): - return - - if module_name not in self.external: - self.external.append(module_name) + include (Union[List[str], str]): A string e.g. "my_package.my_subpackage", or list of strings + for the names of the modules to be externed. This can also be a glob-style pattern, as described in :meth:`mock` - def extern_modules(self, module_names: List[str]): - """Extern a list of modules. Convience wrapper for calling :meth:`extern_module` on many items. + exclude (Union[List[str], str]): An optional pattern that excludes some patterns that match the include string. - Args: - module_names (List[str]): List of module names """ - for m in module_names: - self.extern_module(m) + self.patterns.append((_GlobGroup(include, exclude), self.save_extern_module)) - def mock_module(self, module_name: str): - """Replace the code for `module_name` in the package with a fake implementation. This module will return a fake - object for any attribute accessed from it. Because we copy file-by-file, the dependency resolution will sometimes - find files that are imported by model files but whose functionality is never used - (e.g. custom serialization code or training helpers). - Use this function to mock this functionality out without having to modify the original code. + def save_extern_module(self, module_name: str): + """Add `module_name` to the list of external modules, regardless of whether it is + required by other modules. - Args: - module_name (str): e.g. "my_package.my_subpackage" the name of the module to be mocked out. - The module_name can also be a glob-style pattern string that may match multiple modules. - Any required dependencies that match this pattern string will be mocked out automatically. - Examples: - 'torch.**' -- matches all submodules of torch, e.g. 'torch.nn' and torch.nn.functional' - 'torch.*' -- matches 'torch.nn' or 'torch.functional', but not 'torch.nn.functional' + Prefer using `extern` to only mark modules extern if they are actually required by the packaged code. 
""" - if self._add_if_pattern(module_name, self.mock_module): - return + if module_name not in self.external: + self.external.append(module_name) + def save_mock_module(self, module_name: str): + """Add `module_name` to the package, implemented it with a mocked out version that + can be imported but does not include any implementations. + + Prefer using `mock` to only include this module if it is required by other modules. + """ if '_mock' not in self.provided: self.save_source_file('_mock', str(Path(__file__).parent / '_mock.py'), dependencies=False) is_package = hasattr(self._import_module(module_name), '__path__') self.save_source_string(module_name, _MOCK_IMPL, is_package, dependencies=False) - - def mock_modules(self, module_names): - """Mock a list of modules. Convience wrapper for calling :meth:`mock_module` on many items. - - Args: - module_names (List[str]): List of module names - """ - for module_name in module_names: - self.mock_module(module_name) - - def _add_if_pattern(self, potential_pattern: str, action: Callable[[str], None]): - if '*' in potential_pattern or '?' in potential_pattern: - self.patterns.append((_module_glob_to_re(potential_pattern), action)) - return True - return False - def _module_is_already_provided(self, qualified_name: str) -> bool: for mod in self.external: if qualified_name == mod or qualified_name.startswith(mod + '.'): @@ -417,16 +411,12 @@ def close(self): for key in sorted(self.serialized_storages.keys()): name = 'data/{}'.format(key) storage = self.serialized_storages[key] - if storage.device.type == 'cpu': - # If it's on the CPU we can directly copy it into the zip file - num_bytes = storage.size() * storage.element_size() - self.zip_file.write_record(name, storage.data_ptr(), num_bytes) - else: - # Copy to a buffer, then serialize that - buf = io.BytesIO() - storage._write_file(buf, _should_read_directly(buf)) - buf_value = buf.getvalue() - self._write(name, buf_value) + # location information is saved in python, but to actually + # get the data from non cpu tensors we need to move them over first + if storage.device.type != 'cpu': + storage = storage.cpu() + num_bytes = storage.size() * storage.element_size() + self.zip_file.write_record(name, storage.data_ptr(), num_bytes) contents = ('\n'.join(self.external) + '\n') self._write('extern_modules', contents) del self.zip_file @@ -441,7 +431,6 @@ def _can_implicitly_extern(self, module_name: str): return module_name == 'torch' or (module_name not in _DISALLOWED_MODULES and _is_builtin_or_stdlib_module(self._import_module(module_name))) - # even though these are in the standard library, we do not allow them to be # automatically externed since they offer a lot of system level access _DISALLOWED_MODULES = ['sys', 'io'] @@ -471,8 +460,41 @@ def _read_file(filename: str) -> str: b = f.read() return b.decode('utf-8') -_glob_re_filter = {'**': '.*', '*': '[^.]*', '?': '.', '.': '\\.'} -_glob_split = re.compile(f'({"|".join(re.escape(x) for x in _glob_re_filter.keys())})') -def _module_glob_to_re(module_name): - pattern = ''.join(_glob_re_filter.get(x, x) for x in _glob_split.split(module_name)) - return re.compile(pattern) +GlobPattern = Union[str, Iterable[str]] + + +class _GlobGroup: + def __init__(self, include: 'GlobPattern', exclude: 'GlobPattern'): + self._dbg = f'_GlobGroup(include={include}, exclude={exclude})' + self.include = _GlobGroup._glob_list(include) + self.exclude = _GlobGroup._glob_list(exclude) + + def __str__(self): + return self._dbg + + def matches(self, candidate: str) -> bool: 
+ candidate = '.' + candidate + return any(p.fullmatch(candidate) for p in self.include) and all(not p.fullmatch(candidate) for p in self.exclude) + + @staticmethod + def _glob_list(elems: 'GlobPattern'): + if isinstance(elems, str): + return [_GlobGroup._glob_to_re(elems)] + else: + return [_GlobGroup._glob_to_re(e) for e in elems] + + @staticmethod + def _glob_to_re(pattern: str): + # to avoid corner cases for the first component, we prefix the candidate string + # with '.' so `import torch` will regex against `.torch` + def component_to_re(component): + if '**' in component: + if component == '**': + return '(\\.[^.]+)*' + else: + raise ValueError('** can only appear as an entire path segment') + else: + return '\\.' + '[^.]*'.join(re.escape(x) for x in component.split('*')) + + result = ''.join(component_to_re(c) for c in pattern.split('.')) + return re.compile(result) From 696e30af6e004f6346c0070770ce242a1a3b133a Mon Sep 17 00:00:00 2001 From: Rohan Varma Date: Thu, 10 Dec 2020 21:07:39 -0800 Subject: [PATCH 162/250] Fix ProcessGroupNCCL profiling when profiler is not run with use_cuda (#48946) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48946 Move recordFunctionEndCallback to after the blocking portion of launching the NCCL kernel, and remove addCallback since it runs the lambda inline anyways, and triggers unnecessary CUDA stream logic. If we want CUDA operations such as NCCL kernels accurately profiled, we should use the profiler with use_cuda=True. However, we are currently debugging a deadlock for the use_cuda=True case, fix is being tracked in #48987. To ensure that the tests are no longer flaky, submitted this PR to ci-all: #48947 and ran the test a bunch of times ssh'd into the CI machine. ghstack-source-id: 118330130 Test Plan: Ci Reviewed By: mrzzd Differential Revision: D25368322 fbshipit-source-id: 7d17036248a3dcd855e58addc383bba64d6bc391 --- torch/lib/c10d/ProcessGroupNCCL.cpp | 20 +++++++------- .../_internal/distributed/distributed_test.py | 27 +++++++++---------- 2 files changed, 23 insertions(+), 24 deletions(-) diff --git a/torch/lib/c10d/ProcessGroupNCCL.cpp b/torch/lib/c10d/ProcessGroupNCCL.cpp index d9e33ddc33c9..5152ce01e25e 100644 --- a/torch/lib/c10d/ProcessGroupNCCL.cpp +++ b/torch/lib/c10d/ProcessGroupNCCL.cpp @@ -1049,15 +1049,6 @@ c10::intrusive_ptr ProcessGroupNCCL::collective( // Store references to outputs to be used by WorkNCCL::getFuture. work->outputs_ = std::make_shared>(outputs); - if (work->recordFunctionEndCallback_) { - // recordFunctionEndCallback_ is normally called in fininsh() function by - // base class, but since finish is not called by WorkNCCL, we run this - // function now. - // Note when can_profile is false, profilingTitle is not provided and so, - // recordFunctionEndCallback_ is not set. - work->recordFunctionEndCallback_(); - } - at::cuda::OptionalCUDAGuard gpuGuard; pre(ncclStreams_[key]); @@ -1102,6 +1093,17 @@ c10::intrusive_ptr ProcessGroupNCCL::collective( work->opTimeout_ = opTimeout_; work->store_ = store_; + if (work->recordFunctionEndCallback_) { + // recordFunctionEndCallback_ is normally called in fininsh() function by + // base class, but since finish is not called by WorkNCCL, we schedule this + // function to be run when work is done. Note that addCallback() onto the + // Work's futureNCCL is not useful here, as it would just run the callback + // inline. + // Note when can_profile is false, profilingTitle is not provided and so, + // recordFunctionEndCallback_ is not set. 
+ work->recordFunctionEndCallback_(); + } + if (asyncErrorHandling_) { workEnqueue(work); } diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py index cbe8e9d630bf..5577d2322679 100644 --- a/torch/testing/_internal/distributed/distributed_test.py +++ b/torch/testing/_internal/distributed/distributed_test.py @@ -67,6 +67,12 @@ def __eq__(self, other): [1, 2, True, "string", [4, 5, "nested"]], ] +# Allowlist of distributed backends where profiling collectives is supported. +PROFILING_SUPPORTED_BACKENDS = [ + dist.Backend.NCCL, + dist.Backend.GLOO, +] + # Dummy NamedTuple data structures to test DDP support for NamedTuple types. EXPECTED_FIELDS = ("a", "b") TestNamedTupleInput_0 = namedtuple("NamedTuple", EXPECTED_FIELDS) @@ -1283,7 +1289,7 @@ def test_all_reduce_result_cuda(self): self.assertEqual(result, [_build_tensor(src + 1, expected_value)]) self._barrier() - def call_dist_op(self, profiling_title_postfix, is_async, op, *args, expect_event=False, secondary_op_call=None, **kwargs): + def call_dist_op(self, profiling_title_postfix, is_async, op, *args, expect_event=True, secondary_op_call=None, **kwargs): op_calls = [lambda: op(*args, **kwargs)] if secondary_op_call is not None: op_calls.append(secondary_op_call) @@ -1297,12 +1303,12 @@ def call_dist_op(self, profiling_title_postfix, is_async, op, *args, expect_even def get_event(postfix): return [event for event in prof.function_events if event.name.endswith(postfix)] - if expect_event: + if expect_event and dist.get_backend() in PROFILING_SUPPORTED_BACKENDS: events = get_event(profiling_title_postfix) self.assertEqual(len(events), len(op_calls)) for e in events: self.assertEqual(e.count, 1) - self.assertGreater(e.cpu_time, 0) + self.assertGreaterEqual(e.cpu_time, 0) # ALL REDUCE def _test_all_reduce_helper( @@ -2474,12 +2480,9 @@ def _test_reduce_multigpu_helper( _build_tensor(src + 1, master_value).cuda(device=i) for i in rank_to_GPU[rank] ] - # TODO: Setting expect_event=False to disable profiling - # tests. Once https://github.com/pytorch/pytorch/issues/48127 - # is addressed, this should be reverted. self.call_dist_op( "reduce", False, dist.reduce_multigpu, tensors, src, op, group_id, - expect_event=False) + expect_event=len(tensors) == 1) expected_tensor = _build_tensor(src + 1, expected_value) self.assertEqual(tensors[0], expected_tensor) else: @@ -2487,12 +2490,9 @@ def _test_reduce_multigpu_helper( _build_tensor(src + 1, worker_value).cuda(device=i) for i in rank_to_GPU[rank] ] - # TODO: Setting expect_event=False to disable profiling - # tests. Once https://github.com/pytorch/pytorch/issues/48127 - # is addressed, this should be reverted. self.call_dist_op( "reduce", False, dist.reduce_multigpu, tensors, src, op, group_id, - expect_event=False) + expect_event=len(tensors) == 1) self._barrier() @@ -2532,13 +2532,10 @@ def _test_all_gather_multigpu_helper(self, group, group_id, rank, rank_to_GPU, d for gpu in rank_to_GPU[rank]: output_tensors.append([t.cuda(device=gpu) for t in output_per_gpu]) expected_output.append([t.cuda(device=gpu) for t in expected_per_gpu]) - # TODO: Setting expect_event=False to disable profiling - # tests. Once https://github.com/pytorch/pytorch/issues/48127 - # is addressed, this should be reverted. 
self.call_dist_op( "all_gather", False, dist.all_gather_multigpu, output_tensors, tensors, group_id, - expect_event=False) + expect_event=len(expected_output) == 1) self.assertEqual(output_tensors, expected_output) self._barrier() From 743a4ef0aebf287ceab82da91fea4722cd48dfb3 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Thu, 10 Dec 2020 21:49:53 -0800 Subject: [PATCH 163/250] [PyTorch] Enable AutoNonVariableTypeMode in static runtime (#49199) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49199 This should save us an extra round of dispatch for resize_, resize_as_, detach_, and copy_, at the cost of disabling profiling and tracing. I'm told that static runtime has its own per-op profiling and we don't need tracing. ghstack-source-id: 118348314 Test Plan: Code review to confirm lack of need for profiling & tracing, and that there isn't a different switch we should be using instead. Internal benchmarks -- seeing 11-12% improvement in overall runtime Reviewed By: hlu1 Differential Revision: D25476819 fbshipit-source-id: 71e2c919b386b25c41084e2e4a54fe765a4f8f22 --- torch/csrc/jit/runtime/static/impl.cpp | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/torch/csrc/jit/runtime/static/impl.cpp b/torch/csrc/jit/runtime/static/impl.cpp index b95263f71e81..ffa2ab4f7ec4 100644 --- a/torch/csrc/jit/runtime/static/impl.cpp +++ b/torch/csrc/jit/runtime/static/impl.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -424,6 +425,12 @@ std::vector StaticRuntime::run( c10::IValue StaticRuntime::run( const std::vector& args, const std::unordered_map& kwargs) { + // We assume inference workloads, so we do not need + // autograd. Enabling this is a significant win on dispatcher + // overhead because it saves a round of dispatch for at least some + // functions, such as resize_ and resize_as_. + at::AutoNonVariableTypeMode non_var_type_mode(true); + if (planner_) { planner_->allocate(); } @@ -535,6 +542,10 @@ StaticRuntime::IndividualMetrics StaticRuntime::benchmark_individual_ops( const int main_runs) { TORCH_CHECK(warmup_runs >= 0 && main_runs >= 1); + // See comment on above use of AutoNonVariableTypeMode for + // explanation. + at::AutoNonVariableTypeMode non_var_type_mode(true); + IndividualMetrics results; results.total_time = 0.0; results.time_per_node.resize(nodes_.size(), 0); From 59e822026c4ddbbdc26266f0fccdd14393098c42 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Thu, 10 Dec 2020 21:49:53 -0800 Subject: [PATCH 164/250] Add manual_cpp_binding to native_functions.yaml (#49092) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49092 Functions which specify manual_cpp_binding don't automatically get C++ bindings generated for them in TensorBody.h or Functions.h. This lets end users manually define the bindings themselves, which may be helpful if there is a way to short circuit the dispatcher entirely. contiguous() is switched to use this mechanism. Although manual_cpp_binding suggests that we don't generate the binding at all, it is often the case that there is some "fast path", but when this path is not satisfied, we should go back to the slow dispatch. So we still generate a fallback method/function which the user-defined binding can call into in case that we have to go slowpath. The correctness conditions for bindings manually written in this way are subtle. 
Here are the ones I can think of off the top of my head: - Whatever condition is tested in the C++ body, must ALSO be tested again in the native:: implementation on the other side of the dispatcher. This is because you are NOT GUARANTEED to hit the native:: implementation through the C++ binding, you may go straight to the implementation via a boxed call. - If a binding is written in this way, it is only safe to skip dispatch if you would have returned the same tensor as before. In any situation you would return a fresh tensor, you MUST go to the slow path, because you need to actually get to the autograd kernel. Signed-off-by: Edward Z. Yang Test Plan: Imported from OSS Reviewed By: bhosmer Differential Revision: D25428440 Pulled By: swolchok fbshipit-source-id: 6e71767cb8d1086d56cd827c1d2d56cac8f6f5fe --- aten/src/ATen/native/native_functions.yaml | 1 + aten/src/ATen/templates/TensorBody.h | 8 ++++++++ tools/codegen/api/types.py | 13 +++++++++---- tools/codegen/gen.py | 6 +++--- tools/codegen/model.py | 12 +++++++++++- 5 files changed, 32 insertions(+), 8 deletions(-) diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index acb424fcc28a..a5e0e9e1e2f4 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -1119,6 +1119,7 @@ - func: contiguous(Tensor(a) self, *, MemoryFormat memory_format=contiguous_format) -> Tensor(a) use_c10_dispatcher: full variants: method + manual_cpp_binding: True - func: convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups) -> Tensor use_c10_dispatcher: hacky_wrapper_for_legacy_signatures diff --git a/aten/src/ATen/templates/TensorBody.h b/aten/src/ATen/templates/TensorBody.h index e79957e5ca6e..850856b335ac 100644 --- a/aten/src/ATen/templates/TensorBody.h +++ b/aten/src/ATen/templates/TensorBody.h @@ -115,6 +115,14 @@ class CAFFE2_API Tensor { return impl_->storage_offset(); } + Tensor contiguous(MemoryFormat memory_format=MemoryFormat::Contiguous) const { + if (is_contiguous(memory_format)) { + return *this; + } else { + return __dispatch_contiguous(memory_format); + } + } + TensorImpl * unsafeGetTensorImpl() const { return impl_.get(); } diff --git a/tools/codegen/api/types.py b/tools/codegen/api/types.py index 347c4876a73a..55a6e4abc52c 100644 --- a/tools/codegen/api/types.py +++ b/tools/codegen/api/types.py @@ -157,6 +157,8 @@ class CppSignature: # (i.e. with a potential TensorOptions argument and out arguments in the front) faithful: bool + fallback_binding: bool = False + # Return the unpacked argument structure of this signature, # discarding information about which arguments are semantically # related to each other. 
@@ -190,7 +192,10 @@ def argument_packs(self) -> Sequence[CppArgumentPack]: return argument_packs def name(self) -> str: - return cpp.name(self.func, faithful_name_for_out_overloads=self.faithful) + n = cpp.name(self.func, faithful_name_for_out_overloads=self.faithful) + if self.fallback_binding: + n = f"__dispatch_{n}" + return n # Render the C++ declaration for this signature def decl(self) -> str: @@ -218,13 +223,13 @@ class CppSignatureGroup: faithful_signature: Optional[CppSignature] @staticmethod - def from_schema(func: FunctionSchema, *, method: bool) -> 'CppSignatureGroup': + def from_schema(func: FunctionSchema, *, method: bool, fallback_binding: bool = False) -> 'CppSignatureGroup': faithful_signature: Optional[CppSignature] if func.arguments.tensor_options is not None or len(func.arguments.out) > 0: - faithful_signature = CppSignature(func=func, faithful=True, method=method) + faithful_signature = CppSignature(func=func, faithful=True, method=method, fallback_binding=fallback_binding) else: faithful_signature = None - signature = CppSignature(func=func, faithful=False, method=method) + signature = CppSignature(func=func, faithful=False, method=method, fallback_binding=fallback_binding) return CppSignatureGroup( func=func, signature=signature, diff --git a/tools/codegen/gen.py b/tools/codegen/gen.py index 14bbadce1570..025df25b3577 100644 --- a/tools/codegen/gen.py +++ b/tools/codegen/gen.py @@ -573,7 +573,7 @@ def __call__(self, f: NativeFunction) -> Optional[str]: name = cpp.name(f.func) - sig_group = CppSignatureGroup.from_schema(f.func, method=False) + sig_group = CppSignatureGroup.from_schema(f.func, method=False, fallback_binding=f.manual_cpp_binding) if self.target is Target.DECLARATION: result = f"CAFFE2_API {sig_group.signature.decl()};\n" @@ -627,7 +627,7 @@ def __call__(self, f: NativeFunction) -> Optional[str]: name = cpp.name(f.func) - sig_group = CppSignatureGroup.from_schema(f.func, method=True) + sig_group = CppSignatureGroup.from_schema(f.func, method=True, fallback_binding=f.manual_cpp_binding) if self.target is Target.DECLARATION: result = f"{sig_group.signature.decl()} const;\n" @@ -998,7 +998,7 @@ def compute_declaration_yaml(f: NativeFunction) -> object: kwarg_only_set = set(a.name for a in f.func.arguments.flat_kwarg_only) out_arg_set = set(a.name for a in f.func.arguments.out) - sig_group = CppSignatureGroup.from_schema(f.func, method=False) + sig_group = CppSignatureGroup.from_schema(f.func, method=False, fallback_binding=False) cpp_args = sig_group.signature.arguments() arguments = [ compute_cpp_argument_yaml( diff --git a/tools/codegen/model.py b/tools/codegen/model.py index c0fc94570d94..0a2689860a17 100644 --- a/tools/codegen/model.py +++ b/tools/codegen/model.py @@ -98,6 +98,12 @@ class NativeFunction: # registrations don't participate in codegen-based selective build! manual_kernel_registration: bool + # Whether or not to skip generating TensorMethod/Functions bindings + # for this kernel. Technically, this doesn't actually skip generating + # the binding; instead, the binding gets generated to __dispatch_{funcname} + # so you can make use of the normal binding if you need it. + manual_cpp_binding: bool + # A mapping of dispatch keys to names of functions implementing # them. 
In native_functions.yaml, the dispatch entry is optional; in that # case, that is equivalent to having written: @@ -188,6 +194,9 @@ def from_yaml(ei: Dict[str, object], loc: 'Location') -> 'NativeFunction': manual_kernel_registration = e.pop('manual_kernel_registration', False) assert isinstance(manual_kernel_registration, bool), f'not a bool: {manual_kernel_registration}' + manual_cpp_binding = e.pop('manual_cpp_binding', False) + assert isinstance(manual_cpp_binding, bool), f'not a bool: {manual_cpp_binding}' + device_guard = e.pop('device_guard', True) assert isinstance(device_guard, bool), f'not a bool: {device_guard}' @@ -240,6 +249,7 @@ def from_yaml(ei: Dict[str, object], loc: 'Location') -> 'NativeFunction': structured_delegate=structured_delegate, structured_inherits=structured_inherits, manual_kernel_registration=manual_kernel_registration, + manual_cpp_binding=manual_cpp_binding, python_module=python_module, category_override=category_override, dispatch=dispatch, @@ -1019,7 +1029,7 @@ def parse(args: str) -> 'Arguments': Input: 'int x, int y, int z' """ - # We do this in two phases. First we parse into three + # We do this in two phases. First we parse into three # main categories: positional, kwarg_only, out. # Then, we reparse positional and kwarg_only to separate # out the self argument and tensor options arguments. From da6f249a10c9c6e36d1f95aa283381efac932682 Mon Sep 17 00:00:00 2001 From: Hao Lu Date: Thu, 10 Dec 2020 21:51:38 -0800 Subject: [PATCH 165/250] [caffe2] DeserializeToNDArray (#49135) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49135 Differential Revision: D25417845 fbshipit-source-id: 4d8efd440bc2577fb717f911a401e7b81d48b907 --- caffe2/core/blob_serialization.cc | 19 ++++--------------- caffe2/core/blob_serialization.h | 19 +++++++++++++++++++ 2 files changed, 23 insertions(+), 15 deletions(-) diff --git a/caffe2/core/blob_serialization.cc b/caffe2/core/blob_serialization.cc index dd422c5b44cc..fcf08eebfa8a 100644 --- a/caffe2/core/blob_serialization.cc +++ b/caffe2/core/blob_serialization.cc @@ -26,17 +26,6 @@ C10_DEFINE_bool( false, "Serialize BOOL, UINT8, INT8, UINT16, INT16, INT64, FLOAT16 tensors using byte_data field instead of int32"); -#ifdef _MSC_VER -// It's MSVC, so we just have to guess ... and allow an override -#ifdef FOLLY_ENDIAN_BE -constexpr auto kIsLittleEndian = false; -#else -constexpr auto kIsLittleEndian = true; -#endif -#else -constexpr auto kIsLittleEndian = __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__; -#endif - namespace caffe2 { /** * @brief StringSerializer is the serializer for String. 
@@ -420,7 +409,7 @@ void DeserializeBlob(const BlobProto& blob_proto, Blob* result) { // === Local helper functions === // Get dimensions from Tensor proto -static std::vector DimsFromTensorProto(const TensorProto& proto) { +std::vector DimsFromTensorProto(const TensorProto& proto) { std::vector dims; dims.reserve(proto.dims().size()); for (const int64_t d : proto.dims()) { @@ -430,7 +419,7 @@ static std::vector DimsFromTensorProto(const TensorProto& proto) { } // Get number of elements from Tensor proto -static int64_t NumelFromTensorProto(const TensorProto& tensor_proto) { +int64_t NumelFromTensorProto(const TensorProto& tensor_proto) { int64_t numel = 1; for (const int64_t d : tensor_proto.dims()) { numel *= d; @@ -439,7 +428,7 @@ static int64_t NumelFromTensorProto(const TensorProto& tensor_proto) { } // Get data type from Tensor proto -static TypeMeta GetDataType(const TensorProto& tensor_proto) { +TypeMeta GetDataType(const TensorProto& tensor_proto) { TypeMeta dtype; if (tensor_proto.data_type() != TensorProto_DataType_UNDEFINED) { dtype = DataTypeToTypeMeta(tensor_proto.data_type()); @@ -459,7 +448,7 @@ static at::TensorOptions TensorOptionsFromProto( .device(OptionToDevice(tensor_proto.device_detail())); } -static std::unique_ptr ContextFromProto( +std::unique_ptr ContextFromProto( const TensorProto& tensor_proto) { auto device = OptionToDevice(tensor_proto.device_detail()); return CreateContext(device); diff --git a/caffe2/core/blob_serialization.h b/caffe2/core/blob_serialization.h index 5309314af0c7..72d148c86775 100644 --- a/caffe2/core/blob_serialization.h +++ b/caffe2/core/blob_serialization.h @@ -17,6 +17,17 @@ C10_DECLARE_int(caffe2_tensor_chunk_size); C10_DECLARE_int(caffe2_max_tensor_serializer_threads); C10_DECLARE_bool(caffe2_serialize_fp16_as_bytes); +#ifdef _MSC_VER +// It's MSVC, so we just have to guess ... and allow an override +#ifdef FOLLY_ENDIAN_BE +constexpr auto kIsLittleEndian = false; +#else +constexpr auto kIsLittleEndian = true; +#endif +#else +constexpr auto kIsLittleEndian = __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__; +#endif + namespace caffe2 { constexpr auto kTensorBlobType = "Tensor"; @@ -239,6 +250,14 @@ inline std::string SerializeBlobProtoAsString_EnforceCheck( return SerializeAsString_EnforceCheck(blob, blob.name().c_str()); } +int64_t NumelFromTensorProto(const TensorProto& tensor_proto); + +std::vector DimsFromTensorProto(const TensorProto& proto); + +TypeMeta GetDataType(const TensorProto& tensor_proto); + +std::unique_ptr ContextFromProto(const TensorProto& tensor_proto); + } // namespace caffe2 #endif // CAFFE2_CORE_BLOB_SERIALIZATION_H_ From 56a157fc797f54f0b8d7322c0628d32a38712316 Mon Sep 17 00:00:00 2001 From: Sebastian Messmer Date: Thu, 10 Dec 2020 23:27:01 -0800 Subject: [PATCH 166/250] hacky_wrapper_for_legacy_signatures reorders out arguments (#48911) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48911 This enables us to use hacky_wrapper_for_legacy_signatures for ops with out arguments so they can use templated unboxing logic without having to be rewritten. This only actually enables it for one op as a proof of concept. There will be a separate PR enabling it for more ops. 
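To make the reordering concrete, here is a minimal, standalone sketch of the idea (illustration only, not code from this patch: the names `kernel`, `call_permuted`, `schema_order` and the plain `int` parameters are invented, and the permutation is hard-coded instead of being computed from NumOutParameters as the generated wrapper does):

  #include <cstdio>
  #include <tuple>
  #include <utility>

  // Stand-in for a kernel that takes its two out parameters first,
  // mirroring the shape of the aten::example signature described in the
  // header comments below (3 regular arguments + 2 out arguments).
  void kernel(int& out_d, int& out_e, int a, int b, int c) {
    out_d = a + b;
    out_e = b * c;
  }

  // Forward a tuple of schema-order arguments to `kernel`, reordered by Is...
  template <std::size_t... Is, class Tuple>
  void call_permuted(std::index_sequence<Is...>, Tuple&& t) {
    kernel(std::get<Is>(std::forward<Tuple>(t))...);
  }

  // Schema-order entry point: out arguments at the back. Kernel argument i
  // is taken from schema position {3, 4, 0, 1, 2}[i].
  void schema_order(int a, int b, int c, int& out_d, int& out_e) {
    call_permuted(std::index_sequence<3, 4, 0, 1, 2>{},
                  std::forward_as_tuple(a, b, c, out_d, out_e));
  }

  int main() {
    int d = 0, e = 0;
    schema_order(1, 2, 3, d, e);
    std::printf("%d %d\n", d, e);  // prints "3 6"
  }

The wrapper added below does the same thing generically: it derives the kernel-to-schema permutation at compile time from NumOutParameters and the kernel's parameter list instead of spelling it out by hand.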
ghstack-source-id: 118379659 Test Plan: waitforsandcastle Reviewed By: bhosmer Differential Revision: D25363336 fbshipit-source-id: da075d2cc58814f886a25d52652511dbbe990cec --- .../hacky_wrapper_for_legacy_signatures.h | 104 +++++++++++++++++- aten/src/ATen/native/native_functions.yaml | 1 + c10/test/util/Metaprogramming_test.cpp | 18 +++ c10/util/Metaprogramming.h | 25 +++++ tools/codegen/gen.py | 23 +++- 5 files changed, 163 insertions(+), 8 deletions(-) diff --git a/aten/src/ATen/core/op_registration/hacky_wrapper_for_legacy_signatures.h b/aten/src/ATen/core/op_registration/hacky_wrapper_for_legacy_signatures.h index ea7a5bd0b54c..50c90937548f 100644 --- a/aten/src/ATen/core/op_registration/hacky_wrapper_for_legacy_signatures.h +++ b/aten/src/ATen/core/op_registration/hacky_wrapper_for_legacy_signatures.h @@ -207,12 +207,110 @@ constexpr auto with_explicit_optional_tensors(KernelFunc kernel_func) { return kernel_func; } +template constexpr bool is_out_argument_() { + return std::is_same::value; } +template using is_out_argument = guts::bool_constant()>; -template +template +struct with_out_arguments_reordered_impl final { +private: + // For an example op + // > aten::example(Tensor a, int64_t b, int64_t c, Tensor(a!) out_d, Tensor(b!) out_e) -> (Tensor(a!), Tensor(b!)) + // we get a KernelFunc + // > KernelFunc = std::tuple example(Tensor& out_d, Tensor& out_e, const Tensor& a, int64_t b, int64_t c) + // > NumOutParameters = 2 + // with the out arguments at the front, and reorder that into + // > std::tuple example(const Tensor& a, int64_t b, int64_t c, Tensor& out_d, Tensor& out_e) + // where the out arguments are in the back. + + using kernel_signature_traits = guts::infer_function_traits_t; + + // Assert that the KernelFunc is what we expect. The following block is + // not strictly necessary for the metaprogramming here, it's just a check. + static_assert( + guts::typelist::all< + is_out_argument, + guts::typelist::take_t< + typename kernel_signature_traits::parameter_types, + NumOutParameters + > + >::value, + "The kernel function has the wrong number of leading Tensor& arguments to match the out arguments in the JIT signature" + ); + + static constexpr size_t num_parameters = kernel_signature_traits::number_of_parameters; + static constexpr size_t num_nonout_parameters = num_parameters - NumOutParameters; + + // kernel_to_schema_permutation_indices contains a mapping from argument index in KernelFunc to the corresponding + // argument index in the schema. + // For the aten::example op, that'll be + // > kernel_to_schema_permutation_indices = [3, 4, 0, 1, 2] + // Interpreted as a mapping, this means + // - argument 0 in KernelFunc maps to argument 3 in the schema, + // - argument 1 in KernelFunc maps to argument 4 in the schema, + // - argument 2 in KernelFunc maps to argument 0 in the schema, + // - ... + // We can use this as a permutation function to reorder types or values correspondingly + using kernel_to_schema_permutation_indices = guts::concat_iseq_t< + guts::make_offset_index_sequence, + std::make_index_sequence + >; + + // For types, we need the inverse permutation because parameters (i.e. types) and arguments (i.e. values) + // need to be mapped in inverted directions. For types, we generate the schema order types from + // the KernelFunction types, but for arguments we get schema order arguments and need to generate + // the KernelFunction arguments. + // That's why in this reordering, we use NumOutParameters instead of the num_nonout_parameters we used above. 
+ using target_signature_parameters = guts::typelist::concat_t< + guts::typelist::drop_t, + guts::typelist::take_t + >; + + template + struct wrapper_; + template + struct wrapper_, std::index_sequence> { + static Return call(Parameters... args) { + // call through to KernelFunc but reorder arguments as determined + // by the permutation we calculated above. + return (*KernelFunc::func_ptr())(std::get(std::tuple(std::forward(args)...))...); + } + }; + +public: + using wrapper = wrapper_; +}; + + +/** + * Take a kernel function that has a number of `Tensor`, `const Tensor&` or `Tensor&` arguments + * where all `Tensor&` arguments are at the beginning, and take NumOutParameters. + * Create a wrapper function that has `NumOutParameters` `Tensor&` arguments at the end + * and calls through the underlying kernel function by reordering them to the front. + */ +template 0), int> = 0> +constexpr auto with_out_arguments_reordered(KernelFunc kernel_func) { + // SFINAE case for kernels that have out tensor arguments. + // Wrap them and reorder the arguments. + using impl = with_out_arguments_reordered_impl; + return TORCH_FN((&impl::wrapper::call)); +} + +template = 0> +constexpr auto with_out_arguments_reordered(KernelFunc kernel_func) { + // SFINAE case for kernels that don't have out tensor arguments. + // Don't wrap them but just use the kernel directly. + return kernel_func; +} + +} + +template constexpr auto hacky_wrapper_for_legacy_signatures(FuncPtr kernel_func) { - auto with_tensoroptions_scattered = detail::with_scattered_tensor_options(kernel_func); - auto result = detail::with_explicit_optional_tensors(with_tensoroptions_scattered); + auto with_scattered_tensor_options = detail::with_scattered_tensor_options(kernel_func); + auto with_out_arguments_reordered = detail::with_out_arguments_reordered(with_scattered_tensor_options); + auto result = detail::with_explicit_optional_tensors(with_out_arguments_reordered); static_assert(std::is_same::value, "Generated signature doesn't match the expected one."); return result; }; diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index a5e0e9e1e2f4..43511e16b054 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -230,6 +230,7 @@ DefaultBackend: abs_ - func: abs.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
+ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures dispatch: CPU, CUDA: abs_out diff --git a/c10/test/util/Metaprogramming_test.cpp b/c10/test/util/Metaprogramming_test.cpp index 88c8e0facad1..63613980079d 100644 --- a/c10/test/util/Metaprogramming_test.cpp +++ b/c10/test/util/Metaprogramming_test.cpp @@ -476,4 +476,22 @@ namespace test_tuple_concat { } } +namespace test_concat_iseq { + using std::index_sequence; + using std::integer_sequence; + static_assert(std::is_same, concat_iseq_t<>>::value, ""); + static_assert(std::is_same, concat_iseq_t>>::value, ""); + static_assert(std::is_same, concat_iseq_t, index_sequence<>>>::value, ""); + static_assert(std::is_same, concat_iseq_t>>::value, ""); + static_assert(std::is_same, concat_iseq_t, index_sequence<>>>::value, ""); + static_assert(std::is_same, concat_iseq_t, index_sequence<4>>>::value, ""); + static_assert(std::is_same, concat_iseq_t, index_sequence<4>, index_sequence<>>>::value, ""); + static_assert(std::is_same, concat_iseq_t, index_sequence<2>>>::value, ""); + static_assert(std::is_same, concat_iseq_t, index_sequence<4, 2>, index_sequence<>>>::value, ""); + static_assert(std::is_same, concat_iseq_t, index_sequence<4, 2>, index_sequence<9>>>::value, ""); + + static_assert(std::is_same, concat_iseq_t, integer_sequence>>::value, ""); +} + + } diff --git a/c10/util/Metaprogramming.h b/c10/util/Metaprogramming.h index ae929a93ca09..a56b43afa852 100644 --- a/c10/util/Metaprogramming.h +++ b/c10/util/Metaprogramming.h @@ -309,4 +309,29 @@ template } +/** + * Concatenate multiple integer sequences + * Example: + * concat_iseq_t, std::index_sequence<4, 2>, std::index_sequence<5>> + * == std::index_sequence<2, 5, 3, 4, 2, 5> + */ +template struct concat_iseq { + static_assert(false_t::value, "In concat_iseq, the T arguments each must be std::integer_sequence<...> with the same IntType."); +}; +template<> +struct concat_iseq<> { + using type = std::index_sequence<>; +}; +template +struct concat_iseq> { + using type = std::integer_sequence; +}; +template +struct concat_iseq, std::integer_sequence, TailISeqs...> { + using type = typename concat_iseq, TailISeqs...>::type; +}; +template +using concat_iseq_t = typename concat_iseq::type; + + }} diff --git a/tools/codegen/gen.py b/tools/codegen/gen.py index 025df25b3577..af3ebbf674f4 100644 --- a/tools/codegen/gen.py +++ b/tools/codegen/gen.py @@ -451,10 +451,20 @@ def gen_one(f: NativeFunction) -> Optional[str]: """ elif self.target is Target.REGISTRATION: + dispatcher_sig = DispatcherSignature.from_schema(f.func) + if local.use_c10_dispatcher() is UseC10Dispatcher.full: - payload = f'TORCH_FN({sig.name()})' + payload = f"TORCH_FN({sig.name()})" + elif local.use_c10_dispatcher() is UseC10Dispatcher.hacky_wrapper_for_legacy_signatures: + payload = f""" +c10::impl::hacky_wrapper_for_legacy_signatures< + {dispatcher_sig.type()}, + {len(f.func.arguments.out)} +>(TORCH_FN({sig.name()})) +""" else: - payload = f'torch::CppFunction::makeUnboxedOnly({sig.name()})' + assert local.use_c10_dispatcher() is UseC10Dispatcher.with_codegenerated_unboxing_wrapper + payload = f"torch::CppFunction::makeUnboxedOnly(&{sig.name()})" return f'm.impl("{f.func.name}", {payload});' else: assert_never(self.target) @@ -546,9 +556,12 @@ def gen_unstructured(self, f: NativeFunction) -> Optional[str]: if local.use_c10_dispatcher() is UseC10Dispatcher.full: payload = f"TORCH_FN({name})" elif local.use_c10_dispatcher() is UseC10Dispatcher.hacky_wrapper_for_legacy_signatures: - payload = 
"c10::impl::hacky_wrapper_for_legacy_signatures<" \ - f"{dispatcher_sig.type()}>(TORCH_FN({name}))" - + payload = f""" +c10::impl::hacky_wrapper_for_legacy_signatures< + {dispatcher_sig.type()}, + {len(f.func.arguments.out)} +>(TORCH_FN({name})) +""" else: assert local.use_c10_dispatcher() is UseC10Dispatcher.with_codegenerated_unboxing_wrapper payload = f"torch::CppFunction::makeUnboxedOnly(&{name})" From fce059d4ff6a53a268ce7335aa710cff9ec25ea9 Mon Sep 17 00:00:00 2001 From: Bert Maher Date: Thu, 10 Dec 2020 23:35:57 -0800 Subject: [PATCH 167/250] [te] Don't throw when re-registering a CodeGen factory (#49174) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49174 We've seen this happening when libtorch is loaded repeatedly on macOS. Tbh I'm not sure I understand why this happens; why do we re-construct these static objects but re-use the static registry itself? But it's fairly straightforward to just overwrite the factory method and no harm in doing so. ghstack-source-id: 118306581 Test Plan: compile Reviewed By: ZolotukhinM Differential Revision: D25466642 fbshipit-source-id: 4c456a57407f23fa0c9f4e74975ed1186e790c74 --- torch/csrc/jit/tensorexpr/codegen.cpp | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/torch/csrc/jit/tensorexpr/codegen.cpp b/torch/csrc/jit/tensorexpr/codegen.cpp index b8f16c50e05f..7f1f09032555 100644 --- a/torch/csrc/jit/tensorexpr/codegen.cpp +++ b/torch/csrc/jit/tensorexpr/codegen.cpp @@ -30,11 +30,7 @@ RegisterCodeGenList::StmtFactoryMethod RegisterCodeGenList:: void RegisterCodeGenList::AddStmtFactoryMethod( const std::string& name, const StmtFactoryMethod& stmt_factory_method) { - auto insert_ret = - stmt_factory_methods_.insert(std::make_pair(name, stmt_factory_method)); - if (!insert_ret.second) { - throw std::runtime_error("Duplicated CodeGen names: " + name); - } + stmt_factory_methods_[name] = stmt_factory_method; } std::unique_ptr CreateCodeGen( From 8669f02573b0e6a60c3f3fd2200e08e828ab2f20 Mon Sep 17 00:00:00 2001 From: Ailing Zhang Date: Thu, 10 Dec 2020 23:36:53 -0800 Subject: [PATCH 168/250] Saves a copy of vector in view ops returning TensorList. (#49149) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49149 Test Plan: Imported from OSS Reviewed By: pbelevich Differential Revision: D25480104 Pulled By: ailzhang fbshipit-source-id: 749345164662b15ec56b7b85a64011929e90c0b2 --- tools/autograd/gen_variable_type.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index 5431980a51c3..123b180f1774 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -703,8 +703,9 @@ def wrap_output(return_values, var): creation_meta = "CreationMeta::MULTI_OUTPUT_SAFE" else: creation_meta = "CreationMeta::MULTI_OUTPUT_NODE" - rhs_value = ("as_view(/* base */ {}, /* output */ {}, /* is_differentiable */ true, " - "/* creation_meta */ {})").format(view_info, var, creation_meta) + call += ("as_view(/* base */ {}, /* output */ {}, /* is_differentiable */ true, " + "/* creation_meta */ {});\n").format(view_info, var, creation_meta) + rhs_value = 'std::move({})'.format(var) else: call += emit_view_lambda() creation_meta = "GradMode::is_enabled() ? 
CreationMeta::DEFAULT: CreationMeta::NO_GRAD_MODE" From 2b1057b0cf2eaea633a90b2e130612d8f93eeb2b Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Thu, 10 Dec 2020 23:45:22 -0800 Subject: [PATCH 169/250] [RPC Framework] Support retrieving the RRef to the remote module (#48983) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48983 Expose an API for users to retrieve the RRef for the underlying module. This would be useful if users would like to run custom code on the remote end for the nn.Module. Original PR issue: RemoteModule enhancements #40550 ghstack-source-id: 118378601 Test Plan: buck test mode/dev-nosan caffe2/test/distributed/rpc:process_group_agent -- RemoteModule Reviewed By: pritamdamania87 Differential Revision: D25386042 fbshipit-source-id: 2dff33e8d5c9770be464eacf0b26c3e82f49a943 --- torch/distributed/nn/api/remote_module.py | 6 +++++- .../distributed/nn/api/remote_module_test.py | 15 +++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/torch/distributed/nn/api/remote_module.py b/torch/distributed/nn/api/remote_module.py index 3ee7e9b2a4b0..678bbf6a96de 100644 --- a/torch/distributed/nn/api/remote_module.py +++ b/torch/distributed/nn/api/remote_module.py @@ -18,9 +18,9 @@ from torch import Tensor, device, dtype, nn from torch.distributed.nn.jit import instantiator from torch.distributed.rpc.utils import _parse_remote_device +from torch.nn import Module from torch.nn.parameter import Parameter from torch.utils.hooks import RemovableHandle -from torch.nn import Module _grad_t = Union[Tuple[Tensor, ...], Tensor] @@ -209,6 +209,10 @@ def remote_parameters(self, recurse: bool = True) -> List[rpc.RRef[Parameter]]: """ return rpc.rpc_sync(self.on, _param_rrefs, args=(self.module_rref, recurse)) + def get_module_rref(self) -> rpc.RRef[nn.Module]: + """Returns the RRef to remote module.""" + return self.module_rref + def register_buffer( self, name: str, tensor: Optional[Tensor], persistent: bool = True ) -> None: diff --git a/torch/testing/_internal/distributed/nn/api/remote_module_test.py b/torch/testing/_internal/distributed/nn/api/remote_module_test.py index d6b3d816fe68..376fdb8049b9 100644 --- a/torch/testing/_internal/distributed/nn/api/remote_module_test.py +++ b/torch/testing/_internal/distributed/nn/api/remote_module_test.py @@ -217,6 +217,21 @@ def test_remote_parameters(self): self.assertEqual(len(param_rrefs), 1) self.assertTrue(torch.equal(param_rrefs[0].to_here(), _PARAM_VAL)) + @dist_utils.dist_init + def test_get_module_rref(self): + if self.rank != 0: + return + dst_worker_name = dist_utils.worker_name((self.rank + 1) % self.world_size) + + # Only test Python nn.Module, because script module methods don't support ``get_module_rref``. 
+ for remote_module in self._create_remote_module_iter( + dst_worker_name, modes=[ModuleCreationMode.MODULE_CTOR] + ): + rref = remote_module.get_module_rref() + self.assertEqual(rref, remote_module.module_rref) + for param in rref.to_here().parameters(): + self.assertTrue(torch.equal(param, _PARAM_VAL)) + @skip_if_lt_x_gpu(1) @dist_utils.dist_init def test_valid_device(self): From 5ab90b2fda9a94e45c26be6e567e8d3fbf0e6dfa Mon Sep 17 00:00:00 2001 From: Luca Wehrstedt Date: Fri, 11 Dec 2020 03:33:58 -0800 Subject: [PATCH 170/250] Make CUDAFuture remember and restore current device in callback (#48789) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48789 CUDAFuture aims to "capture" the current state of CUDA-related stuff when the future is marked complete (e.g., by looking at current streams and recording events on them) and then "replicate" a similar state when users synchronize with the result of the future (by synchronizing the current streams with these events). However, one "contextual" aspect of CUDA that we weren't capturing/replicating was the current device. This diff tries to fix that. I must mention that we can only do this for callbacks, while we cannot do it for the wait() method. I don't know if such a discrepancy between the two actually makes the overall behavior _worse_. I'd love to hear people's opinions on this. ghstack-source-id: 118081338 Test Plan: Unit tests Reviewed By: mrshenli Differential Revision: D25210335 fbshipit-source-id: 1d1a3f80b1cc42e5114bc88554ed50617f1aaa90 --- aten/src/ATen/cuda/CUDAFuture.h | 8 ++++++++ torch/lib/c10d/ProcessGroupNCCL.hpp | 1 + 2 files changed, 9 insertions(+) diff --git a/aten/src/ATen/cuda/CUDAFuture.h b/aten/src/ATen/cuda/CUDAFuture.h index 78499f7fc026..4334101478f1 100644 --- a/aten/src/ATen/cuda/CUDAFuture.h +++ b/aten/src/ATen/cuda/CUDAFuture.h @@ -44,6 +44,8 @@ struct TORCH_CUDA_API CUDAFuture : at::ivalue::Future { } void postMarkCompletedHook(const at::IValue& value) override { + currentDevice_ = c10::cuda::current_device(); + // Extract them once and cache them for later uses. dataPtrs_ = extractDataPtrs(value); @@ -98,6 +100,8 @@ struct TORCH_CUDA_API CUDAFuture : at::ivalue::Future { } } + c10::cuda::CUDAGuard deviceGuard(currentDevice_); + callback(); }; } @@ -122,6 +126,10 @@ struct TORCH_CUDA_API CUDAFuture : at::ivalue::Future { // Once WorkNCCL is gone (as part of the Future and Work merge) this should be // fixed. protected: + // The device that was current when markCompleted was called, which we'll + // restore when invoking callbacks. + c10::DeviceIndex currentDevice_; + // The events that correspond to the completion of the async I/O kernels. They // are recorded on the appropriate streams when the future is marked completed // and can then be queried/waited/blocked on. 
There is one event for each diff --git a/torch/lib/c10d/ProcessGroupNCCL.hpp b/torch/lib/c10d/ProcessGroupNCCL.hpp index 45c82140c484..adbfec445549 100644 --- a/torch/lib/c10d/ProcessGroupNCCL.hpp +++ b/torch/lib/c10d/ProcessGroupNCCL.hpp @@ -234,6 +234,7 @@ class ProcessGroupNCCL : public ProcessGroup { return ev.device_index() == data_ptr.device().index(); }) != cudaEvents->end()); } + currentDevice_ = c10::cuda::current_device(); cudaEvents_ = std::move(cudaEvents); dataPtrs_ = std::move(dataPtrs); markCompleted(std::move(value)); From 2f1d1eb7df5e8032392b73751c84025a2aa3d1ee Mon Sep 17 00:00:00 2001 From: Heitor Schueroff Date: Fri, 11 Dec 2020 07:48:32 -0800 Subject: [PATCH 171/250] Revert D25428587: [pytorch][PR] add additional interpolation modes for torch.quantile Test Plan: revert-hammer Differential Revision: D25428587 (https://github.com/pytorch/pytorch/commit/25a8397bf3760d60bd754517d590f15cbd041e25) Original commit changeset: e98d24f6a651 fbshipit-source-id: fb217b8a19e853e83779a4edd312be86b26eb26d --- aten/src/ATen/native/Sorting.cpp | 73 +++++-------------- aten/src/ATen/native/native_functions.yaml | 16 ++-- .../check_backward_compatibility.py | 2 - test/test_reductions.py | 13 +--- torch/_torch_docs.py | 38 ++-------- .../_internal/common_methods_invocations.py | 8 +- 6 files changed, 38 insertions(+), 112 deletions(-) diff --git a/aten/src/ATen/native/Sorting.cpp b/aten/src/ATen/native/Sorting.cpp index 58bad4ab3bdd..e365d48fdffe 100644 --- a/aten/src/ATen/native/Sorting.cpp +++ b/aten/src/ATen/native/Sorting.cpp @@ -91,25 +91,11 @@ void quick_select_template( } while (true); } -void copy_quantile_result(Tensor &out, - const Tensor &result, - const Tensor &q) { - if (q.dim() == 0) { - // If q is scalar, remove last dim to match out shape - result.squeeze_(-1); - } else { - // Move quantiles to first dim to match out shape - result.unsqueeze_(0).transpose_(0, -1).squeeze_(-1); - } - out.copy_(result); -} - void quantile_impl( Tensor& out, const Tensor& self, const Tensor& q, optional _dim, - const std::string interpolation, bool keepdim, bool ignore_nan) { int64_t dim = at::maybe_wrap_dim(_dim.value_or(0), self.dim(), true); @@ -131,12 +117,6 @@ void quantile_impl( TORCH_CHECK( self.device() == out.device(), "quantile() out tensor must be on the same device as the input tensor"); - std::vector interpolations{ - "linear", "lower", "higher", "midpoint", "nearest"}; - TORCH_CHECK( - std::find(interpolations.begin(), interpolations.end(), interpolation) != interpolations.end(), - "quantile() interpolation should only be ", - c10::Join(", ", interpolations), "."); // Compute output shape: q_size + reduced_size std::vector out_shape; @@ -206,32 +186,23 @@ void quantile_impl( at::broadcast_tensors({q * last_index, sorted.isnan().any(-1, true)}); ranks = at::masked_fill(tl[0], tl[1], last_index); } - - // adjust ranks based on the interpolation mode - if (interpolation == "lower") { - ranks.floor_(); - } else if (interpolation == "higher") { - ranks.ceil_(); - } else if (interpolation == "nearest") { - ranks.round_(); - } - Tensor ranks_below = ranks.toType(kLong); - Tensor values_below = sorted.gather(-1, ranks_below); - if (interpolation != "linear" && interpolation != "midpoint") { - copy_quantile_result(out, values_below, q); - return; - } - - // calculate weights for linear and midpoint - Tensor weights = interpolation == "midpoint" ? 
at::full_like(ranks, 0.5) : ranks - ranks_below; - + Tensor weights = ranks - ranks_below; Tensor ranks_above = ranks.ceil_().toType(kLong); + + Tensor values_below = sorted.gather(-1, ranks_below); Tensor values_above = sorted.gather(-1, ranks_above); // Interpolate to compute quantiles and copy to out tensor values_below.lerp_(values_above, weights); - copy_quantile_result(out, values_below, q); + if (q.dim() == 0) { + // If q is scalar, remove last dim to match out shape + values_below.squeeze_(-1); + } else { + // Move quantiles to first dim to match out shape + values_below.unsqueeze_(0).transpose_(0, -1).squeeze_(-1); + } + out.copy_(values_below); } std::tuple kthvalue_out_impl_cpu( @@ -442,9 +413,8 @@ Tensor& quantile_out( const Tensor& self, const Tensor& q, optional _dim, - const std::string interpolation, bool keepdim) { - quantile_impl(out, self, q, std::move(_dim), interpolation, keepdim, /*ignore_nan=*/false); + quantile_impl(out, self, q, std::move(_dim), keepdim, /*ignore_nan=*/false); return out; } @@ -453,7 +423,6 @@ Tensor& quantile_out( const Tensor& self, double q, optional _dim, - const std::string interpolation, bool keepdim) { TORCH_CHECK( q >= 0 && q <= 1, "quantile() q must be in the range [0, 1] but got ", q); @@ -462,7 +431,6 @@ Tensor& quantile_out( self, at::scalar_tensor(q, self.options()), std::move(_dim), - interpolation, keepdim); } @@ -470,10 +438,9 @@ Tensor quantile( const Tensor& self, const Tensor& q, optional _dim, - const std::string interpolation, bool keepdim) { Tensor out = at::empty({0}, self.options()); - quantile_impl(out, self, q, std::move(_dim), interpolation, keepdim, /*ignore_nan=*/false); + quantile_impl(out, self, q, std::move(_dim), keepdim, /*ignore_nan=*/false); return out; } @@ -481,12 +448,11 @@ Tensor quantile( const Tensor& self, double q, optional _dim, - const std::string interpolation, bool keepdim) { TORCH_CHECK( q >= 0 && q <= 1, "quantile() q must be in the range [0, 1] but got ", q); return at::quantile( - self, at::scalar_tensor(q, self.options()), std::move(_dim), interpolation, keepdim); + self, at::scalar_tensor(q, self.options()), std::move(_dim), keepdim); } Tensor& nanquantile_out( @@ -494,9 +460,8 @@ Tensor& nanquantile_out( const Tensor& self, const Tensor& q, optional _dim, - const std::string interpolation, bool keepdim) { - quantile_impl(out, self, q, std::move(_dim), interpolation, keepdim, /*ignore_nan=*/true); + quantile_impl(out, self, q, std::move(_dim), keepdim, /*ignore_nan=*/true); return out; } @@ -505,7 +470,6 @@ Tensor& nanquantile_out( const Tensor& self, double q, optional _dim, - const std::string interpolation, bool keepdim) { TORCH_CHECK( q >= 0 && q <= 1, "quantile() q must be in the range [0, 1] but got ", q); @@ -514,7 +478,6 @@ Tensor& nanquantile_out( self, at::scalar_tensor(q, self.options()), std::move(_dim), - interpolation, keepdim); } @@ -522,10 +485,9 @@ Tensor nanquantile( const Tensor& self, const Tensor& q, optional _dim, - const std::string interpolation, bool keepdim) { Tensor out = at::empty({0}, self.options()); - quantile_impl(out, self, q, std::move(_dim), interpolation, keepdim, /*ignore_nan=*/true); + quantile_impl(out, self, q, std::move(_dim), keepdim, /*ignore_nan=*/true); return out; } @@ -533,12 +495,11 @@ Tensor nanquantile( const Tensor& self, double q, optional _dim, - const std::string interpolation, bool keepdim) { TORCH_CHECK( q >= 0 && q <= 1, "quantile() q must be in the range [0, 1] but got ", q); return at::nanquantile( - self, at::scalar_tensor(q, 
self.options()), std::move(_dim), interpolation, keepdim); + self, at::scalar_tensor(q, self.options()), std::move(_dim), keepdim); } std::tuple kthvalue_out_cpu( diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 43511e16b054..8a30507203ae 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -6754,27 +6754,27 @@ use_c10_dispatcher: full variants: method, function -- func: quantile.scalar_out(Tensor self, float q, int? dim=None, str interpolation='linear', bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) +- func: quantile.scalar_out(Tensor self, float q, int? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) -- func: quantile.scalar(Tensor self, float q, int? dim=None, str interpolation='linear', bool keepdim=False) -> Tensor +- func: quantile.scalar(Tensor self, float q, int? dim=None, bool keepdim=False) -> Tensor use_c10_dispatcher: full variants: method, function -- func: quantile.out(Tensor self, Tensor q, int? dim=None, str interpolation='linear', bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) +- func: quantile.out(Tensor self, Tensor q, int? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) -- func: quantile(Tensor self, Tensor q, int? dim=None, str interpolation='linear', bool keepdim=False) -> Tensor +- func: quantile(Tensor self, Tensor q, int? dim=None, bool keepdim=False) -> Tensor use_c10_dispatcher: full variants: method, function -- func: nanquantile.scalar_out(Tensor self, float q, int? dim=None, str interpolation='linear', bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) +- func: nanquantile.scalar_out(Tensor self, float q, int? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) -- func: nanquantile.scalar(Tensor self, float q, int? dim=None, str interpolation='linear', bool keepdim=False) -> Tensor +- func: nanquantile.scalar(Tensor self, float q, int? dim=None, bool keepdim=False) -> Tensor use_c10_dispatcher: full variants: method, function -- func: nanquantile.out(Tensor self, Tensor q, int? dim=None, str interpolation='linear', bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) +- func: nanquantile.out(Tensor self, Tensor q, int? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) -- func: nanquantile(Tensor self, Tensor q, int? dim=None, str interpolation='linear', bool keepdim=False) -> Tensor +- func: nanquantile(Tensor self, Tensor q, int? 
dim=None, bool keepdim=False) -> Tensor use_c10_dispatcher: full variants: method, function diff --git a/test/backward_compatibility/check_backward_compatibility.py b/test/backward_compatibility/check_backward_compatibility.py index e155537d7b99..ccb4a6457537 100644 --- a/test/backward_compatibility/check_backward_compatibility.py +++ b/test/backward_compatibility/check_backward_compatibility.py @@ -187,8 +187,6 @@ ("aten::ifft", datetime.date(2021, 1, 31)), ("aten::irfft", datetime.date(2021, 1, 31)), ("aten::rfft", datetime.date(2021, 1, 31)), - ("aten::quantile", datetime.date(2021, 1, 31)), - ("aten::nanquantile", datetime.date(2021, 1, 31)), ] def allow_listed(schema, allow_list): diff --git a/test/test_reductions.py b/test/test_reductions.py index eadd96652d0b..7c877d822142 100644 --- a/test/test_reductions.py +++ b/test/test_reductions.py @@ -1787,17 +1787,14 @@ def test_quantile(self, device, dtype): numpy_op = getattr(np, op) # Compute quantile along every dimension and flattened tensor - interpolations = ('linear', 'lower', 'higher', 'midpoint', 'nearest') - for interpolation, dim in product(interpolations, - [None] + list(range(a.ndim))): - result = torch_op(a, q, dim, interpolation, keepdim) - expected = numpy_op(a.cpu().numpy(), q.cpu().numpy(), dim, - interpolation=interpolation, keepdims=keepdim) + for dim in [None] + list(range(a.ndim)): + result = torch_op(a, q, dim, keepdim) + expected = numpy_op(a.cpu().numpy(), q.cpu().numpy(), dim, keepdims=keepdim) self.assertEqual(result.cpu(), torch.from_numpy(np.array(expected)).type(result.type())) # Test out variation out = torch.empty_like(result) - torch_op(a, q, dim, interpolation, keepdim, out=out) + torch_op(a, q, dim, keepdim, out=out) self.assertEqual(out.cpu(), result.cpu()) def test_quantile_backward(self, device): @@ -1831,8 +1828,6 @@ def check(a, q, args, kwargs, message): check([1.], 1.1, [], {}, r'q must be in the range \[0, 1\] but got 1.1') check([1.], 0.5, [], {'out': torch.empty([], dtype=torch.float64, device=device)}, r'out tensor must be same dtype as the input tensor') - check([1.], [1.], [], {'interpolation': 'random_mode'}, - r"interpolation should only be linear, lower, higher, midpoint, nearest.") if self.device_type == "cpu": check([1.], [0.5, 1.1, -1], [], {}, r'q values must be in the range \[0, 1\]') diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index 6afd68ab2404..d9f7e8018264 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -5182,24 +5182,13 @@ def merge_dicts(*dicts): >>> torch.quantile(a, q) tensor([-0.5446, 0.0700, 0.9214]) -.. function:: quantile(input, q, dim=None, interpolation='linear', keepdim=False, *, out=None) -> Tensor +.. function:: quantile(input, q, dim=None, keepdim=False, *, out=None) -> Tensor -Returns the q-th quantiles of each row of the :attr:`input` tensor -along the dimension :attr:`dim` based on :attr:`interpolation`. -When the desired quantile lies between two data points ``i < j``, -the result is computed based on the :attr:`interpolation` value as described below. -By default, :attr:`interpolation` is ``linear`` and :attr:`dim` is ``None`` resulting in the :attr:`input` tensor +Returns the q-th quantiles of each row of the :attr:`input` tensor along the dimension +:attr:`dim`, doing a linear interpolation when the q-th quantile lies between two +data points. By default, :attr:`dim` is ``None`` resulting in the :attr:`input` tensor being flattened before computation. 
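As a concrete illustration of that default linear interpolation, here is a small standalone sketch (plain C++, independent of the ATen implementation) of computing one quantile of a 1-D input:

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <iostream>
    #include <vector>

    // Sketch: q-th quantile with linear interpolation. With rank = q * (n - 1),
    // i = floor(rank) and j = ceil(rank), the result is
    // values[i] + (values[j] - values[i]) * fraction, where fraction = rank - i.
    double quantile_linear(std::vector<double> values, double q) {
      std::sort(values.begin(), values.end());
      const double rank = q * static_cast<double>(values.size() - 1);
      const std::size_t below = static_cast<std::size_t>(std::floor(rank));
      const std::size_t above = static_cast<std::size_t>(std::ceil(rank));
      const double fraction = rank - static_cast<double>(below);
      return values[below] + (values[above] - values[below]) * fraction;
    }

    int main() {
      std::cout << quantile_linear({0.0, 1.0, 2.0, 3.0}, 0.6) << "\n";  // 1.8
    }

With input [0, 1, 2, 3] and q = 0.6 this yields 1.8, matching the documented torch.quantile behaviour.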
-When the quantile value lies between two data points ``i < j``, -the result is computed according to the given :attr:`interpolation` method as follows: - -- ``linear``: ``i + (j - i) * fraction``, where ``fraction`` is the fractional part of the index surrounded by ``i`` and ``j``. -- ``lower``: ``i``. -- ``higher``: ``j``. -- ``nearest``: ``i`` or ``j``, whichever is nearest. -- ``midpoint``: ``(i + j) / 2``. - If :attr:`keepdim` is ``True``, the output dimensions are of the same size as :attr:`input` except in the dimensions being reduced (:attr:`dim` or all if :attr:`dim` is ``None``) where they have size 1. Otherwise, the dimensions being reduced are squeezed (see :func:`torch.squeeze`). @@ -5210,8 +5199,6 @@ def merge_dicts(*dicts): {input} q (float or Tensor): a scalar or 1D tensor of quantile values in the range [0, 1] {dim} - interpolation (string): interpolation method to use when the desired quantile lies between two data points, - can be ``linear``, ``lower``, ``higher``, ``midpoint`` and ``nearest``. Default is ``linear``. {keepdim} Keyword arguments: @@ -5235,24 +5222,11 @@ def merge_dicts(*dicts): [ 0.9206]]]) >>> torch.quantile(a, q, dim=1, keepdim=True).shape torch.Size([3, 2, 1]) - >>> a = torch.arange(4.) - >>> a - tensor([0., 1., 2., 3.]) - >>> torch.quantile(a, 0.6, interpolation='linear') - tensor(1.8000) - >>> torch.quantile(a, 0.6, interpolation='lower') - tensor(1.) - >>> torch.quantile(a, 0.6, interpolation='higher') - tensor(2.) - >>> torch.quantile(a, 0.6, interpolation='midpoint') - tensor(1.5000) - >>> torch.quantile(a, 0.6, interpolation='nearest') - tensor(2.) """.format(**single_dim_common)) add_docstr(torch.nanquantile, r""" -nanquantile(input, q, dim=None, interpolation='linear', keepdim=False, *, out=None) -> Tensor +nanquantile(input, q, dim=None, keepdim=False, *, out=None) -> Tensor This is a variant of :func:`torch.quantile` that "ignores" ``NaN`` values, computing the quantiles :attr:`q` as if ``NaN`` values in :attr:`input` did @@ -5263,8 +5237,6 @@ def merge_dicts(*dicts): {input} q (float or Tensor): a scalar or 1D tensor of quantile values in the range [0, 1] {dim} - interpolation (string): interpolation method to use when the desired quantile lies between two data points, - can be ``linear``, ``lower``, ``higher``, ``midpoint`` and ``nearest``. Default is ``linear``. 
{keepdim} Keyword arguments: diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 621ff2a12e7e..b88dcaaccb33 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -1272,13 +1272,13 @@ def method_tests(): ('kthvalue', (), (1, 0, True), 'scalar_keepdim_dim', (), [1]), ('quantile', (S, S, S), (0.5,)), ('quantile', (S, S, S), (0.5, 0), 'dim', (), [1]), - ('quantile', (S, S, S), (0.5, None, 'linear', True), 'keepdim'), - ('quantile', (S, S, S), (0.5, 0, 'linear', True), 'keepdim_dim', (), [1]), + ('quantile', (S, S, S), (0.5, None, True), 'keepdim'), + ('quantile', (S, S, S), (0.5, 0, True), 'keepdim_dim', (), [1]), ('quantile', (), (0.5,), 'scalar'), ('nanquantile', (S, S, S), (0.5,)), ('nanquantile', (S, S, S), (0.5, 0), 'dim', (), [1]), - ('nanquantile', (S, S, S), (0.5, None, 'linear', True), 'keepdim'), - ('nanquantile', (S, S, S), (0.5, 0, 'linear', True), 'keepdim_dim', (), [1]), + ('nanquantile', (S, S, S), (0.5, None, True), 'keepdim'), + ('nanquantile', (S, S, S), (0.5, 0, True), 'keepdim_dim', (), [1]), ('nanquantile', (), (0.5,), 'scalar'), ('median', (S, S, S), NO_ARGS), ('median', (S, S, S), (1,), 'dim', (), [0]), From 88b3d3371b668e2e3f89218629d764a8e5868f0c Mon Sep 17 00:00:00 2001 From: Rong Rong Date: Fri, 11 Dec 2020 08:08:40 -0800 Subject: [PATCH 172/250] add additional arm64 checker in cmake files (#48952) Summary: tentatively fixes https://github.com/pytorch/pytorch/issues/48873 Pull Request resolved: https://github.com/pytorch/pytorch/pull/48952 Reviewed By: H-Huang Differential Revision: D25463266 Pulled By: walterddr fbshipit-source-id: 40afefffe8ab98ae7261c770316cb9c25225285f --- aten/src/ATen/native/quantized/cpu/qnnpack/CMakeLists.txt | 2 +- cmake/External/nnpack.cmake | 2 +- third_party/NNPACK | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/CMakeLists.txt b/aten/src/ATen/native/quantized/cpu/qnnpack/CMakeLists.txt index 9f8cb6d9ed09..99bf8ba07074 100644 --- a/aten/src/ATen/native/quantized/cpu/qnnpack/CMakeLists.txt +++ b/aten/src/ATen/native/quantized/cpu/qnnpack/CMakeLists.txt @@ -271,7 +271,7 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "^armv[5-8]" OR IOS_ARCH MATCHES "^armv7") set_property(SOURCE ${PYTORCH_QNNPACK_AARCH32_ASM_UKERNELS} APPEND_STRING PROPERTY COMPILE_FLAGS " -arch ${IOS_ARCH} ") endif() endif() -if(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" OR IOS_ARCH MATCHES "^arm64.*") +if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm64)$" OR IOS_ARCH MATCHES "^arm64.*") set_property(SOURCE ${PYTORCH_QNNPACK_ARM_NEON_UKERNELS} APPEND_STRING PROPERTY COMPILE_FLAGS " -O2 ") if(IOS) set_property(SOURCE ${PYTORCH_QNNPACK_AARCH64_ASM_UKERNELS} APPEND_STRING PROPERTY COMPILE_FLAGS " -arch ${IOS_ARCH} ") diff --git a/cmake/External/nnpack.cmake b/cmake/External/nnpack.cmake index 84244dc864c3..b1dcd728e690 100644 --- a/cmake/External/nnpack.cmake +++ b/cmake/External/nnpack.cmake @@ -27,7 +27,7 @@ endif() # (2) Anything but x86, x86-64, ARM, ARM64 - unsupported ############################################################################## if(CMAKE_SYSTEM_PROCESSOR) - if(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "^(i686|x86_64|armv5te|armv7-a|armv7l|aarch64)$") + if(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "^(i686|x86_64|armv5te|armv7-a|armv7l|arm64|aarch64)$") message(WARNING "NNPACK is not supported on ${CMAKE_SYSTEM_PROCESSOR} processors. 
" "The only supported architectures are x86, x86-64, ARM, and ARM64. " "Turn this warning off by USE_NNPACK=OFF.") diff --git a/third_party/NNPACK b/third_party/NNPACK index 24b55303f5cf..57616b9a0ef7 160000 --- a/third_party/NNPACK +++ b/third_party/NNPACK @@ -1 +1 @@ -Subproject commit 24b55303f5cf65d75844714513a0d1b1409809bd +Subproject commit 57616b9a0ef7b0f8e56bfe7e9738744b52fe1828 From 2bb2f641c47e2705b8d2bc9514c67764182b1462 Mon Sep 17 00:00:00 2001 From: Sam Estep Date: Fri, 11 Dec 2020 08:15:13 -0800 Subject: [PATCH 173/250] Bring fast_nvcc.py to PyTorch OSS (#48934) Summary: This PR adds `tools/fast_nvcc/fast_nvcc.py`, a mostly-transparent wrapper over `nvcc` that parallelizes compilation of CUDA files when building for multiple architectures at once. Pull Request resolved: https://github.com/pytorch/pytorch/pull/48934 Test Plan: Currently this script isn't actually used in PyTorch OSS. Coming soon! Reviewed By: walterddr Differential Revision: D25286030 Pulled By: samestep fbshipit-source-id: 971a404cf57f5694dea899a27338520d25191706 --- tools/README.md | 5 + tools/fast_nvcc/fast_nvcc.py | 463 +++++++++++++++++++++++++++++++++++ 2 files changed, 468 insertions(+) create mode 100755 tools/fast_nvcc/fast_nvcc.py diff --git a/tools/README.md b/tools/README.md index 527351d1c84a..b940d378320b 100644 --- a/tools/README.md +++ b/tools/README.md @@ -29,6 +29,11 @@ Build system pieces: * [build_libtorch.py](build_libtorch.py) - Script for building libtorch, a standalone C++ library without Python support. This build script is tested in CI. +* [fast_nvcc](fast_nvcc) - Mostly-transparent wrapper over nvcc that + parallelizes compilation when used to build CUDA files for multiple + architectures at once. + * [fast_nvcc.py](fast_nvcc/fast_nvcc.py) - Python script, entrypoint to the + fast nvcc wrapper. Developer tools which you might find useful: diff --git a/tools/fast_nvcc/fast_nvcc.py b/tools/fast_nvcc/fast_nvcc.py new file mode 100755 index 000000000000..2a8d1d731453 --- /dev/null +++ b/tools/fast_nvcc/fast_nvcc.py @@ -0,0 +1,463 @@ +#!/usr/bin/env python3 + +import argparse +import asyncio +import collections +import csv +import hashlib +import itertools +import os +import pathlib +import re +import shlex +import shutil +import subprocess +import sys +import time + + +help_msg = '''fast_nvcc [OPTION]... -- [NVCC_ARG]... + +Run the commands given by nvcc --dryrun, in parallel. + +All flags for this script itself (see the "optional arguments" section +of --help) must be passed before the first "--". Everything after that +first "--" is passed directly to nvcc, with the --dryrun argument added. + +This script only works with the "normal" execution path of nvcc, so for +instance passing --help (after "--") doesn't work since the --help +execution path doesn't compile anything, so adding --dryrun there gives +nothing in stderr. 
+''' +parser = argparse.ArgumentParser(help_msg) +parser.add_argument( + '--faithful', + action='store_true', + help="don't modify the commands given by nvcc (slower)", +) +parser.add_argument( + '--graph', + metavar='FILE.dot', + help='write Graphviz DOT file with execution graph', +) +parser.add_argument( + '--nvcc', + metavar='PATH', + default='nvcc', + help='path to nvcc (default is just "nvcc")', +) +parser.add_argument( + '--save', + metavar='DIR', + help='copy intermediate files from each command into DIR', +) +parser.add_argument( + '--sequential', + action='store_true', + help='sequence commands instead of using the graph (slower)', +) +parser.add_argument( + '--table', + metavar='FILE.csv', + help='write CSV with times and intermediate file sizes', +) +parser.add_argument( + '--verbose', + metavar='FILE.txt', + help='like nvcc --verbose, but expanded and into a file', +) +default_config = parser.parse_args([]) + + +# docs about temporary directories used by NVCC +url_base = 'https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html' +url_vars = f'{url_base}#keeping-intermediate-phase-files' + + +# regex for temporary file names +re_tmp = r'(? '{filename}'") + uniqueified.append(line) + return uniqueified + + +def make_rm_force(commands): + """ + Add --force to all rm commands. + """ + return [f'{c} --force' if c.startswith('rm ') else c for c in commands] + + +def print_verbose_output(*, env, commands, filename): + """ + Human-readably write nvcc --dryrun data to stderr. + """ + padding = len(str(len(commands) - 1)) + with open(filename, 'w') as f: + for name, val in env.items(): + print(f'#{" "*padding}$ {name}={val}', file=f) + for i, command in enumerate(commands): + prefix = f'{str(i).rjust(padding)}$ ' + print(f'#{prefix}{command[0]}', file=f) + for part in command[1:]: + print(f'#{" "*len(prefix)}{part}', file=f) + + +def straight_line_dependencies(commands): + """ + Return a straight-line dependency graph. + """ + return [({i - 1} if i > 0 else set()) for i in range(len(commands))] + + +def files_mentioned(command): + """ + Return fully-qualified names of all tmp files referenced by command. + """ + return [f'/tmp/{match.group(1)}' for match in re.finditer(re_tmp, command)] + + +def nvcc_data_dependencies(commands): + """ + Return a list of the set of dependencies for each command. + """ + # fatbin needs to be treated specially because while the cicc steps + # do refer to .fatbin.c files, they do so through the + # --include_file_name option, since they're generating files that + # refer to .fatbin.c file(s) that will later be created by the + # fatbinary step; so for most files, we make a data dependency from + # the later step to the earlier step, but for .fatbin.c files, the + # data dependency is sort of flipped, because the steps that use the + # files generated by cicc need to wait for the fatbinary step to + # finish first + tmp_files = {} + fatbins = collections.defaultdict(set) + graph = [] + for i, line in enumerate(commands): + deps = set() + for tmp in files_mentioned(line): + if tmp in tmp_files: + dep = tmp_files[tmp] + deps.add(dep) + if dep in fatbins: + for filename in fatbins[dep]: + if filename in tmp_files: + deps.add(tmp_files[filename]) + if tmp.endswith('.fatbin.c') and not line.startswith('fatbinary'): + fatbins[i].add(tmp) + else: + tmp_files[tmp] = i + if line.startswith('rm ') and not deps: + deps.add(i - 1) + graph.append(deps) + return graph + + +def is_weakly_connected(graph): + """ + Return true iff graph is weakly connected. 
+ """ + neighbors = [set() for _ in graph] + for node, predecessors in enumerate(graph): + for pred in predecessors: + neighbors[pred].add(node) + neighbors[node].add(pred) + # assume nonempty graph + stack = [0] + found = {0} + while stack: + node = stack.pop() + for neighbor in neighbors[node]: + if neighbor not in found: + found.add(neighbor) + stack.append(neighbor) + return len(found) == len(graph) + + +def warn_if_not_weakly_connected(graph): + """ + Warn the user if the execution graph is not weakly connected. + """ + if not is_weakly_connected(graph): + fast_nvcc_warn('execution graph is not (weakly) connected') + + +def print_dot_graph(*, commands, graph, filename): + """ + Print a DOT file displaying short versions of the commands in graph. + """ + def name(k): + return f'"{k} {os.path.basename(commands[k][0])}"' + with open(filename, 'w') as f: + print('digraph {', file=f) + # print all nodes, in case it's disconnected + for i in range(len(graph)): + print(f' {name(i)};', file=f) + for i, deps in enumerate(graph): + for j in deps: + print(f' {name(j)} -> {name(i)};', file=f) + print('}', file=f) + + +async def run_command(command, *, env, deps, gather_data, i, save): + """ + Run the command with the given env after waiting for deps. + """ + for task in deps: + await task + if gather_data: + t1 = time.monotonic() + proc = await asyncio.create_subprocess_shell( + command, + env=env, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + stdout, stderr = await proc.communicate() + code = proc.returncode + results = {'exit_code': code, 'stdout': stdout, 'stderr': stderr} + if gather_data: + t2 = time.monotonic() + results['time'] = t2 - t1 + sizes = {} + for tmp_file in files_mentioned(command): + if os.path.exists(tmp_file): + sizes[tmp_file] = os.path.getsize(tmp_file) + else: + sizes[tmp_file] = 0 + results['files'] = sizes + if save: + dest = pathlib.Path(save) / str(i) + dest.mkdir() + for src in map(pathlib.Path, files_mentioned(command)): + if src.exists(): + shutil.copy2(src, dest / (src.name)) + return results + + +async def run_graph(*, env, commands, graph, gather_data, save): + """ + Return outputs/errors (and optionally time/file info) from commands. + """ + tasks = [] + for i, (command, indices) in enumerate(zip(commands, graph)): + deps = {tasks[j] for j in indices} + tasks.append(asyncio.create_task(run_command( + command, + env=env, + deps=deps, + gather_data=gather_data, + i=i, + save=save, + ))) + return [await task for task in tasks] + + +def print_command_outputs(command_results): + """ + Print captured stdout and stderr from commands. + """ + for result in command_results: + sys.stdout.write(result['stdout'].decode('ascii')) + sys.stderr.write(result['stderr'].decode('ascii')) + + +def write_log_csv(command_parts, command_results, *, filename): + """ + Write a CSV file of the times and /tmp file sizes from each command. + """ + tmp_files = [] + for result in command_results: + tmp_files.extend(result['files'].keys()) + with open(filename, 'w', newline='') as csvfile: + fieldnames = ['command', 'seconds'] + list(dict.fromkeys(tmp_files)) + writer = csv.DictWriter(csvfile, fieldnames=fieldnames) + writer.writeheader() + for i, result in enumerate(command_results): + command = f'{i} {os.path.basename(command_parts[i][0])}' + row = {'command': command, 'seconds': result['time']} + writer.writerow({**row, **result['files']}) + + +def exit_code(results): + """ + Aggregate individual exit codes into a single code. 
+ """ + for result in results: + code = result['exit_code'] + if code != 0: + return code + return 0 + + +def fast_nvcc(args, *, config=default_config): + """ + Emulate the result of calling the given nvcc binary with args. + + Should run faster than plain nvcc. + """ + warn_if_windows() + warn_if_tmpdir_flag(args) + dryrun_data = nvcc_dryrun_data(config.nvcc, args) + env = dryrun_data['env'] + warn_if_tmpdir_set(env) + commands = dryrun_data['commands'] + if not config.faithful: + commands = make_rm_force(unique_module_id_files(commands)) + command_parts = list(map(shlex.split, commands)) + if config.verbose: + print_verbose_output( + env=env, + commands=command_parts, + filename=config.verbose, + ) + graph = nvcc_data_dependencies(commands) + warn_if_not_weakly_connected(graph) + if config.graph: + print_dot_graph( + commands=command_parts, + graph=graph, + filename=config.graph, + ) + if config.sequential: + graph = straight_line_dependencies(commands) + results = asyncio.run(run_graph( + env=env, + commands=commands, + graph=graph, + gather_data=bool(config.table), + save=config.save, + )) + print_command_outputs(results) + if config.table: + write_log_csv(command_parts, results, filename=config.table) + return exit_code([dryrun_data] + results) + + +def our_arg(arg): + return arg != '--' + + +if __name__ == '__main__': + argv = sys.argv[1:] + us = list(itertools.takewhile(our_arg, argv)) + them = list(itertools.dropwhile(our_arg, argv)) + sys.exit(fast_nvcc(them[1:], config=parser.parse_args(us))) From dcd1e3d78d3163aec75a2eb1aedb4241e01a9c78 Mon Sep 17 00:00:00 2001 From: generatedunixname89002005325676 Date: Fri, 11 Dec 2020 08:39:25 -0800 Subject: [PATCH 174/250] [AutoAccept][Codemod][FBSourceClangFormatLinter] Daily `arc lint --take CLANGFORMAT` Reviewed By: zertosh Differential Revision: D25490983 fbshipit-source-id: b24a11214a485a4a24ccf7da1e72715b450d3a81 --- test/cpp/tensorexpr/test_kernel.cpp | 2 +- torch/csrc/jit/tensorexpr/kernel.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test/cpp/tensorexpr/test_kernel.cpp b/test/cpp/tensorexpr/test_kernel.cpp index 895b025ac4e0..cf658ad488f6 100644 --- a/test/cpp/tensorexpr/test_kernel.cpp +++ b/test/cpp/tensorexpr/test_kernel.cpp @@ -522,7 +522,7 @@ TEST(Kernel, DISABLED_SumAllAxes) { std::string li_to_str(at::ArrayRef li) { std::stringstream out; bool first = true; - for (auto elem: li) { + for (auto elem : li) { if (!first) { out << ", "; } diff --git a/torch/csrc/jit/tensorexpr/kernel.cpp b/torch/csrc/jit/tensorexpr/kernel.cpp index c4228ae955b6..88cf5761cfa1 100644 --- a/torch/csrc/jit/tensorexpr/kernel.cpp +++ b/torch/csrc/jit/tensorexpr/kernel.cpp @@ -1,5 +1,6 @@ #include +#include #include #include #include @@ -8,7 +9,6 @@ #include #include #include -#include using namespace torch::jit; using namespace torch::jit::tensorexpr; From f204f77e6d0933b274c5ebc5c8f7ce1e1ee1c2fd Mon Sep 17 00:00:00 2001 From: Luca Wehrstedt Date: Fri, 11 Dec 2020 09:23:09 -0800 Subject: [PATCH 175/250] Drop FutureNCCL in favor of vanilla CUDAFuture (#49014) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49014 We extracted a generic and reusable CUDAFuture class from FutureNCCL, but we had left FutureNCCL around, as a subclass of CUDAFuture, in order to deal with some peculiarity of ProcessGroupNCCL, namely that the future would be completed right away when constructed and that its CUDA events would be _shared_ with the ones of the WorkNCCL. 
This required some "hacks" in CUDAFuture itself (protected members, fields wrapped in shared_ptrs, ...). My understanding is that creating CUDA events is a rather cheap operation. That would mean that we could afford to record _twice_ the events after each NCCL call, once for the WorkNCCL and once for the future. By doing so, we can use the CUDAFuture class directly and revert all its hacks. ghstack-source-id: 118391217 Test Plan: Unit tests Reviewed By: mrshenli Differential Revision: D25355272 fbshipit-source-id: 3a2a0891724928221ff0f08600675d2f5990e674 --- aten/src/ATen/cuda/CUDAFuture.h | 22 +++------- torch/csrc/distributed/c10d/init.cpp | 15 +++---- torch/lib/c10d/ProcessGroupNCCL.cpp | 22 ++++++++-- torch/lib/c10d/ProcessGroupNCCL.hpp | 64 +++------------------------- 4 files changed, 37 insertions(+), 86 deletions(-) diff --git a/aten/src/ATen/cuda/CUDAFuture.h b/aten/src/ATen/cuda/CUDAFuture.h index 4334101478f1..ae43fb2a2dd6 100644 --- a/aten/src/ATen/cuda/CUDAFuture.h +++ b/aten/src/ATen/cuda/CUDAFuture.h @@ -21,7 +21,7 @@ namespace at { namespace cuda { -struct TORCH_CUDA_API CUDAFuture : at::ivalue::Future { +struct TORCH_CUDA_API CUDAFuture final : at::ivalue::Future { public: using at::ivalue::Future::Future; @@ -56,12 +56,11 @@ struct TORCH_CUDA_API CUDAFuture : at::ivalue::Future { } } - cudaEvents_ = std::make_shared>(); for (c10::DeviceIndex idx = 0; idx < isCudaDeviceUsed.size(); idx++) { if (isCudaDeviceUsed[idx]) { at::cuda::CUDAEvent cudaEvent; cudaEvent.record(at::cuda::getCurrentCUDAStream(idx)); - (*cudaEvents_).push_back(std::move(cudaEvent)); + cudaEvents_.push_back(std::move(cudaEvent)); } } } @@ -78,7 +77,7 @@ struct TORCH_CUDA_API CUDAFuture : at::ivalue::Future { // misbehaving this also ends up using memory on those devices, which the // user might not want. std::vector streams; - for (at::cuda::CUDAEvent& cudaEvent : *cudaEvents_) { + for (at::cuda::CUDAEvent& cudaEvent : cudaEvents_) { c10::DeviceIndex idx = cudaEvent.device_index(); // FIXME Should we find a way to allow to change the priority of // streams? @@ -107,7 +106,7 @@ struct TORCH_CUDA_API CUDAFuture : at::ivalue::Future { } void postWaitHook(const at::IValue& value) override { - for (at::cuda::CUDAEvent& cudaEvent : *cudaEvents_) { + for (at::cuda::CUDAEvent& cudaEvent : cudaEvents_) { cudaEvent.block( at::cuda::getCurrentCUDAStream(cudaEvent.device_index())); } @@ -120,12 +119,7 @@ struct TORCH_CUDA_API CUDAFuture : at::ivalue::Future { } } - // FIXME This field is protected (rather than private) and wrapped in a - // shared_ptr in order to support the FutureNCCL subclass, which wants to set - // the events on its own in order to use the same ones as its WorkNCCL class. - // Once WorkNCCL is gone (as part of the Future and Work merge) this should be - // fixed. - protected: + private: // The device that was current when markCompleted was called, which we'll // restore when invoking callbacks. c10::DeviceIndex currentDevice_; @@ -134,19 +128,15 @@ struct TORCH_CUDA_API CUDAFuture : at::ivalue::Future { // are recorded on the appropriate streams when the future is marked completed // and can then be queried/waited/blocked on. There is one event for each // distinct device on which the value's tensors reside. - std::shared_ptr> cudaEvents_; + std::vector cudaEvents_; // A cached version of the data ptrs extracted from the value when the future // is first marked completed. 
std::vector> dataPtrs_; - private: DataPtrExtractor dataPtrExtractor_; std::mutex dataPtrExtractorMutex_; - // FIXME This too is protected so that it can be used by FutureNCCL. Please - // undo that once FutureNCCL is dropped in favor of a "vanilla" CUDAFuture. - protected: std::vector> extractDataPtrs( const at::IValue& value) { std::unique_lock lock(dataPtrExtractorMutex_); diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index 54fc33e54424..0a7daa3a5b94 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -1127,24 +1127,23 @@ that adds a prefix to each key inserted to the store. >>> ddp_model._egister_comm_hook(state = None, hook = allreduce) .. warning :: - ``get_future`` API supports only NCCL backend and single-process single-device mode. + ``get_future`` API supports only NCCL backend. The ``torch._C.Future`` object returned by this API can be used in - ``DistributedDataParallel.register_comm_hook``, but it is subject to some subtle - differences compared to ``torch.futures.Future`` due to compromises made for performance - reasons. + ``DistributedDataParallel.register_comm_hook``, and adds some CUDA-specific + features on top of ``torch.futures.Future``. In the example above, ``allreduce`` work will be done on GPU using NCCL backend, ``fut.wait()`` will return after synchronizing the appropriate NCCL streams - with PyTorch's default device streams to ensure we can have asynchronous CUDA + with PyTorch's current device streams to ensure we can have asynchronous CUDA execution and it does not wait for the entire operation to complete on GPU. Note that - ``FutureNCCL`` does not support ``NCCL_BLOCKING_WAIT`` flag or NCCL's ``barrier()``. + ``CUDAFuture`` does not support ``NCCL_BLOCKING_WAIT`` flag or NCCL's ``barrier()``. In addition, if a callback function was added by ``fut.then()``, it will wait until ``WorkNCCL``'s NCCL streams synchronize with ``ProcessGroupNCCL``'s dedicated callback stream and invoke the callback inline after running the callback on the callback stream. - ``fut.then()`` will return another ``FutureNCCL`` that holds the return value of the + ``fut.then()`` will return another ``CUDAFuture`` that holds the return value of the callback and a ``CUDAEvent`` that recorded the callback stream. - Note that ``fut.done()`` returns if the enire operation is completed on the GPU. + Note that ``fut.done()`` returns only whether the operation has been enqueued on the GPU. )"); module.def( diff --git a/torch/lib/c10d/ProcessGroupNCCL.cpp b/torch/lib/c10d/ProcessGroupNCCL.cpp index 5152ce01e25e..19085f155020 100644 --- a/torch/lib/c10d/ProcessGroupNCCL.cpp +++ b/torch/lib/c10d/ProcessGroupNCCL.cpp @@ -1008,7 +1008,7 @@ std::vector ProcessGroupNCCL::WorkNCCL::result() { c10::intrusive_ptr ProcessGroupNCCL::WorkNCCL:: getFuture() { - return c10::make_intrusive(at::IValue(*outputs_), cudaEvents_); + return future_; } void ProcessGroupNCCL::workEnqueue( @@ -1046,7 +1046,7 @@ c10::intrusive_ptr ProcessGroupNCCL::collective( bool can_profile = outputs.size() == 1; auto work = initWork(devices, rank_, opType, can_profile ? profilingTitle : nullptr); - // Store references to outputs to be used by WorkNCCL::getFuture. + // Store references to outputs to be used by WorkNCCL::result and operator<<. 
work->outputs_ = std::make_shared>(outputs); at::cuda::OptionalCUDAGuard gpuGuard; @@ -1088,6 +1088,13 @@ c10::intrusive_ptr ProcessGroupNCCL::collective( work->ncclComms_[i] = ncclComms[i]; } + { + at::cuda::CUDAMultiStreamGuard streamGuard(ncclStreams_[key]); + work->future_ = c10::make_intrusive( + c10::ListType::create(c10::TensorType::get())); + work->future_->markCompleted(at::IValue(*work->outputs_)); + } + // Set appropriate work parameters. work->blockingWait_ = blockingWait_; work->opTimeout_ = opTimeout_; @@ -1097,7 +1104,7 @@ c10::intrusive_ptr ProcessGroupNCCL::collective( // recordFunctionEndCallback_ is normally called in fininsh() function by // base class, but since finish is not called by WorkNCCL, we schedule this // function to be run when work is done. Note that addCallback() onto the - // Work's futureNCCL is not useful here, as it would just run the callback + // Work's CUDAFuture is not useful here, as it would just run the callback // inline. // Note when can_profile is false, profilingTitle is not provided and so, // recordFunctionEndCallback_ is not set. @@ -1132,7 +1139,7 @@ c10::intrusive_ptr ProcessGroupNCCL::pointToPoint( auto work = initWork(devices, rank_, opType); if (opType == OpType::RECV) { - // Store references to outputs to be used by WorkNCCL::getFuture. + // Store references to outputs to be used by WorkNCCL::result and operator<<. work->outputs_ = std::make_shared>(tensors); } @@ -1178,6 +1185,13 @@ c10::intrusive_ptr ProcessGroupNCCL::pointToPoint( work->store_ = store_; } + if (opType == OpType::RECV) { + at::cuda::CUDAMultiStreamGuard streamGuard(ncclStreams_[key]); + work->future_ = c10::make_intrusive( + c10::ListType::create(c10::TensorType::get())); + work->future_->markCompleted(at::IValue(*work->outputs_)); + } + return work; } diff --git a/torch/lib/c10d/ProcessGroupNCCL.hpp b/torch/lib/c10d/ProcessGroupNCCL.hpp index adbfec445549..4d9dc3bd1ae8 100644 --- a/torch/lib/c10d/ProcessGroupNCCL.hpp +++ b/torch/lib/c10d/ProcessGroupNCCL.hpp @@ -110,7 +110,7 @@ class ProcessGroupNCCL : public ProcessGroup { bool finishedGPUExecution(); // Get a Future object that will be marked as completed internally. - // It actually returns a FutureNCCL object which is a sub class Future. + // It actually returns a CUDAFuture object which is a sub class of Future. c10::intrusive_ptr getFuture() override; // Helper function that sets an exception_ptr on the WorkNCCL object. @@ -170,9 +170,13 @@ class ProcessGroupNCCL : public ProcessGroup { // to the store. c10::intrusive_ptr store_; - // Store a reference to NCCL collective's outputs to be used by getFuture. + // Store a reference to NCCL collective's outputs, used by result and to + // give a more descriptive message when representing the Work as a string. std::shared_ptr> outputs_; + // The future returned by getFuture. + c10::intrusive_ptr future_; + friend class ProcessGroupNCCL; }; @@ -190,62 +194,6 @@ class ProcessGroupNCCL : public ProcessGroup { bool isHighPriorityStream; }; - // FutureNCCL is a subclass of ivalue's Future. The goal is to use - // this class in getFuture API of WorkNCCL. This Future is mostly a - // wrapper to synchronize streams appropriately and it mostly enables - // the async programming model of CUDA while trying to adhere to the - // Future interface. FutureNCCL does not support NCCL_BLOCKING_WAIT flag - // or NCCL's barrier(). 
- // - // If created by WorkNCCL's getFuture API, FutureNCCL has a reference to - // WorkNCCL's cudaEvents, NCCL collective's outputs, and the device indices of - // outputs' devices. Its value is NCCL collective's outputs. - // - // If created by FutureNCCL's then callback, its value becomes the value of - // callback() and its cudaEvents will record the NCCL stream that runs that - // callback. Before invoking the callback, FutureNCCL will synchronize its - // own cudaEvents with the stream that runs the callback. This design - // enables synchronizing the appropriate streams and avoids stalling PyTorch's - // default stream while running the callback. In case of multiple then - // callbacks, each will be executed on its own fresh stream. - struct FutureNCCL : at::cuda::CUDAFuture { - public: - FutureNCCL( - at::IValue value, - std::shared_ptr> cudaEvents) - : at::cuda::CUDAFuture(c10::ListType::create(c10::TensorType::get())){ - // Check that the device indices are distinct - std::unordered_set uniqueDeviceIndices; - for (const at::cuda::CUDAEvent& event : *cudaEvents) { - TORCH_INTERNAL_ASSERT(event.isCreated()); - uniqueDeviceIndices.insert(event.device_index()); - } - TORCH_INTERNAL_ASSERT( - cudaEvents->size() == uniqueDeviceIndices.size(), - "Got ", cudaEvents->size(), " events, but only ", - uniqueDeviceIndices.size(), " distinct devices"); - auto dataPtrs = extractDataPtrs(value); - for (const at::DataPtr& data_ptr : dataPtrs) { - TORCH_INTERNAL_ASSERT( - std::find_if( - cudaEvents->begin(), - cudaEvents->end(), - [&](const at::cuda::CUDAEvent& ev) { - return ev.device_index() == data_ptr.device().index(); - }) != cudaEvents->end()); - } - currentDevice_ = c10::cuda::current_device(); - cudaEvents_ = std::move(cudaEvents); - dataPtrs_ = std::move(dataPtrs); - markCompleted(std::move(value)); - } - - protected: - void postMarkCompletedHook(const at::IValue& value) override { - // Do nothing because the constructor already stored the events. - } - }; - // If you wish to create multiple process groups, each with a potentially // different rank and size, you can do so by passing a new store instance // to each one. If you have only a single store object, you can From f10b53d9eaa282d240fa79ee8f03ea42457803d5 Mon Sep 17 00:00:00 2001 From: Dhruv Matani Date: Fri, 11 Dec 2020 09:38:53 -0800 Subject: [PATCH 176/250] [PyTorch Mobile] Record dtypes for tensors used in kernel function implementations (#48826) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48826 This change updates various macros to pass in the kernel tag string (`const char*`) into the macro that sets up the `case` statement for the dtype switch. This macro already receives the dtype (enum) which we also need. There are 2 phases we need to build out for the `dtype` tracing to work: 1. Recording Phase 2. Conditional Compilation Phase For this most part, this change is trying to focus on [1] (The Recording Phase) and sets up a new `RecordScope` enum value to track kernel dtypes. This code is compiled in only if a specific macro is defined (since this is an **extremely** hot code path, and even the slightest regression here can cause tremendous slow down overall). I have only added a skeleton of the phase [2] (Conditional Compilation Phase) and there is a no-op `constexpr` method that selects every dtype in the kernel implementation. In subsequent diffs, this will be updated to point to a code-generated function based on the result of tracing the models that were requested. 
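To make the recording phase concrete, a stripped-down sketch of the mechanism might look as follows; RECORD_DTYPE, traced_dtypes and the toy ScalarType enum are illustrative stand-ins rather than the actual ATen code, with only the build-flag name taken from this patch.

    #include <iostream>
    #include <set>
    #include <string>

    enum class ScalarType { Float, Double, Half };

    const char* toString(ScalarType t) {
      switch (t) {
        case ScalarType::Float: return "Float";
        case ScalarType::Double: return "Double";
        case ScalarType::Half: return "Half";
      }
      return "Unknown";
    }

    // Registry of "<kernel tag>$<dtype>" pairs observed while tracing a model.
    std::set<std::string>& traced_dtypes() {
      static std::set<std::string> s;
      return s;
    }

    // Compiled in only when the tracing flag is defined; otherwise a no-op,
    // so the hot dispatch path pays nothing in regular builds.
    #if defined(ENABLE_RECORD_KERNEL_FUNCTION_DTYPE)
    #define RECORD_DTYPE(NAME, TYPE) \
      traced_dtypes().insert(std::string(NAME) + "$" + toString(TYPE))
    #else
    #define RECORD_DTYPE(NAME, TYPE)
    #endif

    void my_kernel(ScalarType dtype) {
      (void)dtype;  // only used when tracing is enabled
      RECORD_DTYPE("my_kernel", dtype);
      // ... dtype-specialized work would go here ...
    }

    int main() {
      my_kernel(ScalarType::Float);
      my_kernel(ScalarType::Half);
      for (const auto& entry : traced_dtypes())
        std::cout << entry << "\n";  // prints entries only in tracing builds
    }

A later build can then consume the recorded "kernel$dtype" pairs to generate a should_include_kernel_dtype() that rejects unused dtypes, which is the conditional-compilation phase mentioned above.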
ghstack-source-id: 118336675 Test Plan: See the next few diff in the stack for the application of this change to both record triggered dtypes (in kernel functions) as well as select dtype specific portions of kernel functions. Reviewed By: ezyang Differential Revision: D24220926 fbshipit-source-id: d7dbf21c7dcc6ce981d0fd4dcb62ca829fe3f69d --- aten/src/ATen/Dispatch.h | 451 ++++++++++++------ aten/src/ATen/native/cpu/SortingKernel.cpp | 8 +- .../ATen/native/cuda/ScatterGatherKernel.cu | 8 +- aten/src/ATen/native/cuda/TriangularOps.cu | 2 +- aten/src/ATen/record_function.h | 2 + 5 files changed, 305 insertions(+), 166 deletions(-) diff --git a/aten/src/ATen/Dispatch.h b/aten/src/ATen/Dispatch.h index 9f0c51166172..41252609953f 100644 --- a/aten/src/ATen/Dispatch.h +++ b/aten/src/ATen/Dispatch.h @@ -2,17 +2,59 @@ #include #include +#include #include #include #include +#include #include +#include + +namespace at { +/** + * The method should_include_kernel_dtype() returns true/false + * based on whether the switching code for a specific dtype should be + * included based on build time constants generated from tracing model + * execution. This method will be implmeneted via code-generation and + * included in this file when code-gen is ready. + */ +inline constexpr bool should_include_kernel_dtype( + const char *kernel_tag_str, + at::ScalarType scalar_type +) { + return true; +} +} + +/** + * In the Facebook internal build (using BUCK), this macro is enabled by + * passing in -c pt.enable_record_kernel_dtype=1 when building the tracer + * binary. + */ +#if defined ENABLE_RECORD_KERNEL_FUNCTION_DTYPE +#define RECORD_KERNEL_FUNCTION_DTYPE(NAME, enum_type) \ + {RECORD_FUNCTION_WITH_SCOPE( \ + at::RecordScope::KERNEL_FUNCTION_DTYPE, \ + std::string(NAME) + "$" + toString(enum_type), \ + {});} +#else +#define RECORD_KERNEL_FUNCTION_DTYPE(NAME, enum_type) +#endif -#define AT_PRIVATE_CASE_TYPE(enum_type, type, ...) \ - case enum_type: { \ - using scalar_t = type; \ - return __VA_ARGS__(); \ +#define AT_PRIVATE_CASE_TYPE_USING_HINT(NAME, enum_type, type, HINT, ...) \ + case enum_type: { \ + at::guts::if_constexpr<(!at::should_include_kernel_dtype(NAME, enum_type))>( \ + [&] { \ + AT_ERROR("dtype '", toString(enum_type), "' not selected for kernel tag ", #NAME); \ + } \ + ); \ + using HINT = type; \ + return __VA_ARGS__(); \ } +#define AT_PRIVATE_CASE_TYPE(NAME, enum_type, type, ...) \ + AT_PRIVATE_CASE_TYPE_USING_HINT(NAME, enum_type, type, scalar_t, __VA_ARGS__) + // Workaround for C10_UNUSED because CUDA 10.1 and below fails to handle unused // attribute in the type aliasing context. Keep name long and verbose to avoid // macro collisions. @@ -143,6 +185,21 @@ inline void deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX() {} // 4. Should complex be supported? The answer is almost always no, // unless you are working on "generic" code that should work on // all dtypes. +// +// Parameters: +// ----------- +// +// 1. The NAME argument is a "tag" that is used to trace and then +// conditionally compile fragments of the case statements such +// that the kernel functions are specialized only for the dtypes +// that are needed. The NAME parameter *must* be a build time +// cons char* (can't be std::string, etc...) +// +// Please ensure that the NAME is unique for every implementation +// or you run the risk of over-including code for the kernel +// functions. There is no risk of missing out on any code, so +// it's mostly a risk of a Type-2 error, and not a Type-1 error. 
+// // NB: the the_type variable is not used, but we have kept it for // backwards compatibility. It's probably not used by anyone though; @@ -154,26 +211,28 @@ inline void deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX() {} const auto& the_type = TYPE; \ /* don't use TYPE again in case it is an expensive or side-effect op */ \ at::ScalarType _st = ::detail::scalar_type(the_type); \ + RECORD_KERNEL_FUNCTION_DTYPE(NAME, _st); \ switch (_st) { \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Double, double, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Float, float, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Double, double, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Float, float, __VA_ARGS__) \ default: \ AT_ERROR(#NAME, " not implemented for '", toString(_st), "'"); \ } \ }() -#define AT_DISPATCH_FLOATING_TYPES_AND_HALF(TYPE, NAME, ...) \ - [&] { \ - const auto& the_type = TYPE; \ - /* don't use TYPE again in case it is an expensive or side-effect op */ \ - at::ScalarType _st = ::detail::scalar_type(the_type); \ - switch (_st) { \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Double, double, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Float, float, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Half, at::Half, __VA_ARGS__) \ - default: \ - AT_ERROR(#NAME, " not implemented for '", toString(_st), "'"); \ - } \ +#define AT_DISPATCH_FLOATING_TYPES_AND_HALF(TYPE, NAME, ...) \ + [&] { \ + const auto& the_type = TYPE; \ + /* don't use TYPE again in case it is an expensive or side-effect op */ \ + at::ScalarType _st = ::detail::scalar_type(the_type); \ + RECORD_KERNEL_FUNCTION_DTYPE(NAME, _st); \ + switch (_st) { \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Double, double, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Float, float, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Half, at::Half, __VA_ARGS__) \ + default: \ + AT_ERROR(#NAME, " not implemented for '", toString(_st), "'"); \ + } \ }() #define AT_DISPATCH_FLOATING_TYPES_AND(SCALARTYPE, TYPE, NAME, ...) 
\ @@ -181,10 +240,11 @@ inline void deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX() {} const auto& the_type = TYPE; \ /* don't use TYPE again in case it is an expensive or side-effect op */ \ at::ScalarType _st = ::detail::scalar_type(the_type); \ + RECORD_KERNEL_FUNCTION_DTYPE(NAME, _st); \ switch (_st) { \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Double, double, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Float, float, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE( \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Double, double, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Float, float, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, \ SCALARTYPE, \ decltype(c10::impl::ScalarTypeToCPPType::t), \ __VA_ARGS__) \ @@ -199,14 +259,17 @@ inline void deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX() {} const auto& the_type = TYPE; \ /* don't use TYPE again in case it is an expensive or side-effect op */ \ at::ScalarType _st = ::detail::scalar_type(the_type); \ + RECORD_KERNEL_FUNCTION_DTYPE(NAME, _st); \ switch (_st) { \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Double, double, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Float, float, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Double, double, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Float, float, __VA_ARGS__) \ AT_PRIVATE_CASE_TYPE( \ + NAME, \ SCALARTYPE1, \ decltype(c10::impl::ScalarTypeToCPPType::t), \ __VA_ARGS__) \ AT_PRIVATE_CASE_TYPE( \ + NAME, \ SCALARTYPE2, \ decltype(c10::impl::ScalarTypeToCPPType::t), \ __VA_ARGS__) \ @@ -220,13 +283,20 @@ inline void deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX() {} const auto& the_type = TYPE; \ /* don't use TYPE again in case it is an expensive or side-effect op */ \ at::ScalarType _st = ::detail::scalar_type(the_type); \ + RECORD_KERNEL_FUNCTION_DTYPE(NAME, _st); \ switch (_st) { \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Double, double, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Float, float, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Double, double, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Float, float, __VA_ARGS__) \ AT_PRIVATE_CASE_TYPE( \ - at::ScalarType::ComplexDouble, c10::complex, __VA_ARGS__) \ + NAME, \ + at::ScalarType::ComplexDouble, \ + c10::complex, \ + __VA_ARGS__) \ AT_PRIVATE_CASE_TYPE( \ - at::ScalarType::ComplexFloat, c10::complex, __VA_ARGS__) \ + NAME, \ + at::ScalarType::ComplexFloat, \ + c10::complex, \ + __VA_ARGS__) \ default: \ AT_ERROR(#NAME, " not implemented for '", toString(_st), "'"); \ } \ @@ -238,14 +308,18 @@ inline void deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX() {} const auto& the_type = TYPE; \ /* don't use TYPE again in case it is an expensive or side-effect op */ \ at::ScalarType _st = ::detail::scalar_type(the_type); \ + RECORD_KERNEL_FUNCTION_DTYPE(NAME, _st); \ switch (_st) { \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Double, double, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Float, float, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Double, double, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Float, float, __VA_ARGS__) \ AT_PRIVATE_CASE_TYPE( \ + NAME, \ at::ScalarType::ComplexDouble, c10::complex, __VA_ARGS__) \ AT_PRIVATE_CASE_TYPE( \ + NAME, \ at::ScalarType::ComplexFloat, c10::complex, __VA_ARGS__) \ AT_PRIVATE_CASE_TYPE( \ + NAME, \ SCALARTYPE, \ decltype(c10::impl::ScalarTypeToCPPType::t), \ __VA_ARGS__) \ @@ -259,19 +333,28 @@ inline void 
deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX() {} [&] { \ const auto& the_type = TYPE; \ /* don't use TYPE again in case it is an expensive or side-effect op */ \ - at::ScalarType _st = ::detail::scalar_type(the_type); \ - switch (_st) { \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Double, double, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Float, float, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE( \ - at::ScalarType::ComplexDouble, c10::complex, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE( \ - at::ScalarType::ComplexFloat, c10::complex, __VA_ARGS__) \ + at::ScalarType _st = ::detail::scalar_type(the_type); \ + RECORD_KERNEL_FUNCTION_DTYPE(NAME, _st); \ + switch (_st) { \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Double, double, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Float, float, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE( \ + NAME, \ + at::ScalarType::ComplexDouble, \ + c10::complex, \ + __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE( \ + NAME, \ + at::ScalarType::ComplexFloat, \ + c10::complex, \ + __VA_ARGS__) \ AT_PRIVATE_CASE_TYPE( \ + NAME, \ SCALARTYPE1, \ decltype(c10::impl::ScalarTypeToCPPType::t), \ __VA_ARGS__) \ AT_PRIVATE_CASE_TYPE( \ + NAME, \ SCALARTYPE2, \ decltype(c10::impl::ScalarTypeToCPPType::t), \ __VA_ARGS__) \ @@ -285,31 +368,36 @@ inline void deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX() {} const auto& the_type = TYPE; \ /* don't use TYPE again in case it is an expensive or side-effect op */ \ at::ScalarType _st = ::detail::scalar_type(the_type); \ + RECORD_KERNEL_FUNCTION_DTYPE(NAME, _st); \ switch (_st) { \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Byte, uint8_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Char, int8_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Int, int32_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Long, int64_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Short, int16_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Byte, uint8_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Char, int8_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Int, int32_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Long, int64_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Short, int16_t, __VA_ARGS__) \ default: \ AT_ERROR(#NAME, " not implemented for '", toString(_st), "'"); \ } \ }() -#define AT_DISPATCH_INTEGRAL_TYPES_AND(SCALARTYPE, TYPE, NAME, ...) \ - [&] { \ - switch (TYPE) { \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Byte, uint8_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Char, int8_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Int, int32_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Long, int64_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Short, int16_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE( \ +#define AT_DISPATCH_INTEGRAL_TYPES_AND(SCALARTYPE, TYPE, NAME, ...) 
\ + [&] { \ + const auto& the_type = TYPE; \ + /* don't use TYPE again in case it is an expensive or side-effect op */ \ + at::ScalarType _st = ::detail::scalar_type(the_type); \ + RECORD_KERNEL_FUNCTION_DTYPE(NAME, _st); \ + switch (_st) { \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Byte, uint8_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Char, int8_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Int, int32_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Long, int64_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Short, int16_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, \ SCALARTYPE, \ decltype(c10::impl::ScalarTypeToCPPType::t), \ __VA_ARGS__) \ default: \ - AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \ + AT_ERROR(#NAME, " not implemented for '", toString(_st), "'"); \ } \ }() @@ -318,17 +406,18 @@ inline void deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX() {} const auto& the_type = TYPE; \ /* don't use TYPE again in case it is an expensive or side-effect op */ \ at::ScalarType _st = ::detail::scalar_type(the_type); \ - switch (_st) { \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Byte, uint8_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Char, int8_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Double, double, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Float, float, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Int, int32_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Long, int64_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Short, int16_t, __VA_ARGS__) \ - default: \ - AT_ERROR(#NAME, " not implemented for '", toString(_st), "'"); \ - } \ + RECORD_KERNEL_FUNCTION_DTYPE(NAME, _st); \ + switch (_st) { \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Byte, uint8_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Char, int8_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Double, double, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Float, float, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Int, int32_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Long, int64_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Short, int16_t, __VA_ARGS__) \ + default: \ + AT_ERROR(#NAME, " not implemented for '", toString(_st), "'"); \ + } \ }() #define AT_DISPATCH_COMPLEX_TYPES(TYPE, NAME, ...) 
\ @@ -336,11 +425,18 @@ inline void deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX() {} const auto& the_type = TYPE; \ /* don't use TYPE again in case it is an expensive or side-effect op */ \ at::ScalarType _st = ::detail::scalar_type(the_type); \ + RECORD_KERNEL_FUNCTION_DTYPE(NAME, _st); \ switch (_st) { \ AT_PRIVATE_CASE_TYPE( \ - at::ScalarType::ComplexFloat, c10::complex, __VA_ARGS__) \ + NAME, \ + at::ScalarType::ComplexFloat, \ + c10::complex, \ + __VA_ARGS__) \ AT_PRIVATE_CASE_TYPE( \ - at::ScalarType::ComplexDouble, c10::complex, __VA_ARGS__) \ + NAME, \ + at::ScalarType::ComplexDouble, \ + c10::complex, \ + __VA_ARGS__) \ default: \ AT_ERROR(#NAME, " not implemented for '", toString(_st), "'"); \ } \ @@ -351,6 +447,7 @@ inline void deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX() {} const auto& the_type = TYPE; \ /* don't use TYPE again in case it is an expensive or side-effect op */ \ at::ScalarType _st = ::detail::scalar_type(the_type); \ + RECORD_KERNEL_FUNCTION_DTYPE(NAME, _st); \ switch (_st) { \ AT_QINT_PRIVATE_CASE_TYPE( \ at::kQInt8, at::qint8, at::kChar, int8_t, __VA_ARGS__) \ @@ -368,6 +465,7 @@ inline void deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX() {} const auto& the_type = TYPE; \ /* don't use TYPE again in case it is an expensive or side-effect op */ \ at::ScalarType _st = ::detail::scalar_type(the_type); \ + RECORD_KERNEL_FUNCTION_DTYPE(NAME, _st); \ switch (_st) { \ AT_QINT_SUB_BYTE_PRIVATE_CASE_TYPE( \ at::kQInt8, at::qint8, int8_t, CHAR_BIT, SCHAR_MIN, SCHAR_MAX, __VA_ARGS__) \ @@ -387,17 +485,18 @@ inline void deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX() {} const auto& the_type = TYPE; \ /* don't use TYPE again in case it is an expensive or side-effect op*/ \ at::ScalarType _st = ::detail::scalar_type(the_type); \ + RECORD_KERNEL_FUNCTION_DTYPE(NAME, _st); \ switch (_st) { \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Byte, uint8_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Char, int8_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Double, double, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Float, float, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Int, int32_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Long, int64_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Short, int16_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE( \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Byte, uint8_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Char, int8_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Double, double, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Float, float, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Int, int32_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Long, int64_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Short, int16_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, \ at::ScalarType::ComplexFloat, c10::complex, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE( \ + AT_PRIVATE_CASE_TYPE(NAME, \ at::ScalarType::ComplexDouble, c10::complex, __VA_ARGS__) \ default: \ AT_ERROR(#NAME, " not implemented for '", toString(_st), "'"); \ @@ -406,154 +505,196 @@ inline void deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX() {} #define AT_DISPATCH_ALL_TYPES_AND(SCALARTYPE, TYPE, NAME, ...) 
\ [&] { \ - switch (TYPE) { \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Byte, uint8_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Char, int8_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Double, double, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Float, float, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Int, int32_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Long, int64_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Short, int16_t, __VA_ARGS__) \ + const auto& the_type = TYPE; \ + /* don't use TYPE again in case it is an expensive or side-effect op*/ \ + at::ScalarType _st = ::detail::scalar_type(the_type); \ + RECORD_KERNEL_FUNCTION_DTYPE(NAME, _st); \ + switch (_st) { \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Byte, uint8_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Char, int8_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Double, double, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Float, float, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Int, int32_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Long, int64_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Short, int16_t, __VA_ARGS__) \ AT_PRIVATE_CASE_TYPE( \ + NAME, \ SCALARTYPE, \ decltype(c10::impl::ScalarTypeToCPPType::t), \ __VA_ARGS__) \ default: \ - AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \ + AT_ERROR(#NAME, " not implemented for '", toString(_st), "'"); \ } \ }() #define AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(SCALARTYPE, TYPE, NAME, ...) \ [&] { \ - switch (TYPE) { \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Byte, uint8_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Char, int8_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Double, double, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Float, float, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Int, int32_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Long, int64_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Short, int16_t, __VA_ARGS__) \ + const auto& the_type = TYPE; \ + /* don't use TYPE again in case it is an expensive or side-effect op*/ \ + at::ScalarType _st = ::detail::scalar_type(the_type); \ + RECORD_KERNEL_FUNCTION_DTYPE(NAME, _st); \ + switch (_st) { \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Byte, uint8_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Char, int8_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Double, double, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Float, float, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Int, int32_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Long, int64_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Short, int16_t, __VA_ARGS__) \ AT_PRIVATE_CASE_TYPE( \ - at::ScalarType::ComplexFloat, c10::complex, __VA_ARGS__) \ + NAME, \ + at::ScalarType::ComplexFloat, \ + c10::complex, \ + __VA_ARGS__) \ AT_PRIVATE_CASE_TYPE( \ - at::ScalarType::ComplexDouble, c10::complex, __VA_ARGS__) \ + NAME, \ + at::ScalarType::ComplexDouble, \ + c10::complex, \ + __VA_ARGS__) \ AT_PRIVATE_CASE_TYPE( \ + NAME, \ SCALARTYPE, \ decltype(c10::impl::ScalarTypeToCPPType::t), \ __VA_ARGS__) \ default: \ - AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \ + AT_ERROR(#NAME, " not implemented for '", toString(_st), "'"); \ } \ }() #define AT_DISPATCH_ALL_TYPES_AND2(SCALARTYPE1, SCALARTYPE2, TYPE, NAME, ...) 
\ [&] { \ - switch (TYPE) { \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Byte, uint8_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Char, int8_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Double, double, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Float, float, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Int, int32_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Long, int64_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Short, int16_t, __VA_ARGS__) \ + const auto& the_type = TYPE; \ + /* don't use TYPE again in case it is an expensive or side-effect op*/ \ + at::ScalarType _st = ::detail::scalar_type(the_type); \ + RECORD_KERNEL_FUNCTION_DTYPE(NAME, _st); \ + switch (_st) { \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Byte, uint8_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Char, int8_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Double, double, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Float, float, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Int, int32_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Long, int64_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Short, int16_t, __VA_ARGS__) \ AT_PRIVATE_CASE_TYPE( \ + NAME, \ SCALARTYPE1, \ decltype(c10::impl::ScalarTypeToCPPType::t), \ __VA_ARGS__) \ AT_PRIVATE_CASE_TYPE( \ + NAME, \ SCALARTYPE2, \ decltype(c10::impl::ScalarTypeToCPPType::t), \ __VA_ARGS__) \ default: \ - AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \ + AT_ERROR(#NAME, " not implemented for '", toString(_st), "'"); \ } \ }() #define AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2( \ SCALARTYPE1, SCALARTYPE2, TYPE, NAME, ...) \ [&] { \ - switch (TYPE) { \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Byte, uint8_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Char, int8_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Double, double, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Float, float, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Int, int32_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Long, int64_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Short, int16_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE( \ - at::ScalarType::ComplexFloat, c10::complex, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE( \ - at::ScalarType::ComplexDouble, c10::complex, __VA_ARGS__) \ + const auto& the_type = TYPE; \ + /* don't use TYPE again in case it is an expensive or side-effect op*/ \ + at::ScalarType _st = ::detail::scalar_type(the_type); \ + RECORD_KERNEL_FUNCTION_DTYPE(NAME, _st); \ + switch (_st) { \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Byte, uint8_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Char, int8_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Double, double, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Float, float, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Int, int32_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Long, int64_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Short, int16_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE( \ + NAME, at::ScalarType::ComplexFloat, c10::complex, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE( \ + NAME, at::ScalarType::ComplexDouble, c10::complex, __VA_ARGS__) \ AT_PRIVATE_CASE_TYPE( \ + NAME, \ SCALARTYPE1, \ decltype(c10::impl::ScalarTypeToCPPType::t), \ __VA_ARGS__) \ AT_PRIVATE_CASE_TYPE( \ + NAME, \ SCALARTYPE2, \ 
decltype(c10::impl::ScalarTypeToCPPType::t), \ __VA_ARGS__) \ default: \ - AT_ERROR(#NAME, " not implemented for '", TYPE, "'"); \ + AT_ERROR(#NAME, " not implemented for '", toString(_st), "'"); \ } \ }() -#define AT_DISPATCH_ALL_TYPES_AND3( \ - SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, TYPE, NAME, ...) \ - [&] { \ - switch (TYPE) { \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Byte, uint8_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Char, int8_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Double, double, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Float, float, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Int, int32_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Long, int64_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Short, int16_t, __VA_ARGS__) \ +#define AT_DISPATCH_ALL_TYPES_AND3( \ + SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, TYPE, NAME, ...) \ + [&] { \ + const auto& the_type = TYPE; \ + /* don't use TYPE again in case it is an expensive or side-effect op*/ \ + at::ScalarType _st = ::detail::scalar_type(the_type); \ + RECORD_KERNEL_FUNCTION_DTYPE(NAME, _st); \ + switch (_st) { \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Byte, uint8_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Char, int8_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Double, double, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Float, float, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Int, int32_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Long, int64_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Short, int16_t, __VA_ARGS__) \ AT_PRIVATE_CASE_TYPE( \ + NAME, \ SCALARTYPE1, \ decltype(c10::impl::ScalarTypeToCPPType::t), \ __VA_ARGS__) \ AT_PRIVATE_CASE_TYPE( \ + NAME, \ SCALARTYPE2, \ decltype(c10::impl::ScalarTypeToCPPType::t), \ __VA_ARGS__) \ AT_PRIVATE_CASE_TYPE( \ + NAME, \ SCALARTYPE3, \ decltype(c10::impl::ScalarTypeToCPPType::t), \ __VA_ARGS__) \ default: \ - AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \ + AT_ERROR(#NAME, " not implemented for '", toString(_st), "'"); \ } \ }() #define AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( \ SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, TYPE, NAME, ...) 
\ [&] { \ - switch (TYPE) { \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Byte, uint8_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Char, int8_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Double, double, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Float, float, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Int, int32_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Long, int64_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Short, int16_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE( \ - at::ScalarType::ComplexFloat, c10::complex, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE( \ - at::ScalarType::ComplexDouble, c10::complex, __VA_ARGS__) \ + const auto& the_type = TYPE; \ + /* don't use TYPE again in case it is an expensive or side-effect op*/ \ + at::ScalarType _st = ::detail::scalar_type(the_type); \ + RECORD_KERNEL_FUNCTION_DTYPE(NAME, _st); \ + switch (_st) { \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Byte, uint8_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Char, int8_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Double, double, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Float, float, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Int, int32_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Long, int64_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Short, int16_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE( \ + NAME, at::ScalarType::ComplexFloat, c10::complex, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE( \ + NAME, at::ScalarType::ComplexDouble, c10::complex, __VA_ARGS__) \ AT_PRIVATE_CASE_TYPE( \ + NAME, \ SCALARTYPE1, \ decltype(c10::impl::ScalarTypeToCPPType::t), \ __VA_ARGS__) \ AT_PRIVATE_CASE_TYPE( \ + NAME, \ SCALARTYPE2, \ decltype(c10::impl::ScalarTypeToCPPType::t), \ __VA_ARGS__) \ AT_PRIVATE_CASE_TYPE( \ + NAME, \ SCALARTYPE3, \ decltype(c10::impl::ScalarTypeToCPPType::t), \ __VA_ARGS__) \ default: \ - AT_ERROR(#NAME, " not implemented for '", TYPE, "'"); \ + AT_ERROR(#NAME, " not implemented for '", toString(_st), "'"); \ } \ }() @@ -562,15 +703,10 @@ inline void deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX() {} const auto& the_index_type = TYPE; \ /* don't use TYPE again in case it is an expensive or side-effect op */ \ at::ScalarType _it = ::detail::scalar_type(the_index_type); \ + RECORD_KERNEL_FUNCTION_DTYPE(NAME, _it) \ switch (_it) { \ - case at::ScalarType::Int: { \ - using index_t = int32_t; \ - return __VA_ARGS__(); \ - } \ - case at::ScalarType::Long: { \ - using index_t = int64_t; \ - return __VA_ARGS__(); \ - } \ + AT_PRIVATE_CASE_TYPE_USING_HINT(NAME, at::ScalarType::Int, int32_t, index_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE_USING_HINT(NAME, at::ScalarType::Long, int64_t, index_t, __VA_ARGS__)\ default: \ AT_ERROR(#NAME, " not implemented for '", toString(_it), "'"); \ } \ @@ -586,15 +722,16 @@ inline void deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX() {} const auto& the_type = TYPE; \ /* don't use TYPE again in case it is an expensive or side-effect op */ \ at::ScalarType _st = ::detail::scalar_type(the_type); \ + RECORD_KERNEL_FUNCTION_DTYPE(NAME, _st); \ switch (_st) { \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Byte, uint8_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Char, int8_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Double, double, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Float, float, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Int, int32_t, __VA_ARGS__) \ - 
AT_PRIVATE_CASE_TYPE(at::ScalarType::Long, int64_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Short, int16_t, __VA_ARGS__) \ - AT_PRIVATE_CASE_TYPE(at::ScalarType::Half, at::Half, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Byte, uint8_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Char, int8_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Double, double, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Float, float, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Int, int32_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Long, int64_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Short, int16_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE(NAME, at::ScalarType::Half, at::Half, __VA_ARGS__) \ default: \ AT_ERROR(#NAME, " not implemented for '", toString(_st), "'"); \ } \ diff --git a/aten/src/ATen/native/cpu/SortingKernel.cpp b/aten/src/ATen/native/cpu/SortingKernel.cpp index 7d13de185509..1d69af7c5622 100644 --- a/aten/src/ATen/native/cpu/SortingKernel.cpp +++ b/aten/src/ATen/native/cpu/SortingKernel.cpp @@ -47,10 +47,10 @@ void _dim_apply( auto values_dim_stride = values.stride(dim); auto indices_dim_stride = indices.stride(dim); auto dim_size = values.size(dim); - + AT_DISPATCH_ALL_TYPES_AND2( ScalarType::Bool, ScalarType::Half, iter.dtype(), - method_name, [&] { + "sorting_kernel_method_name", [&] { auto loop = [&](char** data, const int64_t* strides, int64_t n) { auto* values_data_bytes = data[0]; auto* indices_data_bytes = data[1]; @@ -68,7 +68,7 @@ void _dim_apply( indices_data_bytes += strides[1]; } }; - + iter.for_each(loop); } ); @@ -114,7 +114,7 @@ static void sort_kernel( auto composite_accessor = CompositeRandomAccessorCPU< decltype(values_accessor), decltype(indices_accessor) >(values_accessor, indices_accessor); - + if (descending) { std::sort(composite_accessor, composite_accessor + dim_size, KeyValueCompDesc()); diff --git a/aten/src/ATen/native/cuda/ScatterGatherKernel.cu b/aten/src/ATen/native/cuda/ScatterGatherKernel.cu index 66ac81f5ecbf..ff3b5bb08baa 100644 --- a/aten/src/ATen/native/cuda/ScatterGatherKernel.cu +++ b/aten/src/ATen/native/cuda/ScatterGatherKernel.cu @@ -192,7 +192,7 @@ struct cuda_scatter_gather_base_kernel { AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16, iter.dtype(), - method_name, [&] { + "cuda_scatter_gather_base_kernel_func", [&] { using dtype = typename std::conditional, scalar_t>::type; @@ -264,7 +264,7 @@ struct cuda_scatter_gather_base_kernel { AT_DISPATCH_FLOATING_TYPES_AND2( at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), - method_name, [&] { + "cuda_scatter_gather_base_kernel_reduce_multiply", [&] { using dtype = typename std::conditional, scalar_t>::type; @@ -365,7 +365,7 @@ struct cuda_scatter_fill_base_kernel { AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16, iter.dtype(), - method_name, [&] { + "cuda_scatter_fill_base_kernel_func", [&] { using dtype = typename std::conditional, scalar_t>::type; @@ -417,7 +417,7 @@ struct cuda_scatter_fill_base_kernel { AT_DISPATCH_FLOATING_TYPES_AND2( at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), - method_name, [&] { + "cuda_scatter_fill_base_kernel_reduce_multiply", [&] { using dtype = typename std::conditional, scalar_t>::type; diff --git a/aten/src/ATen/native/cuda/TriangularOps.cu b/aten/src/ATen/native/cuda/TriangularOps.cu index 
6ba73e1c143e..8d497b5c94af 100644 --- a/aten/src/ATen/native/cuda/TriangularOps.cu +++ b/aten/src/ATen/native/cuda/TriangularOps.cu @@ -60,7 +60,7 @@ Tensor& triu_tril_cuda_template(Tensor& result, const Tensor& self, int64_t k, c int64_t N = self.numel(); dim3 dim_block = cuda::getApplyBlock(); dim3 dim_grid((N + dim_block.x - 1) / dim_block.x); - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(at::ScalarType::Half, at::ScalarType::Bool, self.scalar_type(), name, [&]{ + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(at::ScalarType::Half, at::ScalarType::Bool, self.scalar_type(), "triu_tril_cuda_template", [&]{ if (cuda::detail::canUse32BitIndexMath(result) && cuda::detail::canUse32BitIndexMath(self)) { auto result_info = cuda::detail::getTensorInfo(result); auto self_info = cuda::detail::getTensorInfo(self); diff --git a/aten/src/ATen/record_function.h b/aten/src/ATen/record_function.h index 4b07d13aa747..43c2d878840d 100644 --- a/aten/src/ATen/record_function.h +++ b/aten/src/ATen/record_function.h @@ -23,6 +23,8 @@ enum class C10_API_ENUM RecordScope : uint8_t { BACKWARD_FUNCTION, // TorchScript functions, methods TORCHSCRIPT_FUNCTION, + // Kernel Function dtype Tag + KERNEL_FUNCTION_DTYPE, // User defined scope (e.g. with record_function()) USER_SCOPE, NUM_SCOPES, // must be the last in the list From c0a0845019f8951645cd2bd7fe5663d5cf552dfe Mon Sep 17 00:00:00 2001 From: Rohan Varma Date: Fri, 11 Dec 2020 10:25:47 -0800 Subject: [PATCH 177/250] Improve new_group example in the context of SyncBatchNorm (#48897) Summary: Closes https://github.com/pytorch/pytorch/issues/48804 Improves some documentation/example in SyncBN docs to clearly show that each rank must call into all `new_group()` calls for creating process subgroups, even if they are not going to be part of that particular subgroup. We then pick the right group, i.e. the group that the rank is part of, and pass that into the SyncBN APIs. Doc rendering: syncbn_update Pull Request resolved: https://github.com/pytorch/pytorch/pull/48897 Reviewed By: zou3519 Differential Revision: D25493181 Pulled By: rohan-varma fbshipit-source-id: a7e93fc8cc07ec7797e5dbc356f1c3877342cfa3 --- torch/nn/modules/batchnorm.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/torch/nn/modules/batchnorm.py b/torch/nn/modules/batchnorm.py index e76e307d36a6..48e58d637ea6 100644 --- a/torch/nn/modules/batchnorm.py +++ b/torch/nn/modules/batchnorm.py @@ -434,8 +434,14 @@ class SyncBatchNorm(_BatchNorm): >>> # With Learnable Parameters >>> m = nn.SyncBatchNorm(100) >>> # creating process group (optional) - >>> # process_ids is a list of int identifying rank ids. - >>> process_group = torch.distributed.new_group(process_ids) + >>> # ranks is a list of int identifying rank ids. + >>> ranks = list(range(8)) + >>> r1, r2 = ranks[:4], ranks[4:] + >>> # Note: every rank calls into new_group for every + >>> # process group created, even if that rank is not + >>> # part of the group. + >>> process_groups = [torch.distributed.new_group(pids) for pids in [r1, r2]] + >>> process_group = process_groups[0 if dist.get_rank() <= 3 else 1] >>> # Without Learnable Parameters >>> m = nn.BatchNorm3d(100, affine=False, process_group=process_group) >>> input = torch.randn(20, 100, 35, 45, 10) @@ -564,8 +570,14 @@ def convert_sync_batchnorm(cls, module, process_group=None): >>> torch.nn.BatchNorm1d(100), >>> ).cuda() >>> # creating process group (optional) - >>> # process_ids is a list of int identifying rank ids. 
- >>> process_group = torch.distributed.new_group(process_ids) + >>> # ranks is a list of int identifying rank ids. + >>> ranks = list(range(8)) + >>> r1, r2 = ranks[:4], ranks[4:] + >>> # Note: every rank calls into new_group for every + >>> # process group created, even if that rank is not + >>> # part of the group. + >>> process_groups = [torch.distributed.new_group(pids) for pids in [r1, r2]] + >>> process_group = process_groups[0 if dist.get_rank() <= 3 else 1] >>> sync_bn_module = torch.nn.SyncBatchNorm.convert_sync_batchnorm(module, process_group) """ From 42c78ed74525f03f6bb43110784f07c5a6ef1bef Mon Sep 17 00:00:00 2001 From: Tugsbayasgalan Manlaibaatar Date: Fri, 11 Dec 2020 10:58:54 -0800 Subject: [PATCH 178/250] Tuple Slice with both negative and positive stepped size (#48660) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48660 We used to support tuple slicing without any step size before, but this PR extends this feature to support arbitrary step size. We do this by manually reconstructing a new tuple in the IR instead of relying on TupleSlice prim. Test Plan: python tests Imported from OSS Reviewed By: gmagogsfm Differential Revision: D25359336 fbshipit-source-id: 28cde536f28dd8a00607814b2900765e177f0ed7 --- test/test_jit.py | 26 +++++- tools/build_variables.bzl | 1 + torch/csrc/jit/frontend/ir_emitter.cpp | 87 ++++++++++++------- torch/csrc/jit/ir/ir.cpp | 30 ++++--- torch/csrc/jit/ir/ir.h | 6 +- .../csrc/jit/runtime/slice_indices_adjust.cpp | 56 ++++++++++++ torch/csrc/jit/runtime/slice_indices_adjust.h | 28 ++++++ 7 files changed, 186 insertions(+), 48 deletions(-) create mode 100644 torch/csrc/jit/runtime/slice_indices_adjust.cpp create mode 100644 torch/csrc/jit/runtime/slice_indices_adjust.h diff --git a/test/test_jit.py b/test/test_jit.py index 65b9c110f64f..239e4660674b 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -12128,10 +12128,10 @@ def tuple_slice(a): scripted_fn = torch.jit.script(tuple_slice) self.assertEqual(scripted_fn(torch.tensor(1)), (2, 3)) tuple_graph = scripted_fn.graph - slices = tuple_graph.findAllNodes("prim::TupleSlice") + slices = tuple_graph.findAllNodes("prim::TupleConstruct") num_outputs = set(len(x.output().type().elements()) for x in slices) - # one tuple slice should have an output with 2 elements, other 4 - self.assertTrue(num_outputs == {2, 4}) + # there should be only one tupleSlice with length of 2 + self.assertTrue(num_outputs == {2}) self.run_pass('lower_all_tuples', tuple_graph) self.assertTrue('Tuple' not in str(tuple_graph)) @@ -12142,6 +12142,26 @@ def test_indexing_end_out_of_bounds(): self.assertEqual(test_indexing_end_out_of_bounds(), ()) + def test_stepped_tuple_slicing(self): + + def check_slicing_tuple(slicing, tuple_type, tuple): + template = dedent(""" + def func(x): + # type: ({}) -> Any + return x{} + """) + self._check_code(template.format(tuple_type, slicing), "func", [tuple]) + + check_slicing_tuple("[-3:3:2]", "Tuple[int, int, int]", (0, 1, 2)) + check_slicing_tuple("[::55]", "Tuple[int, int, int, int, int]", (0, 1, 2, 3, 4)) + check_slicing_tuple("[:4:4]", "Tuple[int, int, int, int, int]", (0, 1, 2, 3, 4)) + check_slicing_tuple("[::-1]", "Tuple[int, int, int, int, int, int, int]", (0, 1, 2, 3, 4, 5, 6)) + check_slicing_tuple("[7:5:2]", "Tuple[int, int, int, int, int, int, int]", (0, 1, 2, 3, 4, 5, 6)) + check_slicing_tuple("[5:7:-2]", "Tuple[int, int, int, int, int, int, int]", (0, 1, 2, 3, 4, 5, 6)) + check_slicing_tuple("[::-2]", "Tuple[int, int, int, int, int]", (0, 1, 2, 3, 4)) 
+ check_slicing_tuple("[:4:-3]", "Tuple[int, int, int, int, int, int]", (0, 1, 2, 3, 4, 5)) + check_slicing_tuple("[3::-2]", "Tuple[int, int, int, int, int]", (0, 1, 2, 3, 4)) + def test_lower_nested_tuples(self): @torch.jit.script def test(): diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index 9121b7c84537..8b6374e9d71c 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -96,6 +96,7 @@ core_sources_common = [ "torch/csrc/jit/runtime/jit_exception.cpp", "torch/csrc/jit/runtime/operator.cpp", "torch/csrc/jit/runtime/print_handler.cpp", + "torch/csrc/jit/runtime/slice_indices_adjust.cpp", "torch/csrc/jit/runtime/register_ops_utils.cpp", "torch/csrc/jit/runtime/vararg_functions.cpp", "torch/csrc/jit/serialization/unpickler.cpp", diff --git a/torch/csrc/jit/frontend/ir_emitter.cpp b/torch/csrc/jit/frontend/ir_emitter.cpp index a21041343eee..02ead1d6fa80 100644 --- a/torch/csrc/jit/frontend/ir_emitter.cpp +++ b/torch/csrc/jit/frontend/ir_emitter.cpp @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -3443,7 +3444,25 @@ struct to_ir { } else { AT_ASSERT(!sliceable->type()->isSubtypeOf(TensorType::get())); } + // TODO for now let's deal with TupleType first. Ideally all list, tensor, + // string, and tuple slicing should be same (tugsbayasgalan) + if (sliceable->type()->cast()) { + std::vector> tuple_args; + // since we are only dealing with tuple slicing for now, we try to keep + // tuple args seperate for now + tuple_args.reserve(3); + + start ? tuple_args.emplace_back(start) + : tuple_args.emplace_back(c10::nullopt); + end ? tuple_args.emplace_back(end) + : tuple_args.emplace_back(c10::nullopt); + step ? tuple_args.emplace_back(step) + : tuple_args.emplace_back(c10::nullopt); + + return emitTupleSlice(loc, args[0], tuple_args); + } + // TODO this needs to be cleaned for list slicing // Default value for start is 0. 
if (!start) { start = graph->insertConstant(0, loc); @@ -3453,19 +3472,6 @@ struct to_ir { if (end) { args.emplace_back(loc, "end", end); } - if (sliceable->type()->cast()) { - if (step) { - // TODO: add support for slicing tuples with a step - throw ErrorReport(loc) - << "Unsupported operation: slicing tuples with a step isn't supported"; - } - - if (end) { - return emitTupleSlice(loc, args[0], args[1], /*end*/ args[2]); - } else { - return emitTupleSlice(loc, args[0], args[1], c10::nullopt); - } - } if (!step) { step = graph->insertConstant(1, loc); @@ -3828,28 +3834,37 @@ struct to_ir { Value* emitTupleSlice( const SourceRange& loc, const NamedValue& tuple_val, - const NamedValue& beg_val, - const at::optional& end_val) { + const std::vector>& tuple_args) { auto tuple_type = tuple_val.value(*graph)->type()->expect(); - int64_t beg = getAdjTupleIndex( - loc, - tuple_type, - getSliceInd(beg_val.value(*graph), loc), - /*allow_out_of_bounds*/ true); - int64_t end; int64_t tuple_len = tuple_type->elements().size(); + auto beg_val = tuple_args[0]; + auto end_val = tuple_args[1]; + auto step = tuple_args[2]; + + int64_t step_size = 1; + if (step) { + auto val = toIValue(step->value(*graph)); + TORCH_CHECK(val->isInt(), "Step size should always be an integer"); + step_size = val->to(); + } + + int64_t beg = std::numeric_limits::max(); + if (beg_val) { + beg = getAdjTupleIndex( + loc, tuple_type, getSliceInd(beg_val->value(*graph), loc), true); + } + + int64_t end = std::numeric_limits::max(); if (end_val) { end = getAdjTupleIndex( loc, tuple_type, getSliceInd(end_val->value(*graph), loc), true); - } else { - end = tuple_len; } - // slicing does not throw out of bounds errors - end = std::min(std::max((int64_t)0, end), tuple_len); - beg = std::min(std::max((int64_t)0, beg), tuple_len); + + int64_t num_values = slice_indices_adjust(tuple_len, &beg, &end, step_size); return graph - ->insertNode(graph->createTupleSlice(tuple_val.value(*graph), beg, end)) + ->insertNode(graph->createTupleSlice( + tuple_val.value(*graph), beg, step_size, num_values)) ->output(); } @@ -3873,19 +3888,25 @@ struct to_ir { auto s_tuple_val = sv->asTupleValue(val_range, method)->asValue(val_range, method); const SliceExpr& slice = SliceExpr(subscript_exprs[0]); + std::vector> tuple_args; + tuple_args.reserve(3); auto begin = NamedValue(val_range, "begin", emitExpr(Expr(slice.startOr(0)))); + tuple_args.emplace_back(begin); if (slice.end().present()) { auto end = NamedValue(val_range, "end", emitExpr(Expr(slice.end().get()))); - auto tupleSliceValue = - emitTupleSlice(val_range, s_tuple_val, begin, end); - return std::make_shared(tupleSliceValue); + tuple_args.emplace_back(end); + } else { - auto tupleSliceValue = - emitTupleSlice(val_range, s_tuple_val, begin, c10::nullopt); - return std::make_shared(tupleSliceValue); + tuple_args.emplace_back(c10::nullopt); } + // pushing step_size to match the tuple_args + tuple_args.emplace_back(c10::nullopt); + + auto tupleSliceValue = + emitTupleSlice(val_range, s_tuple_val, tuple_args); + return std::make_shared(tupleSliceValue); } else { return std::make_shared(emitBasicSlice( range, sv->asValue(val_range, method), subscript_exprs)); diff --git a/torch/csrc/jit/ir/ir.cpp b/torch/csrc/jit/ir/ir.cpp index 4714a6ae12f6..65b410d82069 100644 --- a/torch/csrc/jit/ir/ir.cpp +++ b/torch/csrc/jit/ir/ir.cpp @@ -1606,17 +1606,25 @@ Node* Graph::createTupleIndex( return n; } -Node* Graph::createTupleSlice(Value* tup, int64_t beg, int64_t end) { - auto n = create(prim::TupleSlice, {tup}); - 
auto tuple_type = tup->type()->expect(); - n->i_(attr::beg, beg); - n->i_(attr::end, end); - std::vector output_types; - for (auto i = beg; i < end; ++i) { - output_types.push_back(tuple_type->elements().at(i)); - } - auto tt = TupleType::create(std::move(output_types)); - n->output()->setType(tt); +Node* Graph::createTupleSlice( + Value* tup, + int64_t beg, + int64_t step_size, + int64_t num_values) { + std::vector new_vals; + TupleTypePtr tt = tup->type()->expect(); + new_vals.reserve(num_values); + + int64_t i = beg; + for (int64_t j = 0; j < num_values; ++j) { + auto idx = insertConstant(IValue(static_cast(i))); + auto tupleIndex = insertNode(createTupleIndex(tup, idx, tt->elements()[i])); + + new_vals.push_back(tupleIndex->output()); + i += step_size; + } + + auto n = createTuple(new_vals); return n; } diff --git a/torch/csrc/jit/ir/ir.h b/torch/csrc/jit/ir/ir.h index b20d5611c55c..7587451d9fd4 100644 --- a/torch/csrc/jit/ir/ir.h +++ b/torch/csrc/jit/ir/ir.h @@ -1122,7 +1122,11 @@ struct Graph { Value* tup, Value* idx, const TypePtr& output_type); - TORCH_API Node* createTupleSlice(Value* tup, int64_t beg, int64_t end); + TORCH_API Node* createTupleSlice( + Value* tup, + int64_t beg, + int64_t step_size, + int64_t num_values); TORCH_API Node* createEnumName(Value* e); TORCH_API Node* createEnumValue(Value* e); TORCH_API Node* createList( diff --git a/torch/csrc/jit/runtime/slice_indices_adjust.cpp b/torch/csrc/jit/runtime/slice_indices_adjust.cpp new file mode 100644 index 000000000000..e71d6ba94c9a --- /dev/null +++ b/torch/csrc/jit/runtime/slice_indices_adjust.cpp @@ -0,0 +1,56 @@ +#include +#include +#include + +namespace torch { +namespace jit { + +int64_t slice_indices_adjust( + int64_t length, + int64_t* start, + int64_t* stop, + int64_t step) { + TORCH_CHECK(step != 0, "List slice should have non-zero step") + TORCH_CHECK(step >= -INT64_MAX, "List slice step is out of bounds") + + // Comes from PySlice_Unpack. + if (*start == INT64_MAX) { + *start = (step < 0) ? INT64_MAX : 0; + } + if (*stop == INT64_MAX) { + *stop = (step < 0) ? INT64_MIN : INT64_MAX; + } + + // Comes from PySlice_AdjustIndices. + if (*start < 0) { + *start += length; + if (*start < 0) { + *start = (step < 0) ? -1 : 0; + } + } else if (*start >= length) { + *start = (step < 0) ? length - 1 : length; + } + + if (*stop < 0) { + *stop += length; + if (*stop < 0) { + *stop = (step < 0) ? -1 : 0; + } + } else if (*stop >= length) { + *stop = (step < 0) ? 
length - 1 : length; + } + + if (step < 0) { + if (*stop < *start) { + return (*start - *stop - 1) / (-step) + 1; + } + } else { + if (*start < *stop) { + return (*stop - *start - 1) / step + 1; + } + } + return 0; +} + +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/runtime/slice_indices_adjust.h b/torch/csrc/jit/runtime/slice_indices_adjust.h new file mode 100644 index 000000000000..ea1e9511769d --- /dev/null +++ b/torch/csrc/jit/runtime/slice_indices_adjust.h @@ -0,0 +1,28 @@ +#pragma once + +#include +#include +#include + +namespace torch { +namespace jit { + +// Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, +// 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020 Python Software +// Foundation; All Rights Reserved +// +// Stolen (with appropriate modifications) by @agolynski +// (https://github.com/pytorch/pytorch/pull/33019) from cpython repo +// Objects/sliceobject.c with comment: this is harder to get right than you +// might think +// +// This adjusts indexes according to python list semantics and returns number +// of elements in the resulting list. +TORCH_API int64_t slice_indices_adjust( + int64_t length, + int64_t* start, + int64_t* stop, + int64_t step); + +} // namespace jit +} // namespace torch From f965b0fcfbcfac0a4cd699c8336ad271be86811e Mon Sep 17 00:00:00 2001 From: Shijun Kong Date: Fri, 11 Dec 2020 11:15:53 -0800 Subject: [PATCH 179/250] Expose run_async function on torch::jit::Method (#48607) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48607 This change builds on top of https://github.com/pytorch/pytorch/pull/46865 further exposing the async interface to `torch::jit::Method`. added unit test for new `run_async` Test Plan: `buck test caffe2/test/cpp/jit/...` Reviewed By: dzhulgakov Differential Revision: D25219726 fbshipit-source-id: 89743c82a0baa1affe0254c1e2dbf873de8e5c76 --- test/cpp/jit/test_module_api.cpp | 38 ++++++++++++++++++++++++++++++++ torch/csrc/jit/api/method.h | 9 ++++++++ torch/csrc/jit/api/module.cpp | 11 +++++++++ 3 files changed, 58 insertions(+) diff --git a/test/cpp/jit/test_module_api.cpp b/test/cpp/jit/test_module_api.cpp index 910331166d51..c77d89af5afa 100644 --- a/test/cpp/jit/test_module_api.cpp +++ b/test/cpp/jit/test_module_api.cpp @@ -43,6 +43,44 @@ static void import_libs( si.loadType(QualifiedName(class_name)); } +TEST(ModuleAPITest, MethodRunAsync) { + // Module m("m"); + // m.define(R"( + // def forward(self): + // r1 = torch.jit.fork(torch.mm, torch.rand(100,100),torch.rand(100,100)) + // r2 = torch.jit.fork(torch.mm, torch.rand(100,100),torch.rand(100,100)) + // return r1.wait() + r2.wait() + // )"); + std::string filePath(__FILE__); + auto testModelFile = filePath.substr(0, filePath.find_last_of("/\\") + 1); + // borrow model file from TEST(GraphExecutorTest, runAsync_executor) + testModelFile.append("test_interpreter_async.pt"); + auto m = load(testModelFile); + + auto counter = 0; + std::mutex mtx; + + auto launcher = [&](std::function f) { + mtx.lock(); + ++counter; + mtx.unlock(); + at::launch(move(f)); + }; + + auto method = m.get_method("forward"); + + std::vector stack; + auto kwargs = std::unordered_map(); + auto future = method.run_async(stack, kwargs, launcher); + + future->wait(); + + // expect 2 forks and 2 wait callbacks being excuted on provided taskLauncher + // but ivalue::Future would be marked completed and release wait before + // finishing all callbacks + ASSERT_GE(counter, 2); +} + TEST(ModuleAPITest, Clone) { auto cu = 
std::make_shared(); // creating child module diff --git a/torch/csrc/jit/api/method.h b/torch/csrc/jit/api/method.h index 1d0ea9bce2c8..96b632b6b111 100644 --- a/torch/csrc/jit/api/method.h +++ b/torch/csrc/jit/api/method.h @@ -31,6 +31,15 @@ struct TORCH_API Method { std::vector stack, const Kwargs& kwargs = Kwargs()); + // Run method async. Invocation on this function would invokes a JIT + // interpreter that executes ops inline, one by one, on caller's thread. A + // model can utilize async op, i.e. `fork`, to launch an asynchronous task + // which will be launched on provided `taskLauncher`. + c10::intrusive_ptr run_async( + std::vector stack, + const Kwargs& kwargs = Kwargs(), + TaskLauncher taskLauncher = at::launch); + std::shared_ptr graph() const { return function_->graph(); } diff --git a/torch/csrc/jit/api/module.cpp b/torch/csrc/jit/api/module.cpp index 04eafc3d0f5d..d74905b5d3f0 100644 --- a/torch/csrc/jit/api/module.cpp +++ b/torch/csrc/jit/api/module.cpp @@ -118,6 +118,17 @@ IValue Method::operator()(std::vector stack, const Kwargs& kwargs) { return (*function_)(std::move(stack), kwargs); } +c10::intrusive_ptr Method::run_async( + std::vector stack, + const Kwargs& kwargs, + TaskLauncher taskLauncher) { + stack.insert(stack.begin(), owner()._ivalue()); + RECORD_TORCHSCRIPT_FUNCTION(name(), stack); + + function_->getSchema().checkAndNormalizeInputs(stack, kwargs); + return function_->runAsync(stack, std::move(taskLauncher)); +} + void Module::clone_method( const Module& orig, const Function& method, From 796b267763dee6e3451dacbf1c22c77b98ef91d9 Mon Sep 17 00:00:00 2001 From: Alexander Golynski Date: Fri, 11 Dec 2020 12:03:52 -0800 Subject: [PATCH 180/250] fix backwards compatibility for #48711 and its revert (#49240) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49240 Test Plan: Imported from OSS Reviewed By: heitorschueroff Differential Revision: D25500727 Pulled By: agolynski fbshipit-source-id: 6a690f52fe671267862b159b6330d37ef08ee291 --- test/backward_compatibility/check_backward_compatibility.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/backward_compatibility/check_backward_compatibility.py b/test/backward_compatibility/check_backward_compatibility.py index ccb4a6457537..e155537d7b99 100644 --- a/test/backward_compatibility/check_backward_compatibility.py +++ b/test/backward_compatibility/check_backward_compatibility.py @@ -187,6 +187,8 @@ ("aten::ifft", datetime.date(2021, 1, 31)), ("aten::irfft", datetime.date(2021, 1, 31)), ("aten::rfft", datetime.date(2021, 1, 31)), + ("aten::quantile", datetime.date(2021, 1, 31)), + ("aten::nanquantile", datetime.date(2021, 1, 31)), ] def allow_listed(schema, allow_list): From 2a3bb1cea0f2b070e66f032d26082a7a38e0e217 Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Fri, 11 Dec 2020 12:11:13 -0800 Subject: [PATCH 181/250] [quant][graphmode][fx][fix] Fix typo in fusion (#49183) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49183 Test Plan: Imported from OSS Reviewed By: hx89 Differential Revision: D25473367 fbshipit-source-id: 0cd5e6769eeea0923dd104ea90b0192e3475b3ad --- torch/quantization/fx/fuse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/quantization/fx/fuse.py b/torch/quantization/fx/fuse.py index b5cf78b05f33..5aabbd66c4b1 100644 --- a/torch/quantization/fx/fuse.py +++ b/torch/quantization/fx/fuse.py @@ -35,7 +35,7 @@ def fuse(self, model: GraphModule, self.modules = dict(input_root.named_modules()) additional_fusion_patterns = \ - 
fuse_custom_config_dict.get("additional_quant_pattern", {}) + fuse_custom_config_dict.get("additional_fusion_pattern", {}) fusion_patterns = get_combined_dict( get_default_fusion_patterns(), additional_fusion_patterns) # find fusion From 1cb5aa6c6039038095f3a505d7f3bdb5a2d6a1d4 Mon Sep 17 00:00:00 2001 From: Sebastian Messmer Date: Fri, 11 Dec 2020 12:36:15 -0800 Subject: [PATCH 182/250] Fix structured kernel codegen (#49244) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49244 see https://fb.quip.com/ceEdANd5iVsO RegisterMkldnnCPU kernels incorrectly used makeUnboxedOnly() calls to register add_.Tensor kernels. This is because the codegen incorrectly thought they're not c10-full. This PR fixes that. ghstack-source-id: 118411117 Test Plan: After this PR, RegisterMkldnnCPU doesn't contain the makeUnboxedOnly() calls anymore. Reviewed By: ezyang Differential Revision: D25500246 fbshipit-source-id: 8a8c2be9c4f4a5ce7eaae94257c2f8cbd176e92e --- tools/codegen/gen.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/codegen/gen.py b/tools/codegen/gen.py index af3ebbf674f4..8c22c1fe702c 100644 --- a/tools/codegen/gen.py +++ b/tools/codegen/gen.py @@ -471,6 +471,7 @@ def gen_one(f: NativeFunction) -> Optional[str]: return list(mapMaybe(gen_one, g.functions())) + @method_with_native_function def gen_unstructured(self, f: NativeFunction) -> Optional[str]: # for mypy type refinement; would be fixed by TODO on target assert self.target is not Target.DECLARATION From db5e5b439c454d657cfa8f08a096cf68e203f2a8 Mon Sep 17 00:00:00 2001 From: Ilia Cherniavskii Date: Fri, 11 Dec 2020 12:51:43 -0800 Subject: [PATCH 183/250] Extra sampling of record function events [resend] (#49114) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49114 resend of https://github.com/pytorch/pytorch/pull/48289 Test Plan: see 48289 Reviewed By: robieta Differential Revision: D25443365 Pulled By: ilia-cher fbshipit-source-id: c15ac312222bb4d744e10199ed79801cccae8227 --- aten/src/ATen/ThreadLocalState.cpp | 1 + aten/src/ATen/ThreadLocalState.h | 24 ++++- aten/src/ATen/core/dispatch/Dispatcher.h | 81 +++++++++------- aten/src/ATen/record_function.cpp | 117 ++++++++++++++++++----- aten/src/ATen/record_function.h | 27 +++++- binaries/record_function_benchmark.cc | 101 +++++++++---------- torch/csrc/autograd/function.h | 39 ++++---- torch/csrc/jit/runtime/interpreter.cpp | 5 +- 8 files changed, 268 insertions(+), 127 deletions(-) diff --git a/aten/src/ATen/ThreadLocalState.cpp b/aten/src/ATen/ThreadLocalState.cpp index 6d74e2f47ce0..3c7b9b6ff5bc 100644 --- a/aten/src/ATen/ThreadLocalState.cpp +++ b/aten/src/ATen/ThreadLocalState.cpp @@ -19,6 +19,7 @@ ThreadLocalState::ThreadLocalState(bool keep_grad_mode) grad_mode_enabled_ = GradMode::is_enabled(); } #endif + bumped_record_all_functions_ = at::checkRecordAllFunctions(); } /* static */ diff --git a/aten/src/ATen/ThreadLocalState.h b/aten/src/ATen/ThreadLocalState.h index f0cb85f0ff84..3c9b55b3d8d6 100644 --- a/aten/src/ATen/ThreadLocalState.h +++ b/aten/src/ATen/ThreadLocalState.h @@ -38,6 +38,9 @@ class TORCH_API ThreadLocalState { bool grad_mode_enabled_; #endif + // Whether pre-sampling RecordFunction optimization was enabled + bool bumped_record_all_functions_ = false; + friend class ThreadLocalStateGuard; }; @@ -45,7 +48,21 @@ class TORCH_API ThreadLocalState { class TORCH_API ThreadLocalStateGuard { public: explicit ThreadLocalStateGuard(const ThreadLocalState& state) - : prev_state_(ThreadLocalState()) { + : 
prev_state_(ThreadLocalState()), + bumped_record_all_functions_(state.bumped_record_all_functions_) { + // Special handling of RecordFunction pre-sampling optimization: + // pre-samping is enabled (bumped) when there're non-sampled + // (or high-frequency) global or TLS callbacks. + // + // ThreadLocalStateGuard simply resets RecordFunction's TLS and + // hence its thread local callbacks. + // + // Checking if the pre-sampling was enabled and preserving it in the + // async task by calling bumpRecordAllFunctions() and the corresponding + // releaseRecordAllFunctions() + if (bumped_record_all_functions_) { + at::bumpRecordAllFunctions(); + } // set the given state across the thread boundary ThreadLocalState::setThreadLocalState(state); } @@ -53,10 +70,15 @@ class TORCH_API ThreadLocalStateGuard { ~ThreadLocalStateGuard() { // restore previously set variables ThreadLocalState::setThreadLocalState(prev_state_); + if (bumped_record_all_functions_) { + at::releaseRecordAllFunctions(); + } } private: const ThreadLocalState prev_state_; + // Whether pre-sampling RecordFunction optimization was enabled + bool bumped_record_all_functions_ = false; }; template diff --git a/aten/src/ATen/core/dispatch/Dispatcher.h b/aten/src/ATen/core/dispatch/Dispatcher.h index 632739053c42..f83302e2d819 100644 --- a/aten/src/ATen/core/dispatch/Dispatcher.h +++ b/aten/src/ATen/core/dispatch/Dispatcher.h @@ -371,28 +371,39 @@ inline Return Dispatcher::callWithDispatchKey(const TypedOperatorHandleop.lookup(dispatchKey); #ifndef PYTORCH_DISABLE_PER_OP_PROFILING - // Check if we need to run callbacks registered with RecordFunction - // If true and callbacks need inputs, we box the arguments and pass - // them into the callbacks and also into the kernel call - - // Note: for perf reasons we wouldn't want to pass arguments into - // the function call or prematurely box them - at::RecordFunction guard(at::RecordScope::FUNCTION); - if (C10_UNLIKELY(guard.isActive())) { - if (shouldRecord(dispatchKey) && op.operatorIterator_->op.isObserved()) { - int64_t seq_num = -1; - // Setting sequence number in the Autograd case to associate - // the forward range with the coresponding Autograd's node - if (isIncludedInAlias(dispatchKey, DispatchKey::Autograd) && at::GradMode::is_enabled()) { - seq_num = at::sequence_number::peek(); - } - if (guard.needsInputs()) { - torch::jit::Stack stack = impl::boxArgs(args...); - guard.before(op, stack, seq_num); - } else { - guard.before(op, seq_num); + // By default, when there're no high-frequency or non-sampled callbacks, + // RecordFunction is pre-sampled as a perf optimization; + // shouldRunRecordFunction checks whether RecordFunction should be executed, + // and sets pre_sampled boolean argument value to whether pre-sampling was used - + // this boolean is passed into RecordFunction to adjust the sampling rates of + // the callbacks + bool pre_sampled = false; + if (C10_UNLIKELY(at::shouldRunRecordFunction(&pre_sampled))) { + // Check if we need to run callbacks registered with RecordFunction + // If true and callbacks need inputs, we box the arguments and pass + // them into the callbacks and also into the kernel call + + // Note: for perf reasons we wouldn't want to pass arguments into + // the function call or prematurely box them + at::RecordFunction guard(at::RecordScope::FUNCTION, pre_sampled); + if (C10_UNLIKELY(guard.isActive())) { + if (shouldRecord(dispatchKey) && op.operatorIterator_->op.isObserved()) { + int64_t seq_num = -1; + // Setting sequence number in the Autograd case to 
associate + // the forward range with the coresponding Autograd's node + if (isIncludedInAlias(dispatchKey, DispatchKey::Autograd) && at::GradMode::is_enabled()) { + seq_num = at::sequence_number::peek(); + } + if (guard.needsInputs()) { + torch::jit::Stack stack = impl::boxArgs(args...); + guard.before(op, stack, seq_num); + } else { + guard.before(op, seq_num); + } } } + // keeping the guard alive while executing the kernel + return kernel.template call(op, std::forward(args)...); } #endif // PYTORCH_DISABLE_PER_OP_PROFILING return kernel.template call(op, std::forward(args)...); @@ -429,20 +440,26 @@ inline void Dispatcher::callBoxed(const OperatorHandle& op, Stack* stack) const const auto& kernel = entry.lookup(dispatchKey); #ifndef PYTORCH_DISABLE_PER_OP_PROFILING - // using already existing stack to record function execution in observers - at::RecordFunction guard(at::RecordScope::FUNCTION); - if (C10_UNLIKELY(guard.isActive())) { - if (shouldRecord(dispatchKey) && entry.isObserved()) { - int64_t seq_num = -1; - if (isIncludedInAlias(dispatchKey, DispatchKey::Autograd) && at::GradMode::is_enabled()) { - seq_num = at::sequence_number::peek(); - } - if (guard.needsInputs()) { - guard.before(op, *stack, seq_num); - } else { - guard.before(op, seq_num); + bool pre_sampled = false; + if (C10_UNLIKELY(at::shouldRunRecordFunction(&pre_sampled))) { + // using already existing stack to record function execution in observers + at::RecordFunction guard(at::RecordScope::FUNCTION, pre_sampled); + if (C10_UNLIKELY(guard.isActive())) { + if (shouldRecord(dispatchKey) && entry.isObserved()) { + int64_t seq_num = -1; + if (isIncludedInAlias(dispatchKey, DispatchKey::Autograd) && at::GradMode::is_enabled()) { + seq_num = at::sequence_number::peek(); + } + if (guard.needsInputs()) { + guard.before(op, *stack, seq_num); + } else { + guard.before(op, seq_num); + } } } + // keeping the guard alive while executing the kernel + kernel.callBoxed(op, stack); + return; } #endif // PYTORCH_DISABLE_PER_OP_PROFILING kernel.callBoxed(op, stack); diff --git a/aten/src/ATen/record_function.cpp b/aten/src/ATen/record_function.cpp index 102931fd4aa7..d1b0acb87c28 100644 --- a/aten/src/ATen/record_function.cpp +++ b/aten/src/ATen/record_function.cpp @@ -30,8 +30,6 @@ std::atomic defaultNodeId(-1); std::atomic next_thread_id_ {0}; thread_local uint64_t current_thread_id_ = 0; -thread_local bool tls_record_function_enabled_ = true; - // Low probability constant static const double kLowProb = 0.001; struct CoinflipTLS { @@ -68,6 +66,10 @@ void set_record_function_tls_(const RecordFunctionTLS& tls) { class CallbackManager { public: CallbackHandle addThreadLocalCallback(RecordFunctionCallback cb) { + if (cb.samplingProb() > kLowProb) { + // pre-sampling of RecordFunction with prob. kLowProb cannot be used + at::bumpRecordAllFunctions(); + } // note: monotonically increasing callbacks_unique_id keeps // sorted_tls_callbacks_ sorted auto handle = next_unique_callback_handle(); @@ -76,6 +78,10 @@ class CallbackManager { } CallbackHandle addGlobalCallback(RecordFunctionCallback cb) { + if (cb.samplingProb() > kLowProb) { + // pre-sampling of RecordFunction with prob. 
kLowProb cannot be used + at::bumpRecordAllFunctions(); + } auto handle = next_unique_callback_handle(); sorted_global_callbacks_.emplace_back(std::move(cb), handle); return handle; @@ -92,6 +98,10 @@ class CallbackManager { return el.second == handle; }); if (it != cbs.end()) { + if (it->first.samplingProb() > kLowProb) { + // try to restore pre-sampling of RecordFunction + at::releaseRecordAllFunctions(); + } // keeps it sorted cbs.erase(it); return true; @@ -127,7 +137,13 @@ class CallbackManager { // callbackShouldRun is even hotter because it's called multiple // times per init(). Profiling shows that the function prologue is // taking up a significant fraction of the time. - static bool C10_ALWAYS_INLINE callbackShouldRun(const RecordFunctionCallback& cb, RecordScope scope) { + static bool C10_ALWAYS_INLINE callbackShouldRun( + const RecordFunctionCallback& cb, RecordScope scope, bool pre_sampled) { + TORCH_INTERNAL_ASSERT( + !pre_sampled || (cb.sampling_prob_ <= kLowProb), + "Incorrect usage of a pre-sampled RecordFunction with a high-frequency " + " or non-sampled callback"); + // first check whether this callback is interested in // the given scope type if (!cb.checkScope(scope)) { @@ -138,36 +154,45 @@ class CallbackManager { return cb.should_run_(cb); } - if (cb.sampling_prob_ == 1.0) { - return true; + // otherwise potentially do the sampling + double sampling_prob = cb.sampling_prob_; + if (pre_sampled) { + // adjust the sampling rate to account for kLowProb pre-sampling of + // the RecordFunction + sampling_prob /= kLowProb; } - // model the low probability events as events happening - // with probability kLowProb followed by another sampling with - // probability (sampling_prob__ / kLowProb), then replace the coin - // flip for kLowProb with a thread local number of tries tries_left_ - // sampled from the geometric distribution. - if (cb.sampling_prob_ < kLowProb) { - if (coinflip_tls_.tries_left_ == 0) { - coinflip_tls_.tries_left_ = sample_geometric(); - return (sample_zero_one() < cb.sampling_prob_ / kLowProb); + + if (sampling_prob < 1.0) { + // model the low probability events as events happening + // with probability kLowProb followed by another sampling with + // probability (sampling_prob / kLowProb), then replace the coin + // flip for kLowProb with a thread local number of tries tries_left_ + // sampled from the geometric distribution. 
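+        // (E.g. with kLowProb = 0.001 and sampling_prob = 1e-5, the callback
+        //  still fires with overall probability 0.001 * (1e-5 / 0.001) = 1e-5,
+        //  while the uniform draw itself only happens on roughly 0.1% of calls.)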
+ if (sampling_prob < kLowProb) { + if (coinflip_tls_.tries_left_ == 0) { + coinflip_tls_.tries_left_ = sample_geometric(); + return (sample_zero_one() < sampling_prob / kLowProb); + } else { + --coinflip_tls_.tries_left_; + return false; + } } else { - --coinflip_tls_.tries_left_; - return false; + return (sample_zero_one() < sampling_prob); } - } else { - return (sample_zero_one() < cb.sampling_prob_); } + + return true; } // init is called by RecordFunction in constructor to // determine which thread local and global callbacks are going // to be executed and whether any of them need inputs - inline void init(RecordFunction& rec_fn, RecordScope scope) { + inline void init(RecordFunction& rec_fn, RecordScope scope, bool pre_sampled) { bool found_needs_inputs = false; bool found_needs_ids = false; for (const auto& cb: rf_tls_.sorted_tls_callbacks_) { - if (callbackShouldRun(cb.first, scope)) { + if (callbackShouldRun(cb.first, scope, pre_sampled)) { if (cb.first.needsInputs()) { found_needs_inputs = true; } @@ -182,7 +207,7 @@ class CallbackManager { } for (const auto& cb: sorted_global_callbacks_) { - if (callbackShouldRun(cb.first, scope)) { + if (callbackShouldRun(cb.first, scope, pre_sampled)) { if (cb.first.needsInputs()) { found_needs_inputs = true; } @@ -308,7 +333,6 @@ namespace { } } // namespace - RecordFunctionCallbacks _getTLSCallbacks() { return rf_tls_.sorted_tls_callbacks_; } @@ -374,12 +398,12 @@ void enableRecordFunction(bool enable) { rf_tls_.tls_record_function_enabled_ = enable; } -RecordFunction::RecordFunction(RecordScope scope) { +RecordFunction::RecordFunction(RecordScope scope, bool pre_sampled) { auto* rf_tls_ptr = &rf_tls_; if (rf_tls_ptr->tls_record_function_enabled_) { auto& m = manager(); if (!m.sorted_global_callbacks_.empty() || !rf_tls_ptr->sorted_tls_callbacks_.empty()) { - m.init(*this, scope); + m.init(*this, scope, pre_sampled); } } } @@ -451,4 +475,49 @@ void RecordFunction::end() { } } +// RecordFunction pre-sampling +namespace { +// Whether to try to create RecordFunction on each call (>0) or +// use pre-sampling (=0) +std::atomic global_record_all_functions_ {0}; +} + +void bumpRecordAllFunctions() { + global_record_all_functions_.fetch_add(1, std::memory_order_relaxed); +} + +void releaseRecordAllFunctions() { + TORCH_CHECK(global_record_all_functions_.fetch_sub(1, std::memory_order_relaxed) >= 0); +} + +bool checkRecordAllFunctions() { + return (global_record_all_functions_.load(std::memory_order_relaxed) > 0); +} + +bool shouldRunRecordFunction(bool* pre_sampled) { + auto* rf_tls_ptr = &rf_tls_; + if (rf_tls_ptr->sorted_tls_callbacks_.empty() && !manager().hasGlobalCallbacks()) { + *pre_sampled = false; + return false; + } + if (global_record_all_functions_.load(std::memory_order_relaxed) > 0) { + *pre_sampled = false; + return true; + } + if (!rf_tls_ptr->tls_record_function_enabled_) { + *pre_sampled = false; + return false; + } + + *pre_sampled = true; + auto* coinflip_tls_ptr = &coinflip_tls_; + if (coinflip_tls_ptr->tries_left_ == 0) { + coinflip_tls_ptr->tries_left_ = sample_geometric(); + return true; + } else { + --coinflip_tls_ptr->tries_left_; + return false; + } +} + } // namespace at diff --git a/aten/src/ATen/record_function.h b/aten/src/ATen/record_function.h index 43c2d878840d..bcd0fbc37e77 100644 --- a/aten/src/ATen/record_function.h +++ b/aten/src/ATen/record_function.h @@ -92,8 +92,11 @@ typedef uint64_t RecordFunctionHandle; struct TORCH_API RecordFunction { // Default constructor is used with before function called afterwards: 
// scope - record scope that this function tracks + // pre_sampled - whether this RecordFunction was already pre-sampled with + // kLowProb probability RecordFunction( - RecordScope scope = RecordScope::FUNCTION); + RecordScope scope = RecordScope::FUNCTION, + bool pre_sampled = false); template void before( @@ -240,6 +243,9 @@ struct TORCH_API RecordFunction { // flag is used to check whether the start callbacks were called bool called_start_callbacks_ = false; + // Whether the RecordFunction is pre-sampled + bool pre_sampled_ = false; + // Used internally to keep track of thread local and global callbacks // that were picked to run; must be sorted; CallbackHandles sorted_active_tls_handles_; @@ -332,7 +338,7 @@ class TORCH_API RecordFunctionCallback { } RecordFunctionCallback& samplingProb(double sampling_prob) { - TORCH_CHECK(sampling_prob >= 0.0 && sampling_prob_ <= 1.0, + TORCH_CHECK(sampling_prob >= 0.0 && sampling_prob <= 1.0, "Invalid sampling probability"); sampling_prob_ = sampling_prob; return *this; @@ -546,10 +552,27 @@ struct TORCH_API RecordFunctionTLS { RecordFunctionCallbacks sorted_tls_callbacks_; bool tls_record_function_enabled_ = true; + + // Stores the number of coin flips before the next successful coin flip + int tries_left_ = 0; }; TORCH_API const RecordFunctionTLS& get_record_function_tls_(); TORCH_API void set_record_function_tls_(const RecordFunctionTLS& tls); +// Checks whether RecordFunction should be called, +// sets boolean pointed by the argument to whether pre-sampling was used +TORCH_API bool shouldRunRecordFunction(bool*); + +// The following functions are used to disable/enable pre-sampling of RecordFunction +// when high-frequency/non-sampled callbacks are added/removed. +// Note: every call to bumpRecordAllFunctions() is supposed to be matched with +// the corresponding releaseRecordAllFunctions() call. +// Note: disabling pre-sampling of RecordFunction incurs an extra overhead, since +// RecordFunction will be created for each operator call. 
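+// A rough usage sketch (this is what the callback registration code in
+// record_function.cpp does for callbacks with samplingProb() > kLowProb):
+//   at::bumpRecordAllFunctions();     // callback added: disable pre-sampling
+//   ...                               // callback is active
+//   at::releaseRecordAllFunctions();  // callback removed: restore pre-sampling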
+TORCH_API void bumpRecordAllFunctions(); +TORCH_API void releaseRecordAllFunctions(); +TORCH_API bool checkRecordAllFunctions(); + } // namespace at diff --git a/binaries/record_function_benchmark.cc b/binaries/record_function_benchmark.cc index d924003b9270..53a8bd16f43d 100644 --- a/binaries/record_function_benchmark.cc +++ b/binaries/record_function_benchmark.cc @@ -7,61 +7,55 @@ #include #include -C10_DEFINE_int(iter, 100, "Number of iterations"); -C10_DEFINE_int(warmup_iter, 10, "Number of warmup iterations"); +C10_DEFINE_int(iter, 10000, "Number of iterations"); C10_DEFINE_int(sampled_iter, 10e6, "Number of iterations for the sampled observer benchmark"); namespace { -const int kInnerIter = 100; -const int kNumSampledCb = 2; const int kTensorSize = 16; const int kSmallTensorSize = 1; -const float kSampingProb = 0.1; - const float kLowSamplingProb = 0.0001; } -void setupBenchmarkCallbacks() { - at::enableRecordFunction(); - at::clearCallbacks(); - // non-sampled callback - at::addGlobalCallback(at::RecordFunctionCallback( - [&](const at::RecordFunction& fn) {}, +void addTestCallback( + double sampling_prob = 1.0, + std::function fn = + [](const at::RecordFunction&) {}) { + auto cb = at::RecordFunctionCallback( + std::move(fn), [](const at::RecordFunction&) {}) - .needsInputs(true)); - - // sampled - for (auto idx = 0; idx < kNumSampledCb; ++idx) { - at::addGlobalCallback(at::RecordFunctionCallback( - [](const at::RecordFunction& fn) {}, - [](const at::RecordFunction&) {}) - .needsInputs(true) - .samplingProb(kSampingProb) - ); + .needsInputs(false); + if (sampling_prob < 1.0) { + cb.samplingProb(sampling_prob); } + at::addGlobalCallback(cb); } -float runTensorBench(int tensor_size, int outer_iter) { +float runTensorGEMMBench(int tensor_size, int iter) { typedef std::chrono::high_resolution_clock clock; typedef std::chrono::microseconds us; std::chrono::time_point start_time = clock::now(); - for (auto idx = 0; idx < kInnerIter * outer_iter; ++idx) { - torch::mm( - torch::randn({tensor_size, tensor_size}), - torch::randn({tensor_size, tensor_size})); + auto inp = torch::randn({tensor_size, tensor_size}); + for (auto idx = 0; idx < iter; ++idx) { + torch::mm(inp, inp); } auto duration = static_cast( std::chrono::duration_cast(clock::now() - start_time).count()); return duration; } -float runPureRecordFunctionBench(int outer_iter) { +float runPureRecordFunctionBench(int iter) { typedef std::chrono::high_resolution_clock clock; typedef std::chrono::microseconds us; std::chrono::time_point start_time = clock::now(); - for (auto n = 0; n < outer_iter; ++n) { - RECORD_USER_SCOPE("test"); + for (auto idx = 0; idx < iter; ++idx) { + bool pre_sampled = false; + if (at::shouldRunRecordFunction(&pre_sampled)) { + at::RecordFunction guard(at::RecordScope::USER_SCOPE, pre_sampled); + if (C10_UNLIKELY(guard.isActive())) { + guard.before("Test", -1); + } + } } auto duration = static_cast( std::chrono::duration_cast(clock::now() - start_time).count()); @@ -71,18 +65,19 @@ float runPureRecordFunctionBench(int outer_iter) { void runBenchmark() { float duration = 0; for (auto tensor_size : std::set({kSmallTensorSize, kTensorSize})) { - duration = runTensorBench(tensor_size, FLAGS_iter); - std::cout << "Running tensor benchmark, time per iteration (" + duration = runTensorGEMMBench(tensor_size, FLAGS_iter); + std::cout << "Tensor GEMM benchmark (" << tensor_size << "x" << tensor_size - << "): " << (duration/FLAGS_iter) + << ", " << FLAGS_iter << "): " << duration << " us." 
<< std::endl; } - duration = runPureRecordFunctionBench(FLAGS_iter * 100); - std::cout << "Running pure RecordFunction benchmark, time per iteration: " - << (duration/FLAGS_iter) - << " us." << std::endl; + duration = runPureRecordFunctionBench(FLAGS_iter); + std::cout << "Pure RecordFunction benchmark (" + << FLAGS_iter << "): " + << duration + << " us." << std::endl; } int main(int argc, char** argv) { @@ -91,32 +86,38 @@ int main(int argc, char** argv) { return -1; } - auto duration = runTensorBench(kSmallTensorSize, FLAGS_warmup_iter); - std::cout << "Warmup time: " << duration << " us." << std::endl; + at::enableRecordFunction(); + at::clearCallbacks(); - setupBenchmarkCallbacks(); - std::cout << "Running with empty observers" << std::endl; + std::cout << "Warm up" << std::endl; runBenchmark(); - at::clearCallbacks(); std::cout << "Running without observers" << std::endl; runBenchmark(); - std::cout << "Running sampled observer benchmark" << std::endl; + addTestCallback(); + std::cout << "Running with empty non-sampled observer" << std::endl; + runBenchmark(); + at::clearCallbacks(); + + addTestCallback(kLowSamplingProb); + std::cout << "Running with empty sampled observer" << std::endl; + runBenchmark(); + at::clearCallbacks(); + + std::cout << "Checking number of sampled observer invocations" << std::endl; int cb_count = 0; - at::addGlobalCallback(at::RecordFunctionCallback( + addTestCallback( + kLowSamplingProb, [&](const at::RecordFunction& fn) { ++cb_count; - }, - [](const at::RecordFunction&) {}) - .needsInputs(true) - .samplingProb(kLowSamplingProb) + } ); - runPureRecordFunctionBench(FLAGS_sampled_iter); + auto duration = runPureRecordFunctionBench(FLAGS_sampled_iter); std::cout << "Pure RecordFunction runtime of " << FLAGS_sampled_iter - << " iterations " << duration + << " iterations: " << duration << " us, number of callback invocations: " << cb_count << ", expected number: ~" << (int)(FLAGS_sampled_iter * kLowSamplingProb) << " invocations" << std::endl; diff --git a/torch/csrc/autograd/function.h b/torch/csrc/autograd/function.h index 09dc048f214b..44171e1a3b1b 100644 --- a/torch/csrc/autograd/function.h +++ b/torch/csrc/autograd/function.h @@ -133,26 +133,33 @@ struct TORCH_API Node : std::enable_shared_from_this { /// Evaluates the function on the given inputs and returns the result of the /// function call. variable_list operator()(variable_list&& inputs) { - // Using RecordFunction to trogger observers in the backward pass - at::RecordFunction guard(at::RecordScope::BACKWARD_FUNCTION); - if (guard.isActive()) { - // Using sequence number and thread id to correlate with - // the forward pass function - guard.setForwardThreadId(thread_id_); - if (guard.needsInputs()) { - guard.before( - name(), - std::vector(inputs.begin(), inputs.end()), - sequence_nr()); - } else { - guard.before(name(), sequence_nr()); - } - } // In the first iteration of named tensors, autograd ignores names and // operates on unnamed tensors. In the long term, autograd should // probably operate with names. 
at::NoNamesGuard no_names_guard; - return apply(std::move(inputs)); + + bool pre_sampled = false; + if (at::shouldRunRecordFunction(&pre_sampled)) { + // Using RecordFunction to trogger observers in the backward pass + at::RecordFunction guard(at::RecordScope::BACKWARD_FUNCTION, pre_sampled); + if (guard.isActive()) { + // Using sequence number and thread id to correlate with + // the forward pass function + guard.setForwardThreadId(thread_id_); + if (guard.needsInputs()) { + guard.before( + name(), + std::vector(inputs.begin(), inputs.end()), + sequence_nr()); + } else { + guard.before(name(), sequence_nr()); + } + } + // keeping stack guard object alive during the call + return apply(std::move(inputs)); + } else { + return apply(std::move(inputs)); + } } // Graph Connectivity API diff --git a/torch/csrc/jit/runtime/interpreter.cpp b/torch/csrc/jit/runtime/interpreter.cpp index 3a028175d9c3..5d88264a2f2c 100644 --- a/torch/csrc/jit/runtime/interpreter.cpp +++ b/torch/csrc/jit/runtime/interpreter.cpp @@ -1609,10 +1609,11 @@ struct InterpreterStateImpl : c10::intrusive_ptr_target { } static void checkAndStartRecordFunction(Frame& frame, Stack& stack) { + bool pre_sampled = false; if (!frame.record_function && at::hasCallbacks() && - at::isRecordFunctionEnabled()) { + at::shouldRunRecordFunction(&pre_sampled)) { auto rec_fn = std::make_unique( - at::RecordScope::TORCHSCRIPT_FUNCTION); + at::RecordScope::TORCHSCRIPT_FUNCTION, pre_sampled); if (rec_fn->isActive()) { if (rec_fn->needsInputs()) { rec_fn->before( From 9920adebfd2ff2eda33f72f2d4589973f1581b76 Mon Sep 17 00:00:00 2001 From: Brian Hirsh Date: Fri, 11 Dec 2020 13:24:55 -0800 Subject: [PATCH 184/250] pyi cleanup (#49054) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49054 These are some followups from the first pyi codegen PR. Still maintaining byte-for-byte compatibility in this one. - Separated `argument_str() with a pyi flag into two functions, `argument_str()` and `argument_str_pyi()` - Added a notes section for pyi at the top of `python.py` - Added a `Python Interface` section that I moved the free-standing pyi functions to Test Plan: Imported from OSS Reviewed By: ljk53 Differential Revision: D25410848 Pulled By: bdhirsh fbshipit-source-id: db83a80af900c32b5e32d67ce27767f6e7c2adfb --- .jenkins/pytorch/codegen-test.sh | 1 - tools/autograd/gen_python_functions.py | 2 - tools/codegen/api/python.py | 261 ++++++++++++++----------- 3 files changed, 143 insertions(+), 121 deletions(-) diff --git a/.jenkins/pytorch/codegen-test.sh b/.jenkins/pytorch/codegen-test.sh index 44f1e9449bf0..17e7e9fa3445 100755 --- a/.jenkins/pytorch/codegen-test.sh +++ b/.jenkins/pytorch/codegen-test.sh @@ -37,7 +37,6 @@ python -m tools.setup_helpers.generate_code \ mkdir -p "$OUT"/pyi/torch/_C mkdir -p "$OUT"/pyi/torch/nn python -m tools.pyi.gen_pyi \ - --declarations-path "$OUT"/torch/share/ATen/Declarations.yaml \ --native-functions-path aten/src/ATen/native/native_functions.yaml \ --deprecated-functions-path tools/autograd/deprecated.yaml \ --out "$OUT"/pyi diff --git a/tools/autograd/gen_python_functions.py b/tools/autograd/gen_python_functions.py index 63438a527b4c..47abce5466c6 100644 --- a/tools/autograd/gen_python_functions.py +++ b/tools/autograd/gen_python_functions.py @@ -228,8 +228,6 @@ def signature_original(f: NativeFunction) -> str: opname = str(f.func.name.name.base) if f.func.is_out_fn(): opname += '_out' - # TODO: remove HACK - # I think we want to differentiate inplace functions here.. 
but we currently don't for the arg parser if f.func.name.name.inplace and pyi: opname += '_' args = CppSignatureGroup.from_schema(f.func, method=False).signature.arguments() diff --git a/tools/codegen/api/python.py b/tools/codegen/api/python.py index 45fa1685a5cf..dadfed354106 100644 --- a/tools/codegen/api/python.py +++ b/tools/codegen/api/python.py @@ -13,6 +13,8 @@ # # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # # +# [Notes] python binding codegen +# # The Python binding codegen produces code that takes the input list of # PyObjects, finds the matching ATen C++ function using PythonArgParser, # converts the PyObjects into C++ types and calls the ATen C++ function: @@ -171,25 +173,15 @@ # return wrap(dispatch_abs_out(_r.tensor(1), _r.tensor(0))); # } # - -# TODO: stick this more firmly in the data model somewhere? -def namedtuple_fieldnames(returns: Tuple[Return, ...]) -> List[str]: - if len(returns) <= 1 or all(map(lambda r: r.name is None, returns)): - return [] - else: - if any(map(lambda r: r.name is None, returns)): - # When building on Windows, `PyStructSequence_UnnamedField` could not be - # resolved by the linker for some reason, which cause error in building: - # - # python_nn_functions.cpp.obj : error LNK2001: unresolved external symbol - # PyStructSequence_UnnamedField - # - # Thus, at this point in time, we do not support unnamed - # fields in namedtuple; you must either name all fields, - # or none of them. - raise ValueError("Unnamed field is not supported by codegen") - - return list(map(lambda r: str(r.name), returns)) +# +# [Notes] python interface codegen +# The python dataclasses below are used used to generate both python binding code +# and pyi type hint signatures. +# In theory these two should look very similar, but there are number of differences +# in how pyi signatures vs. python_arg_parser signatures are generated. +# These differences have been encapsulated in signature_str() vs. signature_str_pyi() +# to display the full signatures, and argument_str() vs argument_str_pyi() to display arguments. +# For examples, only pyi signatures include return types. @dataclass(frozen=True) class PythonReturns: @@ -235,9 +227,30 @@ class PythonArgument: # Compute argument formal for python argument parsing. # Needs to be consistent with torch/csrc/utils/python_arg_parser.h. - def argument_str(self, *, method: bool = False, pyi: bool = False, deprecated: bool = False) -> str: - type_str = argument_type_str_pyi(self.type, pyi_out_arg=pyi and isinstance(self, PythonOutArgument)) \ - if pyi else argument_type_str(self.type) + def argument_str(self, *, method: bool = False) -> str: + type_str = argument_type_str(self.type) + + name = self.name + # s/self/input/ outside method bindings + # [old codegen] TODO: remove this? 
doesn't rename in codegen, it's just + # for the parse string + if name == 'self' and type_str == 'Tensor' and not method: + name = 'input' + + # add default + if self.default is not None: + default = { + 'nullptr': 'None', + 'c10::nullopt': 'None', + '{}': 'None', + }.get(self.default, self.default) + return f'{type_str} {name}={default}' + else: + return f'{type_str} {name}' + + def argument_str_pyi(self, *, method: bool = False, deprecated: bool = False) -> str: + is_out_arg = isinstance(self, PythonOutArgument) + type_str = argument_type_str_pyi(self.type, pyi_out_arg=is_out_arg) name = self.name # s/self/input/ outside method bindings @@ -246,45 +259,33 @@ def argument_str(self, *, method: bool = False, pyi: bool = False, deprecated: b if name == 'self' and type_str == 'Tensor' and not method and not deprecated: name = 'input' - if pyi: - if name == 'from': # from is a Python keyword... - name += '_' - # pyi merges the _out and functional variants into the same signature, with an optional out arg - if name == 'out' and type_str == 'Tensor' and not deprecated: - type_str = 'Optional[' + type_str + ']' + if name == 'from': # from is a Python keyword... + name += '_' + # pyi merges the _out and functional variants into the same signature, with an optional out arg + if name == 'out' and type_str == 'Tensor' and not deprecated: + type_str = 'Optional[' + type_str + ']' # TODO: remove diff. pyi deprecated signatures don't get defaults for their out arg - treat_as_no_default = pyi and deprecated and isinstance(self, PythonOutArgument) and self.default == 'None' + treat_as_no_default = deprecated and is_out_arg and self.default == 'None' # add default if self.default is not None and not treat_as_no_default: - if pyi: - if isinstance(self.type, ListType) and self.type.elem == BaseType(BaseTy.int) and \ - self.default.startswith('{') and self.default.endswith('}'): - default = '(' + self.default[1:-1] + ')' - else: - default = { - 'nullptr': 'None', - 'c10::nullopt': 'None', - '{}': 'None', - 'MemoryFormat::Contiguous': 'contiguous_format', - 'QScheme::PER_TENSOR_AFFINE': 'per_tensor_affine', - }.get(self.default, self.default) - # TODO: remove requires_grad special case (byte-for-byte compat) - return f'{name}:{type_str}={default}' if name == 'requires_grad' else f'{name}: {type_str}={default}' + if isinstance(self.type, ListType) and self.type.elem == BaseType(BaseTy.int) and \ + self.default.startswith('{') and self.default.endswith('}'): + default = '(' + self.default[1:-1] + ')' else: default = { 'nullptr': 'None', 'c10::nullopt': 'None', '{}': 'None', + 'MemoryFormat::Contiguous': 'contiguous_format', + 'QScheme::PER_TENSOR_AFFINE': 'per_tensor_affine', }.get(self.default, self.default) - return f'{type_str} {name}={default}' + # TODO: remove requires_grad special case (byte-for-byte compat) + return f'{name}:{type_str}={default}' if name == 'requires_grad' else f'{name}: {type_str}={default}' else: - if pyi: - # TODO: remove requires_grad special case (byte-for-byte compat) - return f'{name}:{type_str}' if name == 'requires_grad' else f'{name}: {type_str}' - else: - return f'{type_str} {name}' + # TODO: remove requires_grad special case (byte-for-byte compat) + return f'{name}:{type_str}' if name == 'requires_grad' else f'{name}: {type_str}' @dataclass(frozen=True) class PythonOutArgument(PythonArgument): @@ -391,8 +392,7 @@ def output_idx(self) -> int: # for error parsing. # # For a translation to mypy-valid type signatures, see - # signature_str_pyi. 
If you change any logic here, please - # check that file too. + # signature_str_pyi(). def signature_str(self, *, skip_outputs: bool = False) -> str: args = self.arguments(skip_outputs=skip_outputs) schema_formals: List[str] = list(map(lambda a: a.argument_str(method=self.method), args)) @@ -404,7 +404,7 @@ def signature_str(self, *, skip_outputs: bool = False) -> str: def signature_str_pyi(self, *, skip_outputs: bool = False, hacky_add_output: bool = False) -> str: args = self.arguments(skip_outputs=skip_outputs, hacky_add_output=hacky_add_output) - schema_formals: List[str] = list(map(lambda a: a.argument_str(method=self.method, pyi=True), args)) + schema_formals: List[str] = list(map(lambda a: a.argument_str_pyi(method=self.method), args)) positional_argc = len(self.input_args) if len(schema_formals) > positional_argc: schema_formals.insert(positional_argc, '*') @@ -419,7 +419,7 @@ def signature_str_pyi(self, *, skip_outputs: bool = False, hacky_add_output: boo def signature_str_pyi_vararg(self, *, skip_outputs: bool = False, hacky_add_output: bool = False) -> Optional[str]: # only pyi uses vararg signatures args = self.arguments(skip_outputs=skip_outputs, hacky_add_output=hacky_add_output) - schema_formals: List[str] = list(map(lambda a: a.argument_str(method=self.method, pyi=True), args)) + schema_formals: List[str] = list(map(lambda a: a.argument_str_pyi(method=self.method), args)) # vararg only applies to pyi signatures. vararg variants are not generated for all signatures num_args = self.arguments_count() num_positionalargs = len(self.input_args) @@ -471,7 +471,7 @@ def signature_str(self, *, skip_outputs: bool = False) -> str: def signature_str_pyi(self, *, skip_outputs: bool = False, hacky_add_output: bool = False) -> str: args = self.arguments(skip_outputs=skip_outputs, hacky_add_output=hacky_add_output) - schema_formals: List[str] = list(map(lambda a: a.argument_str(method=self.method, pyi=True, deprecated=True), args)) + schema_formals: List[str] = list(map(lambda a: a.argument_str_pyi(method=self.method, deprecated=True), args)) positional_argc = len(self.input_args) if len(schema_formals) > positional_argc: schema_formals.insert(positional_argc, '*') @@ -662,67 +662,6 @@ def argument(a: Argument) -> PythonArgument: default_init=None, ) -def argument_type_str_pyi(t: Type, *, pyi_out_arg: bool = False) -> str: - add_optional = False - if isinstance(t, OptionalType): - t = t.elem - add_optional = True - - if isinstance(t, BaseType): - if t.name == BaseTy.int: - ret = '_int' - elif t.name == BaseTy.float: - ret = '_float' - elif t.name == BaseTy.str: - ret = 'str' - elif t.name == BaseTy.Scalar: - ret = 'Number' - elif t.name == BaseTy.ScalarType: - ret = '_dtype' - elif t.name == BaseTy.bool: - ret = '_bool' - elif t.name == BaseTy.QScheme: - ret = '_qscheme' - elif t.name == BaseTy.Layout: - ret = '_layout' - elif t.name == BaseTy.Device: - ret = 'Union[_device, str, None]' - elif t.name == BaseTy.MemoryFormat: - ret = 'memory_format' - elif t.name == BaseTy.Dimname: - ret = 'Union[str, ellipsis, None]' - elif t.name in [BaseTy.Tensor, BaseTy.Generator, - BaseTy.Storage, BaseTy.Stream, BaseTy.str]: - # These python schema type names line up with their function schema names - ret = t.name.name - - elif isinstance(t, ListType): - if pyi_out_arg and t.is_tensor_like(): - # TODO remove HACK - # pyi blindly treats all tensor-like out args as having type Tensor - return 'Tensor' - if str(t.elem) == 'int': - ret = 'Union[_int, _size]' if t.size is not None else '_size' - elif 
t.is_tensor_like(): - # TODO: this doesn't seem right... - # Tensor?[] currently translates to Optional[Union[Tuple[Tensor, ...], List[Tensor]]] - # It should probably translate to Union[Tuple[Optional[Tensor], ...], List[Optional[Tensor]]] - if isinstance(t.elem, OptionalType): - add_optional = True - ret = 'Union[Tensor, Tuple[Tensor, ...], List[Tensor]]' if t.size is not None else \ - 'Union[Tuple[Tensor, ...], List[Tensor]]' - elif str(t.elem) == 'float': - ret = 'Sequence[float]' - else: - elem = argument_type_str_pyi(t.elem) - ret = f'Sequence[{elem}]' - - if add_optional: - ret = 'Optional[' + ret + ']' - return ret - - raise RuntimeError(f'unrecognized type {repr(t)}') - # Generates a PythonSignature that can be used for either .pyi or PythonArgParser codegen def signature(f: NativeFunction, *, method: bool = False, pyi: bool = False) -> PythonSignature: args: List[Argument] = [] @@ -770,7 +709,7 @@ def signature(f: NativeFunction, *, method: bool = False, pyi: bool = False) -> tensor_options_args.append(PythonArgument( name='dtype', type=BaseType(BaseTy.ScalarType), - default=_dtype_default_type_hack(name, pyi=pyi), + default='None' if pyi else _dtype_default_type_hack(name), default_init='self.scalar_type()' if is_like_or_new_function else None, )) # TODO: probably a bug, kill this diff? @@ -816,12 +755,98 @@ def signature(f: NativeFunction, *, method: bool = False, pyi: bool = False) -> ) # TODO blowtorch -def _dtype_default_type_hack(name: str, *, pyi: bool) -> str: - if not pyi and (name.startswith('randperm') or name == 'tril_indices' or name == 'triu_indices'): +def _dtype_default_type_hack(name: str) -> str: + if name.startswith('randperm') or name == 'tril_indices' or name == 'triu_indices': return 'torch.int64' else: return 'None' +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # +# +# Python Interface +# +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # + +def namedtuple_fieldnames(returns: Tuple[Return, ...]) -> List[str]: + if len(returns) <= 1 or all(map(lambda r: r.name is None, returns)): + return [] + else: + if any(map(lambda r: r.name is None, returns)): + # When building on Windows, `PyStructSequence_UnnamedField` could not be + # resolved by the linker for some reason, which cause error in building: + # + # python_nn_functions.cpp.obj : error LNK2001: unresolved external symbol + # PyStructSequence_UnnamedField + # + # Thus, at this point in time, we do not support unnamed + # fields in namedtuple; you must either name all fields, + # or none of them. 
+ raise ValueError("Unnamed field is not supported by codegen") + + return list(map(lambda r: str(r.name), returns)) + +def argument_type_str_pyi(t: Type, *, pyi_out_arg: bool = False) -> str: + add_optional = False + if isinstance(t, OptionalType): + t = t.elem + add_optional = True + + if isinstance(t, BaseType): + if t.name == BaseTy.int: + ret = '_int' + elif t.name == BaseTy.float: + ret = '_float' + elif t.name == BaseTy.str: + ret = 'str' + elif t.name == BaseTy.Scalar: + ret = 'Number' + elif t.name == BaseTy.ScalarType: + ret = '_dtype' + elif t.name == BaseTy.bool: + ret = '_bool' + elif t.name == BaseTy.QScheme: + ret = '_qscheme' + elif t.name == BaseTy.Layout: + ret = '_layout' + elif t.name == BaseTy.Device: + ret = 'Union[_device, str, None]' + elif t.name == BaseTy.MemoryFormat: + ret = 'memory_format' + elif t.name == BaseTy.Dimname: + ret = 'Union[str, ellipsis, None]' + elif t.name in [BaseTy.Tensor, BaseTy.Generator, + BaseTy.Storage, BaseTy.Stream, BaseTy.str]: + # These python schema type names line up with their function schema names + ret = t.name.name + + elif isinstance(t, ListType): + if pyi_out_arg and t.is_tensor_like(): + # TODO remove HACK + # pyi blindly treats all tensor-like out args as having type Tensor + return 'Tensor' + if str(t.elem) == 'int': + ret = 'Union[_int, _size]' if t.size is not None else '_size' + elif t.is_tensor_like(): + # TODO: this doesn't seem right... + # Tensor?[] currently translates to Optional[Union[Tuple[Tensor, ...], List[Tensor]]] + # It should probably translate to Union[Tuple[Optional[Tensor], ...], List[Optional[Tensor]]] + if isinstance(t.elem, OptionalType): + add_optional = True + ret = 'Union[Tensor, Tuple[Tensor, ...], List[Tensor]]' if t.size is not None else \ + 'Union[Tuple[Tensor, ...], List[Tensor]]' + elif str(t.elem) == 'float': + ret = 'Sequence[float]' + else: + elem = argument_type_str_pyi(t.elem) + ret = f'Sequence[{elem}]' + + if add_optional: + ret = 'Optional[' + ret + ']' + return ret + + raise RuntimeError(f'unrecognized type {repr(t)}') + + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # # # C++ Function Dispatch From b94ec8c9f71b461d849b95f07c9ce8c31a366bbf Mon Sep 17 00:00:00 2001 From: Brian Hirsh Date: Fri, 11 Dec 2020 13:24:55 -0800 Subject: [PATCH 185/250] pyi codegen - removing byte-for-byte compatibility hacks (#49055) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49055 Removed the majority of the TODO hacks that I added to the original pyi PR to maintain byte-for-byte compatibility. I left a few of the divergences between pyi deprecated vs. native signatures, since (a) they're smaller and (b) it might make more sense to kill the deprecated functions at some point entirely. 
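For context, the merged functional/out signatures this codegen emits into the .pyi stubs come out roughly like the sketch below (illustrative only; `frobnicate` and its argument list are made up, not actual generated output):

    def frobnicate(input: Tensor, other: Tensor, *, out: Optional[Tensor]=None) -> Tensor: ...

Deprecated signatures instead keep separate functional and out entries and drop the default on their out argument.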
Test Plan: Imported from OSS Reviewed By: ljk53 Differential Revision: D25410847 Pulled By: bdhirsh fbshipit-source-id: cf07cdda92f7492cd83d363cbb810e3810f6b8c8 --- tools/codegen/api/python.py | 68 +++++++++++------------------- tools/pyi/gen_pyi.py | 12 +----- torch/_C/_VariableFunctions.pyi.in | 2 +- 3 files changed, 27 insertions(+), 55 deletions(-) diff --git a/tools/codegen/api/python.py b/tools/codegen/api/python.py index dadfed354106..10483e2e3d76 100644 --- a/tools/codegen/api/python.py +++ b/tools/codegen/api/python.py @@ -249,8 +249,7 @@ def argument_str(self, *, method: bool = False) -> str: return f'{type_str} {name}' def argument_str_pyi(self, *, method: bool = False, deprecated: bool = False) -> str: - is_out_arg = isinstance(self, PythonOutArgument) - type_str = argument_type_str_pyi(self.type, pyi_out_arg=is_out_arg) + type_str = argument_type_str_pyi(self.type) name = self.name # s/self/input/ outside method bindings @@ -261,12 +260,13 @@ def argument_str_pyi(self, *, method: bool = False, deprecated: bool = False) -> if name == 'from': # from is a Python keyword... name += '_' + # pyi merges the _out and functional variants into the same signature, with an optional out arg if name == 'out' and type_str == 'Tensor' and not deprecated: type_str = 'Optional[' + type_str + ']' - # TODO: remove diff. pyi deprecated signatures don't get defaults for their out arg - treat_as_no_default = deprecated and is_out_arg and self.default == 'None' + # pyi deprecated signatures don't get defaults for their out arg + treat_as_no_default = deprecated and isinstance(self, PythonOutArgument) and self.default == 'None' # add default if self.default is not None and not treat_as_no_default: @@ -281,11 +281,9 @@ def argument_str_pyi(self, *, method: bool = False, deprecated: bool = False) -> 'MemoryFormat::Contiguous': 'contiguous_format', 'QScheme::PER_TENSOR_AFFINE': 'per_tensor_affine', }.get(self.default, self.default) - # TODO: remove requires_grad special case (byte-for-byte compat) - return f'{name}:{type_str}={default}' if name == 'requires_grad' else f'{name}: {type_str}={default}' + return f'{name}: {type_str}={default}' else: - # TODO: remove requires_grad special case (byte-for-byte compat) - return f'{name}:{type_str}' if name == 'requires_grad' else f'{name}: {type_str}' + return f'{name}: {type_str}' @dataclass(frozen=True) class PythonOutArgument(PythonArgument): @@ -357,23 +355,13 @@ def deprecated(self) -> bool: return False def arguments( - self, *, skip_outputs: bool = False, skip_tensor_options: bool = False, hacky_add_output: bool = False + self, *, skip_outputs: bool = False, skip_tensor_options: bool = False ) -> Tuple[Union[PythonArgument, PythonOutArgument], ...]: result: List[Union[PythonArgument, PythonOutArgument]] = [] result.extend(self.input_args) result.extend(self.input_kwargs) if self.output_args is not None and not skip_outputs: result.append(self.output_args) - # TODO: remove HACK - # in the existing pyi codegen, we tack on an optional out argument to every operator overload - # if there exists at least one overload with an out variant. This seems wrong. 
- elif hacky_add_output: - result.extend([PythonOutArgument( - name='out', - type=OptionalType(BaseType(BaseTy.Tensor)), - default='None', - default_init=None, - outputs=())]) if not skip_tensor_options: result.extend(self.tensor_options_args) return tuple(result) @@ -402,8 +390,8 @@ def signature_str(self, *, skip_outputs: bool = False) -> str: return f'{self.name}({", ".join(schema_formals)})' - def signature_str_pyi(self, *, skip_outputs: bool = False, hacky_add_output: bool = False) -> str: - args = self.arguments(skip_outputs=skip_outputs, hacky_add_output=hacky_add_output) + def signature_str_pyi(self, *, skip_outputs: bool = False) -> str: + args = self.arguments(skip_outputs=skip_outputs) schema_formals: List[str] = list(map(lambda a: a.argument_str_pyi(method=self.method), args)) positional_argc = len(self.input_args) if len(schema_formals) > positional_argc: @@ -416,9 +404,9 @@ def signature_str_pyi(self, *, skip_outputs: bool = False, hacky_add_output: boo schema_formals.insert(0, "self") return f'def {self.name}({", ".join(schema_formals)}) -> {returns_str}: ...' - def signature_str_pyi_vararg(self, *, skip_outputs: bool = False, hacky_add_output: bool = False) -> Optional[str]: + def signature_str_pyi_vararg(self, *, skip_outputs: bool = False) -> Optional[str]: # only pyi uses vararg signatures - args = self.arguments(skip_outputs=skip_outputs, hacky_add_output=hacky_add_output) + args = self.arguments(skip_outputs=skip_outputs) schema_formals: List[str] = list(map(lambda a: a.argument_str_pyi(method=self.method), args)) # vararg only applies to pyi signatures. vararg variants are not generated for all signatures num_args = self.arguments_count() @@ -469,8 +457,8 @@ def deprecated(self) -> bool: def signature_str(self, *, skip_outputs: bool = False) -> str: return PythonSignature.signature_str(self, skip_outputs=skip_outputs) + '|deprecated' - def signature_str_pyi(self, *, skip_outputs: bool = False, hacky_add_output: bool = False) -> str: - args = self.arguments(skip_outputs=skip_outputs, hacky_add_output=hacky_add_output) + def signature_str_pyi(self, *, skip_outputs: bool = False) -> str: + args = self.arguments(skip_outputs=skip_outputs) schema_formals: List[str] = list(map(lambda a: a.argument_str_pyi(method=self.method, deprecated=True), args)) positional_argc = len(self.input_args) if len(schema_formals) > positional_argc: @@ -479,7 +467,7 @@ def signature_str_pyi(self, *, skip_outputs: bool = False, hacky_add_output: boo returns_str = self.returns.returns_str_pyi() return f'def {self.name}({", ".join(schema_formals)}) -> {returns_str}: ...' - def signature_str_pyi_vararg(self, *, skip_outputs: bool = False, hacky_add_output: bool = False) -> Optional[str]: + def signature_str_pyi_vararg(self, *, skip_outputs: bool = False) -> Optional[str]: # the codegen doesn't include vararg variants for deprecated signatures return None @@ -712,11 +700,9 @@ def signature(f: NativeFunction, *, method: bool = False, pyi: bool = False) -> default='None' if pyi else _dtype_default_type_hack(name), default_init='self.scalar_type()' if is_like_or_new_function else None, )) - # TODO: probably a bug, kill this diff? 
- # pyi signatures have a slightly different type/default for layout tensor_options_args.append(PythonArgument( name='layout', - type=BaseType(BaseTy.Layout) if pyi else OptionalType(BaseType(BaseTy.Layout)), + type=OptionalType(BaseType(BaseTy.Layout)), default='strided' if pyi else 'torch.strided', default_init='layout_from_backend(self.options().backend())' if is_like_or_new_function else None, )) @@ -726,15 +712,12 @@ def signature(f: NativeFunction, *, method: bool = False, pyi: bool = False) -> default='None', default_init='self.device()' if is_like_or_new_function else None, )) - # TODO: probably a bug, kill this diff? - # pyi signatures don't include pin memory - if not pyi: - tensor_options_args.append(PythonArgument( - name='pin_memory', - type=BaseType(BaseTy.bool), - default='False', - default_init=None, - )) + tensor_options_args.append(PythonArgument( + name='pin_memory', + type=BaseType(BaseTy.bool), + default='False', + default_init=None, + )) tensor_options_args.append(PythonArgument( name='requires_grad', type=BaseType(BaseTy.bool), @@ -755,12 +738,13 @@ def signature(f: NativeFunction, *, method: bool = False, pyi: bool = False) -> ) # TODO blowtorch +# note: removing this will be BC-breaking. A quick test shows that +# randperm will otherwise default its dtype to torch.float64 def _dtype_default_type_hack(name: str) -> str: if name.startswith('randperm') or name == 'tril_indices' or name == 'triu_indices': return 'torch.int64' else: return 'None' - # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # # # Python Interface @@ -785,7 +769,7 @@ def namedtuple_fieldnames(returns: Tuple[Return, ...]) -> List[str]: return list(map(lambda r: str(r.name), returns)) -def argument_type_str_pyi(t: Type, *, pyi_out_arg: bool = False) -> str: +def argument_type_str_pyi(t: Type) -> str: add_optional = False if isinstance(t, OptionalType): t = t.elem @@ -820,10 +804,6 @@ def argument_type_str_pyi(t: Type, *, pyi_out_arg: bool = False) -> str: ret = t.name.name elif isinstance(t, ListType): - if pyi_out_arg and t.is_tensor_like(): - # TODO remove HACK - # pyi blindly treats all tensor-like out args as having type Tensor - return 'Tensor' if str(t.elem) == 'int': ret = 'Union[_int, _size]' if t.size is not None else '_size' elif t.is_tensor_like(): diff --git a/tools/pyi/gen_pyi.py b/tools/pyi/gen_pyi.py index ee5c38a4cf1c..9a3a0f520e54 100644 --- a/tools/pyi/gen_pyi.py +++ b/tools/pyi/gen_pyi.py @@ -213,23 +213,15 @@ def generate_type_hints(funcs: Sequence[PythonSignatureGroup], is_tensor: bool = type_hint = sig_group.signature.signature_str_pyi(skip_outputs=True) type_hints.append(type_hint) - # TODO: remove HACK - # the pyi codegen currently adds an optional out param in cases where the current op does NOT have an out variant, - # but an overload of the op DOES have an out variant. - # TODO: After that, we should consider killing this method entirely and operating per PythonSignatureGroup - # rather than grouping their overloads together - # (since there isn't much else semantically meaningful about grouping overloads) - # this hack also doesn't apply to deprecated ops - hacky_add_output = any_out and sig_group.outplace is None and not sig_group.signature.deprecated # PythonSignatureGroups that have both a functional + out variant get a single signature, with an optional out argument # Generates the out variant if one exists. 
Otherwise, generate the functional variant type_hint = sig_group.signature.signature_str_pyi( - skip_outputs=sig_group.outplace is None, hacky_add_output=hacky_add_output) + skip_outputs=sig_group.outplace is None) type_hints.append(type_hint) # Some operators also additionally have a vararg variant of their signature type_hint_vararg = sig_group.signature.signature_str_pyi_vararg( - skip_outputs=sig_group.outplace is None, hacky_add_output=hacky_add_output) + skip_outputs=sig_group.outplace is None) if type_hint_vararg: type_hints.append(type_hint_vararg) diff --git a/torch/_C/_VariableFunctions.pyi.in b/torch/_C/_VariableFunctions.pyi.in index 1360ef079725..1afd8e6c73d7 100644 --- a/torch/_C/_VariableFunctions.pyi.in +++ b/torch/_C/_VariableFunctions.pyi.in @@ -1,6 +1,6 @@ # ${generated_comment} -from torch import Tensor, Generator, strided, memory_format, contiguous_format +from torch import Tensor, Generator, strided, memory_format, contiguous_format, strided from typing import List, Tuple, Optional, Union, Any, ContextManager, Callable, overload, Iterator, NamedTuple, Sequence, TypeVar from torch._six import inf From 33a9b14da04bfd990fd454ce3dc6eaa1668f2159 Mon Sep 17 00:00:00 2001 From: Brian Hirsh Date: Fri, 11 Dec 2020 13:24:55 -0800 Subject: [PATCH 186/250] pyi codegen - removing byte-for-byte-compatibility hacks (sorting overloads) (#49056) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49056 This is another byte-for-byte compatibility hack. I'm now sorting pyi signature overloads (previously the codegen did not). Mostly put this in a separate PR just to more easily reason about the diff in the codegen output. Test Plan: Imported from OSS Reviewed By: ljk53 Differential Revision: D25410846 Pulled By: bdhirsh fbshipit-source-id: 06e5c32edbce610dd12ec7499014b41b23c646bd --- tools/autograd/gen_python_functions.py | 6 +----- tools/pyi/gen_pyi.py | 2 +- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/tools/autograd/gen_python_functions.py b/tools/autograd/gen_python_functions.py index 47abce5466c6..570c99908853 100644 --- a/tools/autograd/gen_python_functions.py +++ b/tools/autograd/gen_python_functions.py @@ -637,8 +637,6 @@ def method_def( def group_overloads( overloads: Sequence[PythonSignatureNativeFunctionPair], - *, - sort: bool = True, ) -> Sequence[PythonSignatureGroup]: bases: Dict[str, PythonSignatureNativeFunctionPair] = {} outplaces: Dict[str, PythonSignatureNativeFunctionPair] = {} @@ -687,9 +685,7 @@ def group_overloads( outplace=outplace.function if outplace is not None else None, )) - # TODO: unconditionally sort - # maintaining byte-for-byte compatibility for pyi codegen for now - return grouped if not sort else sort_overloads(grouped) + return sort_overloads(grouped) # This function declares a partial order on declarations, and sorts them according # to its linear extension. 
This is necessary, because there's some ambiguity in the diff --git a/tools/pyi/gen_pyi.py b/tools/pyi/gen_pyi.py index 9a3a0f520e54..21f965cb101b 100644 --- a/tools/pyi/gen_pyi.py +++ b/tools/pyi/gen_pyi.py @@ -39,7 +39,7 @@ # TODO: consider waiting to group by base name until we actually need to # (after computing type hint signatures, when adding @overload directives) def group_by_base_name(python_funcs: Sequence[PythonSignatureNativeFunctionPair]) -> Mapping[str, List[PythonSignatureGroup]]: - groups = group_overloads(python_funcs, sort=False) + groups = group_overloads(python_funcs) d = collections.defaultdict(list) for g in groups: name = g.signature.name From 218eaf4bbafef23600e6c9e668b7a49633639734 Mon Sep 17 00:00:00 2001 From: Brian Hirsh Date: Fri, 11 Dec 2020 13:24:55 -0800 Subject: [PATCH 187/250] pyi codegen refactor - no need to group python signatures by overload name (#49057) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49057 Now that all of the byte-for-byte hacks are removed in the pyi codegen, there's no reason for the codegen to group pyi signature overloads together. I updated the logic in `gen_pyi` that computes signatures (`generate_type_hints()` and _generate_named_tuples()`) to operate per individual `PythonSignatureGroup` Test Plan: Imported from OSS Reviewed By: ezyang Differential Revision: D25410849 Pulled By: bdhirsh fbshipit-source-id: 8c190035d7bfc06ed192468efbe7d902922ad1fa --- tools/pyi/gen_pyi.py | 113 ++++++++++++++++++------------------------- 1 file changed, 48 insertions(+), 65 deletions(-) diff --git a/tools/pyi/gen_pyi.py b/tools/pyi/gen_pyi.py index 21f965cb101b..dad150fa0ad5 100644 --- a/tools/pyi/gen_pyi.py +++ b/tools/pyi/gen_pyi.py @@ -7,7 +7,7 @@ from tools.codegen.model import * from tools.codegen.api.python import * -from typing import Sequence, List, Mapping, Dict +from typing import Sequence, List, Dict from ..autograd.utils import CodeTemplate, write from ..autograd.gen_python_functions import should_generate_py_binding, load_signatures, group_overloads @@ -36,20 +36,10 @@ read gen_pyi for the gory details. """ -# TODO: consider waiting to group by base name until we actually need to -# (after computing type hint signatures, when adding @overload directives) -def group_by_base_name(python_funcs: Sequence[PythonSignatureNativeFunctionPair]) -> Mapping[str, List[PythonSignatureGroup]]: - groups = group_overloads(python_funcs) - d = collections.defaultdict(list) - for g in groups: - name = g.signature.name - d[name].append(g) - return d - def get_py_torch_functions( python_funcs: Sequence[PythonSignatureNativeFunctionPair], method: bool = False, -) -> Mapping[str, Sequence[PythonSignatureGroup]]: +) -> Sequence[PythonSignatureGroup]: """ Get declarations (grouped by name) which should be generated as either functions in the "torch" module or methods on Tensor. @@ -65,7 +55,7 @@ def should_bind_method(python_func: PythonSignatureNativeFunctionPair) -> bool: Variant.method in python_func.function.variants) should_bind = should_bind_method if method else should_bind_function - return group_by_base_name([f for f in python_funcs if should_bind(f)]) + return group_overloads([f for f in python_funcs if should_bind(f)]) # TODO: Consider defining some aliases for our Union[...] 
types, to make @@ -176,54 +166,31 @@ def sig_for_ops(opname: str) -> List[str]: else: raise Exception("unknown op", opname) -def generate_named_tuples(funcs: Sequence[PythonSignatureGroup]) -> Dict[str, str]: - namedtuples: Dict[str, str] = {} - for sig_group in funcs: - named_tuple = sig_group.signature.returns.named_tuple_pyi() - if named_tuple is not None: - tuple_name, tuple_def = named_tuple - if tuple_name in namedtuples: - assert namedtuples[tuple_name] == tuple_def - else: - namedtuples[tuple_name] = tuple_def - return namedtuples - -def generate_type_hints(funcs: Sequence[PythonSignatureGroup], is_tensor: bool = False) -> List[str]: - """generate_type_hints(funcs, is_tensor=False) +def generate_type_hints(sig_group: PythonSignatureGroup) -> List[str]: + type_hints = [] - Generates type hints for the declarations pertaining to the function - :attr:`funcs` are the func from the parsed native_functions.yaml. - The :attr:`is_tensor` flag indicates whether we are parsing - members of the Tensor class (true) or functions in the - `torch` namespace (default, false). - """ + # Some deprecated ops that are on the blocklist are still included in pyi + if sig_group.signature.name in blocklist and not sig_group.signature.deprecated: + return type_hints - type_hints = [] - any_out = any([g for g in funcs if g.outplace is not None]) - - for sig_group in funcs: - # Some deprecated ops that are on the blocklist are still included in pyi - if sig_group.signature.name in blocklist and not sig_group.signature.deprecated: - continue - - # deprecated signatures have separate entries for their functional and out variants - # (as opposed to the native ops, which fuse the two into a single signature). - # generate the functional variant here, if an out variant exists. - if sig_group.signature.deprecated and sig_group.outplace is not None: - type_hint = sig_group.signature.signature_str_pyi(skip_outputs=True) - type_hints.append(type_hint) - - # PythonSignatureGroups that have both a functional + out variant get a single signature, with an optional out argument - # Generates the out variant if one exists. Otherwise, generate the functional variant - type_hint = sig_group.signature.signature_str_pyi( - skip_outputs=sig_group.outplace is None) + # deprecated signatures have separate entries for their functional and out variants + # (as opposed to the native ops, which fuse the two into a single signature). + # generate the functional variant here, if an out variant exists. + if sig_group.signature.deprecated and sig_group.outplace is not None: + type_hint = sig_group.signature.signature_str_pyi(skip_outputs=True) type_hints.append(type_hint) - # Some operators also additionally have a vararg variant of their signature - type_hint_vararg = sig_group.signature.signature_str_pyi_vararg( - skip_outputs=sig_group.outplace is None) - if type_hint_vararg: - type_hints.append(type_hint_vararg) + # PythonSignatureGroups that have both a functional + out variant get a single signature, with an optional out argument + # Generates the out variant if one exists. 
Otherwise, generate the functional variant + type_hint = sig_group.signature.signature_str_pyi( + skip_outputs=sig_group.outplace is None) + type_hints.append(type_hint) + + # Some operators also additionally have a vararg variant of their signature + type_hint_vararg = sig_group.signature.signature_str_pyi_vararg( + skip_outputs=sig_group.outplace is None) + if type_hint_vararg: + type_hints.append(type_hint_vararg) return type_hints @@ -376,11 +343,18 @@ def gen_pyi(native_yaml_path: str, deprecated_yaml_path: str, out: str) -> None: function_signatures = load_signatures(native_yaml_path, deprecated_yaml_path, method=False, pyi=True) sig_groups = get_py_torch_functions(function_signatures) - for name in sorted(sig_groups.keys()): - unsorted_function_hints[name] += generate_type_hints(sig_groups[name]) - # deprecated signatures are not used when computing named tuples - native_groups = [g for g in sig_groups[name] if not g.signature.deprecated] - namedtuples.update(generate_named_tuples(native_groups)) + for group in sorted(sig_groups, key=lambda g: g.signature.name): + name = group.signature.name + unsorted_function_hints[name] += generate_type_hints(group) + + named_tuple = group.signature.returns.named_tuple_pyi() + if named_tuple is not None and not group.signature.deprecated: + # deprecated namedtuples are currently not included for torch functions + tuple_name, tuple_def = named_tuple + if tuple_name in namedtuples: + assert namedtuples[tuple_name] == tuple_def + else: + namedtuples[tuple_name] = tuple_def function_hints = [] for name, hints in sorted(unsorted_function_hints.items()): @@ -490,9 +464,18 @@ def gen_pyi(native_yaml_path: str, deprecated_yaml_path: str, out: str) -> None: tensor_method_signatures = load_signatures(native_yaml_path, deprecated_yaml_path, method=True, skip_deprecated=True, pyi=True) tensor_method_sig_groups = get_py_torch_functions(tensor_method_signatures, method=True) - for name in sorted(tensor_method_sig_groups.keys()): - unsorted_tensor_method_hints[name] += generate_type_hints(tensor_method_sig_groups[name], is_tensor=True) - namedtuples.update(generate_named_tuples(tensor_method_sig_groups[name])) + for group in sorted(tensor_method_sig_groups, key=lambda g: g.signature.name): + name = group.signature.name + unsorted_tensor_method_hints[name] += generate_type_hints(group) + + named_tuple = group.signature.returns.named_tuple_pyi() + if named_tuple is not None and not group.signature.deprecated: + # deprecated namedtuples are currently not included for torch functions + tuple_name, tuple_def = named_tuple + if tuple_name in namedtuples: + assert namedtuples[tuple_name] == tuple_def + else: + namedtuples[tuple_name] = tuple_def for op in all_ops: name = '__{}__'.format(op) From 15200e385a764721000f1dfadbcaf42c328bafdd Mon Sep 17 00:00:00 2001 From: kiyosora Date: Fri, 11 Dec 2020 13:35:14 -0800 Subject: [PATCH 188/250] Enable torch.where() to support Float16 & BFloat16 type inputs (#49004) Summary: Fixed https://github.com/pytorch/pytorch/issues/49075 Pull Request resolved: https://github.com/pytorch/pytorch/pull/49004 Reviewed By: zou3519 Differential Revision: D25495225 Pulled By: H-Huang fbshipit-source-id: 09418ee5503f65c8862e40119c5802779505a4db --- aten/src/ATen/native/cpu/TensorCompareKernel.cpp | 3 ++- aten/src/ATen/native/cuda/TensorCompare.cu | 2 +- test/test_torch.py | 8 ++++---- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/aten/src/ATen/native/cpu/TensorCompareKernel.cpp b/aten/src/ATen/native/cpu/TensorCompareKernel.cpp 
index b9653c7b25bf..b407eac4d280 100644 --- a/aten/src/ATen/native/cpu/TensorCompareKernel.cpp +++ b/aten/src/ATen/native/cpu/TensorCompareKernel.cpp @@ -183,7 +183,8 @@ static void _aminmax_kernel_impl( } static void where_kernel_impl(TensorIterator &iter, ScalarType condition_type) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(at::ScalarType::Bool, iter.dtype(), "where_cpu", [&] { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(at::ScalarType::Half, at::ScalarType::BFloat16, at::ScalarType::Bool, + iter.dtype(), "where_cpu", [&] { if (condition_type == at::ScalarType::Byte) { cpu_kernel( iter, diff --git a/aten/src/ATen/native/cuda/TensorCompare.cu b/aten/src/ATen/native/cuda/TensorCompare.cu index 443bea3f71ac..b10ae52e44fd 100644 --- a/aten/src/ATen/native/cuda/TensorCompare.cu +++ b/aten/src/ATen/native/cuda/TensorCompare.cu @@ -17,7 +17,7 @@ DECLARE_DISPATCH(is_infinity_op_fn, isneginf_stub); namespace { void where_kernel_impl(TensorIterator &iter, ScalarType condition_type) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kHalf, kBool, iter.dtype(), "where_cuda", [&] { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kHalf, kBFloat16, kBool, iter.dtype(), "where_cuda", [&] { if (condition_type == at::ScalarType::Byte) { gpu_kernel( iter, diff --git a/test/test_torch.py b/test/test_torch.py index 16e011645899..d2566a90f382 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -254,8 +254,8 @@ def get_tensor(size, dtype, device, contiguous): height = 5 width = 5 for device in torch.testing.get_all_device_types(): - for dt1 in torch.testing.get_all_dtypes(include_half=device.startswith('cuda'), include_bfloat16=False): - for dt2 in torch.testing.get_all_dtypes(include_half=device.startswith('cuda'), include_bfloat16=False): + for dt1 in torch.testing.get_all_dtypes(): + for dt2 in torch.testing.get_all_dtypes(): for contiguous in [True, False]: x1 = get_tensor((height, width), dt1, device, contiguous) x2 = get_tensor((height, width), dt2, device, contiguous) @@ -6174,7 +6174,7 @@ def _where_valid_scalar_tensor_combination(self, scalar_type, dtype): return False @onlyOnCPUAndCUDA - @dtypes(*(torch.testing.get_all_int_dtypes() + torch.testing.get_all_fp_dtypes(include_bfloat16=False) + + @dtypes(*(torch.testing.get_all_int_dtypes() + torch.testing.get_all_fp_dtypes() + torch.testing.get_all_complex_dtypes())) def test_where_scalar_invalid_combination_raises(self, device, dtype): @@ -6186,7 +6186,7 @@ def checkRaises(scalar_type, dtype, condition, x, scalar_1): self._test_where_scalar_template(device, dtype, checkRaises) - @dtypes(*(torch.testing.get_all_int_dtypes() + torch.testing.get_all_fp_dtypes(include_bfloat16=False) + + @dtypes(*(torch.testing.get_all_int_dtypes() + torch.testing.get_all_fp_dtypes() + torch.testing.get_all_complex_dtypes())) def test_where_scalar_valid_combination(self, device, dtype): From 4bc4ec2686b69166f8784ee6d4ba2d1c9e582968 Mon Sep 17 00:00:00 2001 From: Ilia Cherniavskii Date: Fri, 11 Dec 2020 13:48:32 -0800 Subject: [PATCH 189/250] Reduce kineto logging (#49216) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49216 Libkineto is pretty verbose by default, using libkineto api to reduce amount of logging Test Plan: TORCH_CUDA_ARCH_LIST="6.0;7.0" USE_CUDA=1 USE_MKLDNN=1 BUILD_BINARY=1 python setup.py develop install --cmake python test/test_profiler.py Imported from OSS Reviewed By: ngimel Differential Revision: D25488109 fbshipit-source-id: 61b443bcf928db939f730ba32711385bb2b622d4 --- torch/csrc/autograd/profiler_kineto.cpp | 1 + 1 file changed, 1 
insertion(+) diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp index 7c91e76490a1..ac6ef84104f3 100644 --- a/torch/csrc/autograd/profiler_kineto.cpp +++ b/torch/csrc/autograd/profiler_kineto.cpp @@ -242,6 +242,7 @@ void prepareProfiler( if (!libkineto::api().isProfilerRegistered()) { libkineto_init(); + libkineto::api().suppressLogMessages(); } if (!libkineto::api().isProfilerInitialized()) { From e3542d2c12d8aaaccf8a53873e480c20dc6b7338 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Fri, 11 Dec 2020 13:55:01 -0800 Subject: [PATCH 190/250] [PyTorch] avoid unnecessary call to empty_tensor_restride in empty() (#48211) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48211 Our empty benchmark makes this call unconditionally. If MemoryFormat::Contiguous is indeed a common case (or if workloads are likely to use a consistent-ish memory format), then I'd expect checking first to be a win. ghstack-source-id: 118224990 Test Plan: Profiled empty benchmark with perf, saw time spent in empty_tensor_restride go down. Ran framework overhead benchmarks. ~7% win on empty(), 0.5-1.5% regression on InPlace, ~2% win on OutOfPlace. Seems like both the In/Out of place ones are likely to be noise because they don't exercise empty? Reviewed By: bhosmer Differential Revision: D24914706 fbshipit-source-id: 916771b335143f9b4ec9fae0d8118222ab6e8659 --- aten/src/ATen/Utils.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/aten/src/ATen/Utils.cpp b/aten/src/ATen/Utils.cpp index a2e5a82c5d06..26fc7dabfd73 100644 --- a/aten/src/ATen/Utils.cpp +++ b/aten/src/ATen/Utils.cpp @@ -57,8 +57,12 @@ Tensor empty_cpu( tensor.unsafeGetTensorImpl()->set_sizes_contiguous(size); } - auto memory_format = memory_format_opt.value_or(MemoryFormat::Contiguous); - tensor.unsafeGetTensorImpl()->empty_tensor_restride(memory_format); + if (memory_format_opt.has_value()) { + // Restriding a just-created empty contiguous tensor does nothing. + if (*memory_format_opt != MemoryFormat::Contiguous) { + tensor.unsafeGetTensorImpl()->empty_tensor_restride(*memory_format_opt); + } + } return tensor; } From 6c1b405a3bc5392081569f1530d74e8459e0e211 Mon Sep 17 00:00:00 2001 From: Ivan Yashchuk Date: Fri, 11 Dec 2020 14:13:05 -0800 Subject: [PATCH 191/250] Updated derivative rules for complex QR decomposition (#48489) Summary: Updated `qr_backward` to work correctly for complex-valued inputs. Added `torch.qr` to list of complex tests. The previous implementation for real-valued differentiation used equation 42 from https://arxiv.org/abs/1001.1654 The current implementation is a bit simpler but the result for the real-valued input case is the same and all tests still pass. Derivation of complex-valued QR differentiation https://giggleliu.github.io/2019/04/02/einsumbp.html Ref. 
https://github.com/pytorch/pytorch/issues/33152 Pull Request resolved: https://github.com/pytorch/pytorch/pull/48489 Reviewed By: bdhirsh Differential Revision: D25272344 Pulled By: albanD fbshipit-source-id: b53c1fca1683f4aee5f4d5ce3cab9e559170e7cf --- test/test_autograd.py | 2 +- tools/autograd/gen_variable_type.py | 2 +- torch/csrc/autograd/FunctionsManual.cpp | 79 +++++++++++-------------- 3 files changed, 37 insertions(+), 46 deletions(-) diff --git a/test/test_autograd.py b/test/test_autograd.py index 796860cf639f..0d99169f4d65 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -4927,7 +4927,7 @@ def run_functional_checks(test_case, test_name, name, apply_fn, run_grad_checks, 'cosh', '__rmul__', 'sgn', 'abs', 'dot', 'vdot', 'tensor_split', 'matmul', 'bmm', 'mv', 'ger', 'diagonal', 'atan', 'angle', 'tanh', 'fill_', 'sub', 'exp', 'mean', 'inverse', 'triangular_solve', 'solve', 'addcmul', - 'addcdiv', 'linalg.tensorinv', 'matrix_exp'] + separate_complex_tests + 'addcdiv', 'linalg.tensorinv', 'matrix_exp', 'qr', ] + separate_complex_tests def add_test( name, diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index 123b180f1774..a17e222f8cf1 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -78,7 +78,7 @@ 'bmm', 'diagonal', 'alias', 'atan', 'log', 'log10', 'log1p', 'log2', 'reciprocal', 'tan', 'pow', 'rsqrt', 'tanh', 'tanh_backward', 'asinh', 'acosh', 'take', 'fill_', 'exp', 'nonzero', 'mean', 'inverse', 'solve', 'linalg_cholesky', 'addcmul', 'addcdiv', - 'matrix_exp', 'linalg_eigh', 'cholesky_solve', + 'matrix_exp', 'linalg_eigh', 'cholesky_solve', 'qr', '_fft_c2c', '_fft_r2c', } diff --git a/torch/csrc/autograd/FunctionsManual.cpp b/torch/csrc/autograd/FunctionsManual.cpp index 4d71d6759e0c..6da1a7e5e934 100644 --- a/torch/csrc/autograd/FunctionsManual.cpp +++ b/torch/csrc/autograd/FunctionsManual.cpp @@ -2006,67 +2006,58 @@ Tensor qr_backward(const std::vector &grads, const Te const Tensor& A, const Tensor& Q, const Tensor& R) -> Tensor { - // For square and deep (tall) case we refer - // Walter, S.F and Lehmann, L., Algorithmic Differentiation of Linear - // Algebra Functions with Application in Optimum Experimental Design - // (Extended Version) The derivative for the QR decomposition is adapted - // from Eq. 42 of the above reference. - - // Compute R (R')^{T} + // For square and deep (tall) case we refer: + // Matthias Seeger, Asmus Hetzel, Zhenwen Dai, Eric Meissner, Neil D. Lawrence (2018). Auto-Differentiating Linear Algebra. + // https://arxiv.org/abs/1710.08717 Section 4.3 LQ Decomposition (Note that LQ decomposition is the transpose of QR decomposition) + // Hai-Jun Liao, Jin-Guo Liu, Lei Wang, Tao Xiang (2019). Differentiable Programming Tensor Networks. + // https://arxiv.org/abs/1903.09650 Section 3. QR factorization + // For derivations of complex-valued input case, see https://giggleliu.github.io/2019/04/02/einsumbp.html + + // Compute R grad_R^H Tensor R_term; if (grad_R.defined()) { - R_term = at::matmul(R, grad_R.transpose(-2, -1)); + R_term = at::matmul(R, grad_R.conj().transpose(-2, -1)); } else { // R is ... x N x N, grad_R is ... x N x N and grad_R.T is ... x N x N R_term = at::zeros_like(R, LEGACY_CONTIGUOUS_MEMORY_FORMAT); } - // Compute Q^{T} Q' + // Compute grad_Q^H Q Tensor Q_term; if (grad_Q.defined()) { - Q_term = at::matmul(Q.transpose(-2, -1), grad_Q); + Q_term = at::matmul(grad_Q.conj().transpose(-2, -1), Q); } else { // Q is ... x M x N, Q.T is ... 
x N x M and grad_Q is ... x M x N Q_term = at::zeros_like(R, LEGACY_CONTIGUOUS_MEMORY_FORMAT); } - // We want to compute: (rhs_solve_1 . R^{-T}) - // Note that (rhs_solve_1 . R^{-T}) = (R^{-1} . rhs_solve_1^{T})^{T} + Tensor M = R_term - Q_term; + + // Compute M = (tril(M) + tril(M).conj().transpose(-2, -1)) * 0.5 Identity + Tensor M_tril = at::tril(M); + M = M_tril + M_tril.conj().transpose(-2, -1); + M.diagonal(0, -2, -1).mul_(0.5); + + Tensor rhs_term; + if (grad_Q.defined()) { + rhs_term = grad_Q + at::matmul(Q, M); + } else { + rhs_term = at::matmul(Q, M); + } + + // We want to compute: (rhs_term @ R^{-H}) + // Note that (rhs_term @ R^{-H}) = (R^{-1} @ rhs_solve_1^H)^H // Since R is upper triangular, we can do this using - // triangular_solve(rhs_solve_1^{T}, R)^{T} - auto rhs_solve_1 = - R_term - R_term.transpose(-2, -1) + Q_term - Q_term.transpose(-2, -1); - rhs_solve_1 = at::tril(rhs_solve_1, /*k=*/-1); - Tensor solve_soln_1; - std::tie(solve_soln_1, std::ignore) = at::triangular_solve( - rhs_solve_1.transpose(-2, -1), + // triangular_solve(rhs_term^H, R)^H + Tensor grad_A; + std::tie(grad_A, std::ignore) = at::triangular_solve( + rhs_term.conj().transpose(-2, -1), R, /*upper=*/true, /*transpose=*/false, /*unitriangular=*/false); - Tensor grad_A; - if (grad_R.defined()) { - grad_A = at::matmul(Q, solve_soln_1.transpose(-2, -1) + grad_R); - } else { - grad_A = at::matmul(Q, solve_soln_1.transpose(-2, -1)); - } - // Successive computations involve computation of QQ^{T} which is identity when A is square - if (A.size(-1) != A.size(-2)) { - Tensor rhs_solve_2; - // We use the same trick from above for this computation - if (grad_Q.defined()) { - rhs_solve_2 = grad_Q - at::matmul(Q, Q_term); - } else { - rhs_solve_2 = -at::matmul(Q, Q_term); - } - Tensor solve_soln_2; - std::tie(solve_soln_2, std::ignore) = at::triangular_solve(rhs_solve_2.transpose(-2, -1), R, - /*upper=*/true, /*transpose=*/false, - /*unitriangular=*/false); - grad_A.add_(solve_soln_2.transpose(-2, -1)); - } - return grad_A; + return grad_A.conj().transpose(-2, -1); }; auto m = self.size(-2); @@ -2087,7 +2078,7 @@ Tensor qr_backward(const std::vector &grads, const Te // grad_R = [grad_U | grad_V] and grad_A = [grad_X | grad_Y]. // To obtain grad_X we reuse the gradient formula from the square case. // Formulae: grad_X = square_case_grad(grad_Q_prime, grad_U, Q, U), - // where grad_Q_prime = grad_Q + Y @ grad_V.T + // where grad_Q_prime = grad_Q + Y @ grad_V^H // and grad_Y = Q @ grad_V. // Then concatenate grads to get grad_A = [grad_X | grad_Y]. @@ -2099,8 +2090,8 @@ Tensor qr_backward(const std::vector &grads, const Te grad_V = grad_R.narrow(-1, m, n - m); // reuse grad_R to store grad_U grad_R = grad_R.narrow(-1, 0, m); - // grad_Q_prime starts with the value of Y @ grad_V.T - grad_Q_prime = at::matmul(Y, grad_V.transpose(-2, -1)); + // grad_Q_prime starts with the value of Y @ grad_V^H + grad_Q_prime = at::matmul(Y, grad_V.conj().transpose(-2, -1)); } else { // when grad_R is not defined then grad_V and grad_Q_prime // get initialized with zeros From c6147ae4c99b13b1fbc8fb1b36deaca941bfd1c6 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Fri, 11 Dec 2020 14:16:22 -0800 Subject: [PATCH 192/250] [PyTorch] Fix getCustomClassType() perf (#48981) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48981 1) It was copying the entire hash table every time. 2) We don't need to do a hash lookup at all. 
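
For illustration only, a minimal self-contained sketch of the caching pattern this change applies. The names here (registry, lookupType, TypePtr) are hypothetical stand-ins, not the actual c10 types; the two points are (1) take the registration map by reference instead of by value, and (2) memoize the looked-up value in a function-local static, which is safe because entries are never unregistered:

    #include <memory>
    #include <stdexcept>
    #include <string>
    #include <typeindex>
    #include <typeinfo>
    #include <unordered_map>

    // Hypothetical stand-ins for the real registry and class-type pointer.
    using TypePtr = std::shared_ptr<const std::string>;

    std::unordered_map<std::type_index, TypePtr>& registry() {
      static std::unordered_map<std::type_index, TypePtr> map;
      return map;
    }

    template <typename T>
    TypePtr lookupTypeImpl() {
      auto& map = registry();  // reference, so the whole table is not copied
      auto it = map.find(std::type_index(typeid(T)));
      if (it == map.end()) {
        throw std::runtime_error("type not registered");
      }
      return it->second;
    }

    template <typename T>
    const TypePtr& lookupType() {
      // Entries are never removed, so the result can be cached per T;
      // the hash lookup then runs only on the first call for each T.
      static TypePtr cache = lookupTypeImpl<T>();
      return cache;
    }
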
ghstack-source-id: 118164406 Reviewed By: dzhulgakov Differential Revision: D25385543 fbshipit-source-id: 6be95c742d6713345c51859ce36a7791a9e2e3f0 --- aten/src/ATen/core/ivalue.cpp | 2 +- aten/src/ATen/core/ivalue.h | 14 +++++++++----- aten/src/ATen/core/ivalue_inl.h | 27 +++++++++++++++------------ aten/src/ATen/core/jit_type.h | 19 ++++++++++++------- 4 files changed, 37 insertions(+), 25 deletions(-) diff --git a/aten/src/ATen/core/ivalue.cpp b/aten/src/ATen/core/ivalue.cpp index 6b8f4412cbf7..60382e37b6ff 100644 --- a/aten/src/ATen/core/ivalue.cpp +++ b/aten/src/ATen/core/ivalue.cpp @@ -22,7 +22,7 @@ namespace ivalue { // This is in ivalue.cpp because we need to access Type::annotation_str, which // is declared in jit_type.h -void checkCustomClassType(TypePtr expected_type, TypePtr actual_type) { +void checkCustomClassType(const Type* expected_type, const Type* actual_type) { // NB: doing pointer comparison here // If in the future there ever arises a need to call operator== on custom class // Type's, this needs to be changed! diff --git a/aten/src/ATen/core/ivalue.h b/aten/src/ATen/core/ivalue.h index 9ea18dc8482d..d2e72933b532 100644 --- a/aten/src/ATen/core/ivalue.h +++ b/aten/src/ATen/core/ivalue.h @@ -949,8 +949,8 @@ TORCH_API ska::flat_hash_map& getCustomClassTypeMap(); template -c10::ClassTypePtr getCustomClassType() { - auto tmap = c10::getCustomClassTypeMap(); +c10::ClassTypePtr getCustomClassTypeImpl() { + auto& tmap = c10::getCustomClassTypeMap(); auto res = tmap.find(std::type_index(typeid(T))); if (res == tmap.end()) { throw c10::Error("Can't find class id in custom class type map", ""); @@ -959,9 +959,13 @@ c10::ClassTypePtr getCustomClassType() { } template -inline bool isCustomClassRegistered() { - auto tmap = c10::getCustomClassTypeMap(); - return tmap.find(std::type_index(typeid(T))) != tmap.end(); +const c10::ClassTypePtr& getCustomClassType() { + // Classes are never unregistered from getCustomClassTypeMap and the + // hash lookup can be a hot path, so just cache. + // For the same reason, it's fine If this ends up getting duplicated across + // DSO boundaries for whatever reason. 
+ static c10::ClassTypePtr cache = getCustomClassTypeImpl(); + return cache; } TORCH_API std::unordered_map>& diff --git a/aten/src/ATen/core/ivalue_inl.h b/aten/src/ATen/core/ivalue_inl.h index 8858d0047abd..b3b53aed994c 100644 --- a/aten/src/ATen/core/ivalue_inl.h +++ b/aten/src/ATen/core/ivalue_inl.h @@ -172,7 +172,7 @@ inline at::Generator IValue::toGenerator() const& { namespace ivalue { void CAFFE2_API -checkCustomClassType(TypePtr expected_type, TypePtr actual_type); +checkCustomClassType(const Type* expected_type, const Type* actual_type); template using Shared = c10::intrusive_ptr; @@ -820,8 +820,8 @@ c10::intrusive_ptr IValue::toCustomClass() && { obj->slots().size() == 1, "Tried to cast IValue to custom class but it did " "not contain a custom class!"); - auto expected_type = c10::getCustomClassType>(); - ivalue::checkCustomClassType(expected_type, type()); + const Type* expected_type = c10::getCustomClassType>().get(); + ivalue::checkCustomClassType(expected_type, type().get()); auto userObj = c10::static_intrusive_pointer_cast(obj->getSlot(0).toCapsule()); return userObj; @@ -838,8 +838,8 @@ c10::intrusive_ptr IValue::toCustomClass() const& { obj->slots().size() == 1, "Tried to cast IValue to custom class but it did " "not contain a custom class!"); - auto expected_type = c10::getCustomClassType>(); - ivalue::checkCustomClassType(expected_type, type()); + const Type* expected_type = c10::getCustomClassType>().get(); + ivalue::checkCustomClassType(expected_type, type().get()); auto userObj = c10::static_intrusive_pointer_cast(obj->getSlot(0).toCapsule()); return userObj; @@ -1149,13 +1149,16 @@ template < typename T, std::enable_if_t::value, int>> IValue::IValue(c10::intrusive_ptr custom_class) { - if (!c10::isCustomClassRegistered>()) { - throw c10::Error( - "Trying to instantiate a class that isn't a registered custom class: " + - std::string(c10::util::get_fully_qualified_type_name()), - ""); - } - auto classType = c10::getCustomClassType>(); + TypePtr classType = []() { + try { + return c10::getCustomClassType>(); + } catch (const c10::Error&) { + throw c10::Error( + "Trying to instantiate a class that isn't a registered custom class: " + + std::string(c10::util::get_fully_qualified_type_name()), + ""); + } + }(); auto ivalue_obj = c10::ivalue::Object::create( c10::StrongTypePtr(nullptr, classType), /*num_slots=*/1); ivalue_obj->setSlot(0, IValue::make_capsule(std::move(custom_class))); diff --git a/aten/src/ATen/core/jit_type.h b/aten/src/ATen/core/jit_type.h index 1736ea91d71e..7fcd5c2d17e9 100644 --- a/aten/src/ATen/core/jit_type.h +++ b/aten/src/ATen/core/jit_type.h @@ -1727,13 +1727,18 @@ namespace detail { template struct getTypePtr_ final { static TypePtr call() { - TORCH_CHECK( - isCustomClassRegistered(), - "Type ", - c10::util::get_fully_qualified_type_name(), - " could not be converted to any of the known types." - ); - auto res = getCustomClassType(); + TypePtr res = []() { + try { + return getCustomClassType(); + } catch(const c10::Error&) { + TORCH_CHECK( + false, + "Type ", + c10::util::get_fully_qualified_type_name(), + " could not be converted to any of the known types." + ); + } + }(); return std::dynamic_pointer_cast(std::move(res)); } }; From df027bfd2c34503006ab985348e7205799c3f0fc Mon Sep 17 00:00:00 2001 From: Pritam Damania Date: Fri, 11 Dec 2020 14:51:51 -0800 Subject: [PATCH 193/250] Modify Pipe to return an RRef. 
(#47829) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/47829 As per proposal in https://github.com/pytorch/pytorch/issues/44827, the API needs to return an RRef to support inter-host pipelining. For now, we just return a local RRef and only support pipeline on a single host. But having this change in the API upfront ensures we don't make any BC breaking changes later. ghstack-source-id: 118366784 Test Plan: waitforbuildbot Reviewed By: rohan-varma Differential Revision: D24914022 fbshipit-source-id: e711e7d12efa45645f752f0e5e776a3d845f3ef5 --- test/distributed/_pipeline/sync/conftest.py | 16 +++++ .../_pipeline/sync/skip/test_gpipe.py | 12 ++-- .../_pipeline/sync/skip/test_leak.py | 6 +- test/distributed/_pipeline/sync/test_bugs.py | 13 ++-- .../_pipeline/sync/test_inplace.py | 12 ++-- test/distributed/_pipeline/sync/test_pipe.py | 66 +++++++++---------- .../_pipeline/sync/test_transparency.py | 4 +- test/run_test.py | 22 +++++++ torch/distributed/_pipeline/sync/pipe.py | 11 ++-- 9 files changed, 101 insertions(+), 61 deletions(-) diff --git a/test/distributed/_pipeline/sync/conftest.py b/test/distributed/_pipeline/sync/conftest.py index 315431d0b644..561c41d11350 100644 --- a/test/distributed/_pipeline/sync/conftest.py +++ b/test/distributed/_pipeline/sync/conftest.py @@ -5,7 +5,9 @@ # This source code is licensed under the BSD license found in the # LICENSE file in the root directory of this source tree. import pytest +import tempfile import torch +from torch.distributed import rpc @pytest.fixture(autouse=True) @@ -35,3 +37,17 @@ def cuda_sleep(seconds): def pytest_report_header(): return f"torch: {torch.__version__}" + +@pytest.fixture +def setup_rpc(scope="session"): + file = tempfile.NamedTemporaryFile() + rpc.init_rpc( + name="worker0", + rank=0, + world_size=1, + rpc_backend_options=rpc.TensorPipeRpcBackendOptions( + init_method="file://{}".format(file.name), + ) + ) + yield + rpc.shutdown() diff --git a/test/distributed/_pipeline/sync/skip/test_gpipe.py b/test/distributed/_pipeline/sync/skip/test_gpipe.py index 96ecd84e0d18..90ecd7613d67 100644 --- a/test/distributed/_pipeline/sync/skip/test_gpipe.py +++ b/test/distributed/_pipeline/sync/skip/test_gpipe.py @@ -17,7 +17,7 @@ @pytest.mark.skipif(not torch.cuda.is_available(), reason="cuda required") @pytest.mark.parametrize("balance", [[3], [1, 2], [2, 1], [1, 1, 1]], ids=["3", "1:2", "2:1", "1:1:1"]) @pytest.mark.parametrize("checkpoint", ["never", "always", "except_last"]) -def test_1to3(balance, checkpoint): +def test_1to3(balance, checkpoint, setup_rpc): if torch.cuda.device_count() < len(balance): pytest.skip("at least %d cuda devices required" % len(balance)) @@ -61,14 +61,14 @@ def forward(self, input): input = torch.rand(30, 3, 224, 224, device=in_device, requires_grad=True) output = model(input) - loss = output.mean() + loss = output.local_value().mean() loss.backward() - assert torch.allclose(output.norm(), torch.tensor(1039.0, device=out_device), atol=6e-1) + assert torch.allclose(output.local_value().norm(), torch.tensor(1039.0, device=out_device), atol=6e-1) assert torch.allclose(input.grad.norm(), torch.tensor(0.0004533053, device=in_device)) -def test_none_skip(): +def test_none_skip(setup_rpc): @skippable(stash=["none"]) class Stash(nn.Module): def forward(self, input): @@ -102,7 +102,7 @@ def assert_grad_fn_is_not_portal(grad_fn, visited=None): for next_grad_fn, _ in grad_fn.next_functions: assert_grad_fn_is_not_portal(next_grad_fn, visited) - assert_grad_fn_is_not_portal(output.grad_fn) + 
assert_grad_fn_is_not_portal(output.local_value().grad_fn) - output.sum().backward() + output.local_value().sum().backward() assert input.grad.mean().item() == 1 diff --git a/test/distributed/_pipeline/sync/skip/test_leak.py b/test/distributed/_pipeline/sync/skip/test_leak.py index 31c4ea13b9f1..7d03a4e9db49 100644 --- a/test/distributed/_pipeline/sync/skip/test_leak.py +++ b/test/distributed/_pipeline/sync/skip/test_leak.py @@ -29,7 +29,7 @@ def forward(self, input): @pytest.mark.parametrize("train", [True, False], ids=["train", "eval"]) @pytest.mark.parametrize("checkpoint", ["always", "except_last", "never"]) -def test_delete_portal_tensor(train, checkpoint): +def test_delete_portal_tensor(train, checkpoint, setup_rpc): # Without checkpointing: # +- Stash --+ +--- Pop ----+ - - - layers # | 2,blue,1 |--| 1,orange,0 | - - - tensor_life and portal function @@ -97,7 +97,7 @@ def forward(self, input): if train: model.train() - output = model(input) + output = model(input).local_value() output.norm().backward() else: model.eval() @@ -106,7 +106,7 @@ def forward(self, input): @pytest.mark.parametrize("train", [True, False], ids=["train", "eval"]) -def test_no_portal_without_pipe(train, monkeypatch): +def test_no_portal_without_pipe(train, monkeypatch, setup_rpc): def deny(*args, **kwargs): raise AssertionError("tried to create Portal without Pipe") diff --git a/test/distributed/_pipeline/sync/test_bugs.py b/test/distributed/_pipeline/sync/test_bugs.py index 4f5346a837b5..a66b7d006ae1 100644 --- a/test/distributed/_pipeline/sync/test_bugs.py +++ b/test/distributed/_pipeline/sync/test_bugs.py @@ -12,7 +12,7 @@ from torch.distributed._pipeline.sync import Pipe -def test_python_autograd_function(): +def test_python_autograd_function(setup_rpc): # A Python autograd function might fail with this error: # # RuntimeError: Returning Variables sharing storage with other Variables @@ -41,10 +41,10 @@ def forward(self, input): x = torch.rand(42) y = model(x) - assert torch.allclose(x, y) + assert torch.allclose(x, y.local_value()) -def test_exception_no_hang(): +def test_exception_no_hang(setup_rpc): # In v0.0.2, once a failed partition receives a normal message # (non-closing) for the next micro-batch, a hang occured. The reason was # that a failed partition didn't call in_queue.task_done() on a normal @@ -69,7 +69,7 @@ def forward(self, x): @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="2 cuda devices required") -def test_tuple_wait(cuda_sleep): +def test_tuple_wait(cuda_sleep, setup_rpc): # In v0.0.3, Wait is applied to only the first tensor on a micro-batch. 
# Under this behavior, if checkpointing was disabled, there's a possibility # that gradient accumulations on other tensors are not synchronized @@ -113,7 +113,7 @@ def forward(self, triple): b = torch.rand(1024, 3, 32, 32, device=0, requires_grad=True) y = model((a, b)) - y.norm().backward() + y.local_value().norm().backward() torch.cuda.synchronize(0) torch.cuda.synchronize(1) @@ -121,7 +121,7 @@ def forward(self, triple): assert torch.isclose(b.grad.norm().cpu(), torch.tensor(5.000)) -def test_parallel_randoms(): +def test_parallel_randoms(setup_rpc): class Dropouts(nn.Module): def forward(self, x): for _ in range(100): @@ -133,6 +133,7 @@ def forward(self, x): x = torch.rand(10, 10, requires_grad=True) model = Pipe(model, chunks=10, checkpoint="always") y = model(x) + y = y.local_value() y.norm().backward() assert y.to(torch.bool).tolist() == x.grad.to(torch.bool).tolist() diff --git a/test/distributed/_pipeline/sync/test_inplace.py b/test/distributed/_pipeline/sync/test_inplace.py index 17b3dac4eca8..3b842dbfb9ab 100644 --- a/test/distributed/_pipeline/sync/test_inplace.py +++ b/test/distributed/_pipeline/sync/test_inplace.py @@ -11,12 +11,12 @@ from torch.distributed._pipeline.sync import Pipe -def test_inplace_on_requires_grad(): +def test_inplace_on_requires_grad(setup_rpc): model = nn.Sequential(nn.Linear(1, 1), nn.ReLU(inplace=True)) model = Pipe(model, checkpoint="always") x = torch.rand(1) - y = model(x) + y = model(x).local_value() message = r"a leaf Variable that requires grad .* used in an in-place operation." with pytest.raises(RuntimeError, match=message): @@ -24,14 +24,14 @@ def test_inplace_on_requires_grad(): @pytest.mark.xfail(strict=True) -def test_inplace_on_not_requires_grad(): +def test_inplace_on_not_requires_grad(setup_rpc): # In-place operation on a tensor not requiring grad doesn't cause a # RuntimeError. Currently, we cannot detect this case. model = nn.Sequential(nn.ReLU(inplace=True)) model = Pipe(model, [1], devices=["cpu"], checkpoint="always") x = torch.rand(1) - y = model(x) + y = model(x).local_value() del model message = r"a leaf Variable that requires grad .* used in an in-place operation." @@ -40,7 +40,7 @@ def test_inplace_on_not_requires_grad(): @pytest.mark.xfail(strict=True) -def test_inplace_incorrect_grad(): +def test_inplace_incorrect_grad(setup_rpc): class M(nn.Module): def forward(self, foo_bar): # 'foo' requires grad but 'bar' does not. 
In-place operation on @@ -62,7 +62,7 @@ def forward(self, foo_bar): foo = torch.tensor([1.0], requires_grad=True) bar = torch.tensor([1.0]) - output = model((foo, bar)) + output = model((foo, bar)).local_value() del model output.backward() diff --git a/test/distributed/_pipeline/sync/test_pipe.py b/test/distributed/_pipeline/sync/test_pipe.py index c0992c7bc0ed..8b87fa3d31f6 100644 --- a/test/distributed/_pipeline/sync/test_pipe.py +++ b/test/distributed/_pipeline/sync/test_pipe.py @@ -68,7 +68,7 @@ def test_chunks_less_than_1(): with pytest.raises(ValueError): Pipe(model, chunks=-1) -def test_batch_size_indivisible(): +def test_batch_size_indivisible(setup_rpc): model = nn.Sequential(nn.Linear(1, 1)) model = Pipe(model, chunks=4) @@ -79,7 +79,7 @@ def test_batch_size_indivisible(): assert not record -def test_batch_size_small(): +def test_batch_size_small(setup_rpc): model = nn.Sequential(nn.Linear(1, 1)) model = Pipe(model, chunks=4) @@ -90,7 +90,7 @@ def test_batch_size_small(): assert not record -def test_checkpoint_mode(): +def test_checkpoint_mode(setup_rpc): def count_grad_fn(grad_fn, name, visited=None): if visited is None: visited = set() @@ -119,9 +119,9 @@ def count_grad_fn(grad_fn, name, visited=None): except_last_output = except_last(input) never_output = never(input) - assert count_grad_fn(always_output.grad_fn, "CheckpointBackward") == 2 - assert count_grad_fn(except_last_output.grad_fn, "CheckpointBackward") == 1 - assert count_grad_fn(never_output.grad_fn, "CheckpointBackward") == 0 + assert count_grad_fn(always_output.local_value().grad_fn, "CheckpointBackward") == 2 + assert count_grad_fn(except_last_output.local_value().grad_fn, "CheckpointBackward") == 1 + assert count_grad_fn(never_output.local_value().grad_fn, "CheckpointBackward") == 0 def test_checkpoint_mode_invalid(): @@ -140,7 +140,7 @@ def test_checkpoint_mode_when_chunks_1(): Pipe(model, chunks=1, checkpoint="never") -def test_checkpoint_eval(): +def test_checkpoint_eval(setup_rpc): model = nn.Sequential(nn.Linear(1, 1)) model = Pipe(model, chunks=2) input = torch.rand(2, 1) @@ -157,16 +157,16 @@ def find_grad_fn(grad_fn, name): model.train() train_output = model(input) - assert find_grad_fn(train_output.grad_fn, "CheckpointBackward") - assert find_grad_fn(train_output.grad_fn, "RecomputeBackward") + assert find_grad_fn(train_output.local_value().grad_fn, "CheckpointBackward") + assert find_grad_fn(train_output.local_value().grad_fn, "RecomputeBackward") model.eval() eval_output = model(input) - assert not find_grad_fn(eval_output.grad_fn, "CheckpointBackward") - assert not find_grad_fn(eval_output.grad_fn, "RecomputeBackward") + assert not find_grad_fn(eval_output.local_value().grad_fn, "CheckpointBackward") + assert not find_grad_fn(eval_output.local_value().grad_fn, "RecomputeBackward") -def test_checkpoint_non_float_input(): +def test_checkpoint_non_float_input(setup_rpc): class ForkNonFloat(nn.Module): def forward(self, input): return (input * 2, torch.tensor([False])) @@ -183,7 +183,7 @@ def forward(self, input): output.backward() -def test_no_grad(): +def test_no_grad(setup_rpc): model = nn.Sequential(nn.Linear(1, 1)) model = Pipe(model, chunks=2) input = torch.rand(2, 1) @@ -206,7 +206,7 @@ def hook(module, input, output): assert latent.grad_fn is None -def test_exception(): +def test_exception(setup_rpc): class ExpectedException(Exception): pass @@ -221,7 +221,7 @@ def forward(self, *_): model(torch.rand(1)) -def test_exception_early_stop_asap(): +def test_exception_early_stop_asap(setup_rpc): """Even 
the first partitions have finished to process, the partition before the failed partition should be killed as soon as possible. """ @@ -258,7 +258,7 @@ def forward(self, x): assert counter == 2 -def test_input_pair(): +def test_input_pair(setup_rpc): class Two(nn.Module): def __init__(self): super().__init__() @@ -275,7 +275,7 @@ def forward(self, a_and_b): a = torch.rand(10, 1, requires_grad=True) b = torch.rand(10, 1, requires_grad=True) - a_out, b_out = model((a, b)) + a_out, b_out = model((a, b)).local_value() loss = (a_out + b_out).mean() loss.backward() @@ -283,7 +283,7 @@ def forward(self, a_and_b): assert b.grad is not None -def test_input_singleton(): +def test_input_singleton(setup_rpc): class One(nn.Module): def __init__(self): super().__init__() @@ -298,7 +298,7 @@ def forward(self, only_a): a = torch.rand(10, 1, requires_grad=True) - (a_out,) = model((a,)) + (a_out,) = model((a,)).local_value() loss = a_out.mean() loss.backward() @@ -306,7 +306,7 @@ def forward(self, only_a): assert a.grad is not None -def test_input_varargs(): +def test_input_varargs(setup_rpc): model = nn.Sequential(nn.Linear(1, 1)) model = Pipe(model) @@ -318,7 +318,7 @@ def test_input_varargs(): model(a, b) -def test_non_tensor(): +def test_non_tensor(setup_rpc): class NonTensor(nn.Module): def forward(self, _): return "hello" @@ -336,7 +336,7 @@ def forward(self, _): model("hello") -def test_non_tensor_tuple(): +def test_non_tensor_tuple(setup_rpc): class NonTensorTuple(nn.Module): def forward(self, x): return (x, "hello") @@ -355,7 +355,7 @@ def forward(self, x): @pytest.mark.parametrize("checkpoint", ["never", "always", "except_last"]) -def test_deferred_batch_norm(checkpoint): +def test_deferred_batch_norm(checkpoint, setup_rpc): bn = nn.BatchNorm2d(3) pipe_bn = deepcopy(bn) pipe = Pipe( @@ -363,7 +363,7 @@ def test_deferred_batch_norm(checkpoint): ) x = torch.rand(4, 3, 10, 10) - pipe(x).mean().backward() + pipe(x).local_value().mean().backward() bn(x).mean().backward() assert torch.allclose(pipe[0].running_mean, bn.running_mean, atol=1e-4) @@ -371,7 +371,7 @@ def test_deferred_batch_norm(checkpoint): @pytest.mark.parametrize("checkpoint", ["never", "always"]) -def test_deferred_batch_norm_params(checkpoint): +def test_deferred_batch_norm_params(checkpoint, setup_rpc): bn = nn.BatchNorm2d(3) pipe_bn = deepcopy(bn) pipe = Pipe( @@ -379,7 +379,7 @@ def test_deferred_batch_norm_params(checkpoint): ) x = torch.rand(4, 3, 10, 10) - pipe(x).mean().backward() + pipe(x).local_value().mean().backward() bn(x).mean().backward() assert pipe[0].weight.grad is not None @@ -455,13 +455,13 @@ def test_deny_moving(): model.to(dtype=torch.float) -def test_empty_module(): +def test_empty_module(setup_rpc): # Empty sequential module is not illegal. model = nn.Sequential() model = Pipe(model) - assert model(torch.tensor(42)) == torch.tensor(42) - assert model((torch.tensor(42),)) == (torch.tensor(42),) + assert model(torch.tensor(42)).local_value() == torch.tensor(42) + assert model((torch.tensor(42),)).local_value() == (torch.tensor(42),) # But only tensor or tensors is legal in Pipe. 
with pytest.raises(TypeError): @@ -518,7 +518,7 @@ def __init__(self, param1, param2): @pytest.mark.skipif(not torch.cuda.is_available(), reason="cuda required") @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Need atleast two GPUs") -def test_verify_nested_modules(): +def test_verify_nested_modules(setup_rpc): model = nn.Sequential( nn.Sequential( nn.Linear(32, 16).cuda(0), @@ -532,8 +532,8 @@ def test_verify_nested_modules(): pipe = Pipe(model) out = pipe(torch.rand(10, 32).cuda(0)) - assert out.device == torch.device("cuda:1") - assert out.size() == torch.Size([10, 2]) + assert out.local_value().device == torch.device("cuda:1") + assert out.local_value().size() == torch.Size([10, 2]) def test_verify_module_duplicate_parameters_on_same_device(): class Surrogate(nn.Module): @@ -547,7 +547,7 @@ def __init__(self, module): Pipe(model) -def test_forward_lockstep(): +def test_forward_lockstep(setup_rpc): timeline = [] class DelayedLog(nn.Module): diff --git a/test/distributed/_pipeline/sync/test_transparency.py b/test/distributed/_pipeline/sync/test_transparency.py index 3d2c77e8fef4..56ad86de081b 100644 --- a/test/distributed/_pipeline/sync/test_transparency.py +++ b/test/distributed/_pipeline/sync/test_transparency.py @@ -10,7 +10,7 @@ from torch.distributed._pipeline.sync import Pipe -def test_simple_linears(): +def test_simple_linears(setup_rpc): def sum_grad(parameters): return sum([p.grad.sum() for p in parameters if p.grad is not None]) @@ -33,7 +33,7 @@ def zero_grad(parameters): # With Pipe model = Pipe(model, chunks=4) - outputs = model(inputs) + outputs = model(inputs).local_value() loss = outputs.mean() loss.backward() diff --git a/test/run_test.py b/test/run_test.py index 3687459a4a70..54cc33ebc484 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -162,6 +162,28 @@ 'distributed/rpc/test_process_group_agent', 'distributed/rpc/test_tensorpipe_agent', 'distributed/test_distributed_fork', + 'distributed/_pipeline/sync/skip/test_api', + 'distributed/_pipeline/sync/skip/test_gpipe', + 'distributed/_pipeline/sync/skip/test_inspect_skip_layout', + 'distributed/_pipeline/sync/skip/test_leak', + 'distributed/_pipeline/sync/skip/test_portal', + 'distributed/_pipeline/sync/skip/test_stash_pop', + 'distributed/_pipeline/sync/skip/test_tracker', + 'distributed/_pipeline/sync/skip/test_verify_skippables', + 'distributed/_pipeline/sync/test_balance', + 'distributed/_pipeline/sync/test_bugs', + 'distributed/_pipeline/sync/test_checkpoint', + 'distributed/_pipeline/sync/test_copy', + 'distributed/_pipeline/sync/test_deferred_batch_norm', + 'distributed/_pipeline/sync/test_dependency', + 'distributed/_pipeline/sync/test_inplace', + 'distributed/_pipeline/sync/test_microbatch', + 'distributed/_pipeline/sync/test_phony', + 'distributed/_pipeline/sync/test_pipe', + 'distributed/_pipeline/sync/test_pipeline', + 'distributed/_pipeline/sync/test_stream', + 'distributed/_pipeline/sync/test_transparency', + 'distributed/_pipeline/sync/test_worker', ] ROCM_BLOCKLIST = [ diff --git a/torch/distributed/_pipeline/sync/pipe.py b/torch/distributed/_pipeline/sync/pipe.py index 92a3c301cc39..a097e8aa1a9e 100644 --- a/torch/distributed/_pipeline/sync/pipe.py +++ b/torch/distributed/_pipeline/sync/pipe.py @@ -10,6 +10,7 @@ import torch from torch import Tensor, nn +from torch.distributed.rpc import RRef import torch.autograd import torch.cuda @@ -305,7 +306,7 @@ def _ensure_copy_streams(self) -> List[List[AbstractStream]]: return self._copy_streams - def forward(self, input: TensorOrTensors) -> 
TensorOrTensors: # type: ignore + def forward(self, input: TensorOrTensors) -> RRef[TensorOrTensors]: # type: ignore """:class:`Pipe` is a fairly transparent module wrapper. It doesn't modify the input and output signature of the underlying module. But there's type restriction. Input and output have to be a @@ -313,10 +314,10 @@ def forward(self, input: TensorOrTensors) -> TensorOrTensors: # type: ignore applied at partition boundaries too. Args: - input (torch.Tensor or tensors): input mini-batch + input (torch.Tensor or Tuple[torch.Tensor, ...]): input mini-batch Returns: - tensor or tensors: output mini-batch + :class:`~torch.distributed.rpc.RRef` to the output of the mini-batch Raises: TypeError: input is not a tensor or tensors. @@ -326,7 +327,7 @@ def forward(self, input: TensorOrTensors) -> TensorOrTensors: # type: ignore if not self.devices: # Empty sequential module is not illegal. - return input + return RRef(input) # Divide a mini-batch into micro-batches. batches = microbatch.scatter(input, self.chunks) @@ -336,4 +337,4 @@ def forward(self, input: TensorOrTensors) -> TensorOrTensors: # type: ignore # Merge the micro-batches into one mini-batch. output = microbatch.gather(batches) - return output + return RRef(output) From 2f359e7d55f8de14fcd74231fc0f256d9fd8c607 Mon Sep 17 00:00:00 2001 From: Pritam Damania Date: Fri, 11 Dec 2020 14:57:16 -0800 Subject: [PATCH 194/250] Add tensorpipe agent tests to multigpu tests. (#49210) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49210 The RPC tests use multiple gpus in some cases (ex: DDP + RPC and Pipe + DDP). We should enable multigpu tests for this purpose. ghstack-source-id: 118366595 Test Plan: waitforbuildbot Reviewed By: rohan-varma Differential Revision: D25485506 fbshipit-source-id: eabbf442471ebc700b5986bc751879b9cf72b752 --- .jenkins/pytorch/multigpu-test.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/.jenkins/pytorch/multigpu-test.sh b/.jenkins/pytorch/multigpu-test.sh index 9a2c486610c4..fdf3c03e7f67 100755 --- a/.jenkins/pytorch/multigpu-test.sh +++ b/.jenkins/pytorch/multigpu-test.sh @@ -21,4 +21,5 @@ time python test/run_test.py --verbose -i distributed/test_jit_c10d time python test/run_test.py --verbose -i distributed/test_distributed_fork time python test/run_test.py --verbose -i distributed/test_c10d time python test/run_test.py --verbose -i distributed/test_c10d_spawn +time python test/run_test.py --verbose -i distributed/rpc/test_tensorpipe_agent assert_git_not_dirty From 53aa9b8c829edfc4194259f2c14b194171074cf9 Mon Sep 17 00:00:00 2001 From: James Reed Date: Fri, 11 Dec 2020 15:43:04 -0800 Subject: [PATCH 195/250] [FX] Move none assignments to same line (#49209) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49209 Test Plan: Imported from OSS Reviewed By: Chillee Differential Revision: D25484975 Pulled By: jamesr66a fbshipit-source-id: 44207be878f95ec9420e87af79833191d5cc0c7e --- torch/fx/graph.py | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/torch/fx/graph.py b/torch/fx/graph.py index ca4b8d64bb0e..f8bc96b73c40 100644 --- a/torch/fx/graph.py +++ b/torch/fx/graph.py @@ -617,10 +617,15 @@ def delete_unused_values(user : Node): not used in the remainder of the code are freed and the memory usage of the code is optimal. 
""" + if user.op == 'output': + body.append('\n') + return nodes_to_delete = user_to_last_uses.get(user, []) if len(nodes_to_delete): to_delete_str = ' = '.join([n.name for n in nodes_to_delete] + ['None']) - body.append(f'{to_delete_str}\n') + body.append(f'; {to_delete_str}\n') + else: + body.append('\n') def emit_node(node : Node): if node.op == 'placeholder': @@ -630,20 +635,20 @@ def emit_node(node : Node): free_vars.append(f'{node.target}{maybe_type_annotation}{maybe_default_arg}') raw_name = node.target.replace('*', '') if raw_name != node.name: - body.append(f'{node.name} = {raw_name}\n') + body.append(f'{node.name} = {raw_name}') return elif node.op == 'call_method': assert isinstance(node.target, str) body.append( f'{node.name} = {_format_target(repr(node.args[0]), node.target)}' - f'({_format_args(node.args[1:], node.kwargs)})\n') + f'({_format_args(node.args[1:], node.kwargs)})') return elif node.op == 'call_function': assert callable(node.target) # pretty print operators if node.target.__module__ == '_operator' and node.target.__name__ in magic_methods: assert isinstance(node.args, tuple) - body.append(f'{node.name} = {magic_methods[node.target.__name__].format(*(repr(a) for a in node.args))}\n') + body.append(f'{node.name} = {magic_methods[node.target.__name__].format(*(repr(a) for a in node.args))}') return qualified_name = get_qualified_name(node.target) register_modules_used(qualified_name) @@ -652,26 +657,28 @@ def emit_node(node : Node): isinstance(node.args[1], str) and \ node.args[1].isidentifier(): # pretty print attribute access - body.append(f'{node.name} = {_format_target(repr(node.args[0]), node.args[1])}\n') + body.append(f'{node.name} = {_format_target(repr(node.args[0]), node.args[1])}') return - body.append(f'{node.name} = {qualified_name}({_format_args(node.args, node.kwargs)})\n') + body.append(f'{node.name} = {qualified_name}({_format_args(node.args, node.kwargs)})') return elif node.op == 'call_module': assert isinstance(node.target, str) - body.append(f'{node.name} = {_format_target(root_module, node.target)}({_format_args(node.args, node.kwargs)})\n') + body.append(f'{node.name} = {_format_target(root_module, node.target)}({_format_args(node.args, node.kwargs)})') return elif node.op == 'get_attr': assert isinstance(node.target, str) - body.append(f'{node.name} = {_format_target(root_module, node.target)}\n') + body.append(f'{node.name} = {_format_target(root_module, node.target)}') return elif node.op == 'output': if node.type is not None: maybe_return_annotation = f" -> {type_repr(node.type)}" - body.append(f'return {repr(node.args[0])}\n') + body.append(f'return {repr(node.args[0])}') return raise NotImplementedError(f'node: {node.op} {node.target}') for node in self.nodes: + # NOTE: emit_node does not emit a string with newline. It depends + # on delete_unused_values to append one emit_node(node) delete_unused_values(node) From bfce69d6200ecf1261bf8d45657c802c56317365 Mon Sep 17 00:00:00 2001 From: Natalia Gimelshein Date: Fri, 11 Dec 2020 15:45:26 -0800 Subject: [PATCH 196/250] inline `has` function for DispatchKeySet (#49191) Summary: inlines `has` function for DispatchKeySet, that is frequently used in TensorImpl in calls such as `is_sparse`, `is_cuda` etc. This increases `empty` instruction count (1853228 -> 1937428) without appreciable effect on runtime, and noticeably reduces instruction counts for `copy_` and friends that have to rely on `is_sparse`, `is_cuda` and the like a lot to decide which path to take (3269114 -> 2634114). 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/49191 Reviewed By: H-Huang Differential Revision: D25483011 Pulled By: ngimel fbshipit-source-id: 2f3ab83e2c836a726b9284ffc50d6ecf3701aada --- c10/core/DispatchKeySet.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/c10/core/DispatchKeySet.h b/c10/core/DispatchKeySet.h index 1e9d85211f6d..486272ece92e 100644 --- a/c10/core/DispatchKeySet.h +++ b/c10/core/DispatchKeySet.h @@ -61,8 +61,8 @@ class DispatchKeySet final { } } // Test if a DispatchKey is in the set - bool has(DispatchKey t) const { - TORCH_INTERNAL_ASSERT(t != DispatchKey::Undefined); + bool inline has(DispatchKey t) const { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(t != DispatchKey::Undefined); return static_cast(repr_ & DispatchKeySet(t).repr_); } // Test if DispatchKeySet is a superset of ks. From 5716b7db72e8f66b7b2ab312cb3623b87aeb89d8 Mon Sep 17 00:00:00 2001 From: Iurii Zdebskyi Date: Fri, 11 Dec 2020 15:46:00 -0800 Subject: [PATCH 197/250] Enabled Scalar lists (#48222) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48222 Test Plan: Imported from OSS Reviewed By: ngimel Differential Revision: D25074765 Pulled By: izdeby fbshipit-source-id: 96ebe3c9907178c9338c03fb7993b2ecb26db8f4 --- .../impl/make_boxed_from_unboxed_functor.h | 8 ---- tools/codegen/api/python.py | 5 ++- tools/jit/gen_unboxing_wrappers.py | 2 + tools/pyi/gen_pyi.py | 1 - torch/csrc/utils/python_arg_parser.cpp | 44 ++++++++++++++----- torch/csrc/utils/python_arg_parser.h | 17 ++++++- 6 files changed, 56 insertions(+), 21 deletions(-) diff --git a/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h b/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h index 3dfb4ee4f04b..3d040387d3bb 100644 --- a/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h +++ b/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h @@ -119,14 +119,6 @@ namespace impl { "You tried to register a kernel with an unsupported input type: List. Please use List, List or Tensor instead."); }; - template - struct assert_is_valid_input_type, AllowDeprecatedTypes> - : assert_is_valid_input_type { - static_assert(!std::is_same::value, - "You tried to register a kernel with an unsupported input type: std::vector. Please use List, List or Tensor instead."); - // TODO static_assert(AllowDeprecatedTypes, "You tried to register a kernel with an unsupported input type: std::vector. Please use List instead."); - }; - template struct assert_is_valid_input_type, AllowDeprecatedTypes> : assert_is_valid_input_type { diff --git a/tools/codegen/api/python.py b/tools/codegen/api/python.py index 10483e2e3d76..c78fe23150e8 100644 --- a/tools/codegen/api/python.py +++ b/tools/codegen/api/python.py @@ -620,6 +620,8 @@ def argument_type_str(t: Type, *, simple_type: bool = False) -> str: return f'IntArrayRef[{size}]' if size is not None else 'IntArrayRef' elif str(t.elem) == 'Tensor': return f'TensorList[{size}]' if size is not None else 'TensorList' + elif str(t.elem) == 'Scalar': + return f'ScalarList[{size}]' if size is not None else 'ScalarList' elif str(t.elem) == 'Tensor?': if simple_type: return 'TensorList' @@ -1063,7 +1065,8 @@ def arg_parser_unpack_method(t: Type, has_default: bool) -> str: return 'intlist' elif str(t) == 'float[]': return 'doublelist' - + elif str(t) == 'Scalar[]': + return 'scalarlist' raise RuntimeError(f'type \'{t}\' is not supported by PythonArgParser') # Return RHS expression for python argument using PythonArgParser output. 
diff --git a/tools/jit/gen_unboxing_wrappers.py b/tools/jit/gen_unboxing_wrappers.py index f2896fac7f22..267b5a3b221a 100644 --- a/tools/jit/gen_unboxing_wrappers.py +++ b/tools/jit/gen_unboxing_wrappers.py @@ -49,6 +49,7 @@ 'std::string': 'str', 'std::string?': 'str?', 'Scalar': 'Scalar', + 'ScalarList': 'Scalar[]', 'MemoryFormat': 'MemoryFormat', 'MemoryFormat?': 'MemoryFormat?', 'QScheme': 'QScheme', @@ -131,6 +132,7 @@ def jit_type_of(arg): 'Tensor?': 'toOptionalTensor({})', 'Tensor?[]': 'toListOfOptionalTensor({})', 'TensorList': '{}.toTensorVector()', + 'ScalarList': '{}.toScalarVector()', 'bool': '{}.toBool()', 'bool?': '{}.toOptional()', 'double': '{}.toDouble()', diff --git a/tools/pyi/gen_pyi.py b/tools/pyi/gen_pyi.py index dad150fa0ad5..d2073bec9a27 100644 --- a/tools/pyi/gen_pyi.py +++ b/tools/pyi/gen_pyi.py @@ -122,7 +122,6 @@ def should_bind_method(python_func: PythonSignatureNativeFunctionPair) -> bool: 'floor_divide', 'floor_divide_', 'floor_divide_out', ] - binary_ops = ('add', 'sub', 'mul', 'div', 'pow', 'lshift', 'rshift', 'mod', 'truediv', 'matmul', 'floordiv', 'radd', 'rsub', 'rmul', 'rtruediv', 'rfloordiv', 'rpow', # reverse arithmetic diff --git a/torch/csrc/utils/python_arg_parser.cpp b/torch/csrc/utils/python_arg_parser.cpp index 950e7d9fb82d..c7fdf844945e 100644 --- a/torch/csrc/utils/python_arg_parser.cpp +++ b/torch/csrc/utils/python_arg_parser.cpp @@ -39,6 +39,7 @@ static std::unordered_map type_map = { {"std::string", ParameterType::STRING}, {"Dimname", ParameterType::DIMNAME}, {"DimnameList", ParameterType::DIMNAME_LIST}, + {"ScalarList", ParameterType::SCALAR_LIST}, }; // Default arg name translations for compatibility with NumPy. @@ -348,13 +349,28 @@ bool is_tensor_and_append_overloaded(PyObject* obj, std::vector* ove return false; } -bool is_tensor_list_and_append_overloaded(PyObject* obj, std::vector* overloaded_args, int argnum, bool throw_error) { +bool is_scalar_list(PyObject* obj) { auto tuple = six::isTuple(obj); if (!(tuple || PyList_Check(obj))) { return false; } auto size = tuple ? PyTuple_GET_SIZE(obj) : PyList_GET_SIZE(obj); for (size_t idx = 0; idx < size; idx++) { + PyObject* iobj = tuple ? PyTuple_GET_ITEM(obj, idx) : PyList_GET_ITEM(obj, idx); + if (!THPUtils_checkScalar(iobj)) { + return false; + } + } + return true; +} + +bool is_tensor_list_and_append_overloaded(PyObject* obj, std::vector* overloaded_args, int argnum, bool throw_error) { + auto tuple = six::isTuple(obj); + if (!(tuple || PyList_Check(obj))) { + return false; + } + auto size = tuple ? PyTuple_GET_SIZE(obj) : PyList_GET_SIZE(obj); + for (long idx = 0; idx < size; idx++) { PyObject* iobj = tuple ? 
PyTuple_GET_ITEM(obj, idx) : PyList_GET_ITEM(obj, idx); if (!is_tensor_and_append_overloaded(iobj, overloaded_args)) { if (throw_error) { @@ -453,6 +469,9 @@ auto FunctionParameter::check(PyObject* obj, std::vector &overloaded return THPStream_Check(obj); case ParameterType::STRING: return THPUtils_checkString(obj); default: throw std::runtime_error("unknown parameter type"); + case ParameterType::SCALAR_LIST: { + return is_scalar_list(obj); + } } } @@ -478,6 +497,7 @@ std::string FunctionParameter::type_name() const { case ParameterType::STRING: return "str"; case ParameterType::DIMNAME: return "name"; case ParameterType::DIMNAME_LIST: return "tuple of names"; + case ParameterType::SCALAR_LIST: return "tuple of Scalars"; default: throw std::runtime_error("unknown parameter type"); } } @@ -1055,24 +1075,28 @@ at::Scalar PythonArgs::scalar_slow(int i) { signature.params[i].name, idx, var, jit::NumberType::get()); } + return scalar_slow(args[i]); +} + +at::Scalar PythonArgs::scalar_slow(PyObject* arg) { // Zero-dim tensors are converted to Scalars as-is. Note this doesn't currently // handle most NumPy scalar types except np.float64. - if (THPVariable_Check(args[i])) { - return ((THPVariable*)args[i])->cdata.item(); + if (THPVariable_Check(arg)) { + return ((THPVariable*)arg)->cdata.item(); } - if (THPUtils_checkLong(args[i])) { - return at::Scalar(static_cast(THPUtils_unpackLong(args[i]))); + if (THPUtils_checkLong(arg)) { + return at::Scalar(static_cast(THPUtils_unpackLong(arg))); } - if (PyBool_Check(args[i])) { - return at::Scalar(THPUtils_unpackBool(args[i])); + if (PyBool_Check(arg)) { + return at::Scalar(THPUtils_unpackBool(arg)); } - if (PyComplex_Check(args[i])) { - return at::Scalar(THPUtils_unpackComplexDouble(args[i])); + if (PyComplex_Check(arg)) { + return at::Scalar(THPUtils_unpackComplexDouble(arg)); } - return at::Scalar(THPUtils_unpackDouble(args[i])); + return at::Scalar(THPUtils_unpackDouble(arg)); } } // namespace torch diff --git a/torch/csrc/utils/python_arg_parser.h b/torch/csrc/utils/python_arg_parser.h index b0b81a9517da..ccf3ba6b42c4 100644 --- a/torch/csrc/utils/python_arg_parser.h +++ b/torch/csrc/utils/python_arg_parser.h @@ -80,7 +80,7 @@ namespace torch { enum class ParameterType { TENSOR, SCALAR, INT64, DOUBLE, COMPLEX, TENSOR_LIST, INT_LIST, GENERATOR, BOOL, STORAGE, PYOBJECT, SCALARTYPE, LAYOUT, MEMORY_FORMAT, DEVICE, STREAM, STRING, - DIMNAME, DIMNAME_LIST, QSCHEME, FLOAT_LIST + DIMNAME, DIMNAME_LIST, QSCHEME, FLOAT_LIST, SCALAR_LIST }; struct FunctionParameter; @@ -158,6 +158,7 @@ struct PythonArgs { inline c10::optional optionalTensor(int i); inline at::Scalar scalar(int i); inline at::Scalar scalarWithDefault(int i, at::Scalar default_scalar); + inline std::vector scalarlist(int i); inline std::vector tensorlist(int i); template inline std::array tensorlist_n(int i); @@ -206,6 +207,7 @@ struct PythonArgs { private: at::Tensor tensor_slow(int i); at::Scalar scalar_slow(int i); + at::Scalar scalar_slow(PyObject* arg); }; struct FunctionParameter { @@ -287,6 +289,19 @@ inline at::Scalar PythonArgs::scalar(int i) { return scalar_slow(i); } +inline std::vector PythonArgs::scalarlist(int i) { + if (!args[i]) return std::vector(); + auto tuple = six::isTuple(args[i]); + THPObjectPtr arg = six::maybeAsTuple(args[i]); + auto size = tuple ? PyTuple_GET_SIZE(arg.get()) : PyList_GET_SIZE(arg.get()); + std::vector res(size); + for (int idx = 0; idx < size; idx++) { + PyObject* obj = tuple ? 
PyTuple_GET_ITEM(arg.get(), idx) : PyList_GET_ITEM(arg.get(), idx); + res[idx] = scalar_slow(obj); + } + return res; +} + inline at::Scalar PythonArgs::scalarWithDefault(int i, at::Scalar default_scalar) { if (!args[i]) return default_scalar; return scalar_slow(i); From 6b7864462387e14176431efb525f538c1be4d255 Mon Sep 17 00:00:00 2001 From: Bram Wasti Date: Fri, 11 Dec 2020 16:02:04 -0800 Subject: [PATCH 198/250] [te] Add BitCast to the IR (#49184) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49184 Adds BitCasting to NNC. This will enable fast approximation algorithms implemented directly in TensorExpressions Test Plan: buck test mode/no-gpu //caffe2/test/cpp/tensorexpr:tensorexpr Reviewed By: bertmaher Differential Revision: D25466476 fbshipit-source-id: f063ab29ba7bab2dcce463e499f2d4a16bdc1f0e --- test/cpp/tensorexpr/test_llvm.cpp | 83 ++++++++++++++++ test/cpp/tensorexpr/test_type.cpp | 110 +++++++++++++++++++++ torch/csrc/jit/tensorexpr/eval.h | 60 +++++++++++ torch/csrc/jit/tensorexpr/expr.h | 1 + torch/csrc/jit/tensorexpr/ir.h | 29 ++++++ torch/csrc/jit/tensorexpr/ir_mutator.cpp | 9 ++ torch/csrc/jit/tensorexpr/ir_mutator.h | 2 + torch/csrc/jit/tensorexpr/ir_visitor.cpp | 3 + torch/csrc/jit/tensorexpr/ir_visitor.h | 2 + torch/csrc/jit/tensorexpr/llvm_codegen.cpp | 20 ++++ torch/csrc/jit/tensorexpr/loopnest.cpp | 8 ++ 11 files changed, 327 insertions(+) diff --git a/test/cpp/tensorexpr/test_llvm.cpp b/test/cpp/tensorexpr/test_llvm.cpp index 953c184de1fc..c1d3392fff32 100644 --- a/test/cpp/tensorexpr/test_llvm.cpp +++ b/test/cpp/tensorexpr/test_llvm.cpp @@ -160,6 +160,63 @@ TEST(LLVM, ByteToDoubleCastTest) { ASSERT_EQ(cg.value(), 2); } +TEST(LLVM, BitCast) { + constexpr int16_t ref16 = 1337; + constexpr int32_t ref32 = 1337; + constexpr int64_t ref64 = 1337; + at::Half reff16 = 1337.0f; + constexpr float reff32 = 1337.0f; + constexpr double reff64 = 1337.0f; + + // this is broken + /*{ + KernelScope kernel_scope; + at::Half k_; + at::Half* k = &k_; + *reinterpret_cast(k) = ref16; + auto a = HalfImm::make(k); + auto b = BitCast::make(kShort, a); + LLVMExprEval cg(b); + ASSERT_EQ(cg.value(), ref16); + }*/ + + { + KernelScope kernel_scope; + float k = raw_bitcast(ref32); + auto a = FloatImm::make(k); + auto b = BitCast::make(kInt, a); + LLVMExprEval cg(b); + ASSERT_EQ(cg.value(), ref32); + } + + { + KernelScope kernel_scope; + double k = raw_bitcast(ref64); + auto a = DoubleImm::make(k); + auto b = BitCast::make(kLong, a); + LLVMExprEval cg(b); + ASSERT_EQ(cg.value(), ref64); + } + + { + KernelScope kernel_scope; + int64_t k = raw_bitcast(reff64); + auto a = LongImm::make(k); + auto b = BitCast::make(kDouble, a); + LLVMExprEval cg(b); + ASSERT_EQ(cg.value(), reff64); + } + + { + KernelScope kernel_scope; + int32_t k = raw_bitcast(reff32); + auto a = IntImm::make(k); + auto b = BitCast::make(kFloat, a); + LLVMExprEval cg(b); + ASSERT_EQ(cg.value(), reff32); + } +} + TEST(LLVM, LetTest01) { KernelScope kernel_scope; @@ -514,6 +571,32 @@ TEST(LLVM, VectorizerLoadStoreTest) { assertAllEqual(c_vec, 21); } +TEST(LLVM, VectorizeBitCast) { + KernelScope kernel_scope; + Placeholder a(BufHandle("A", {128}, kInt)); + + Tensor* c = Compute("c", {{128, "i"}}, [&](const VarHandle& i) { + return bitcast(a.load(i)); + }); + + Placeholder c_buf(BufHandle(c->buf())); + LoopNest l({c}); + Stmt* s = l.root_stmt(); + l.vectorize(dynamic_cast(s)->front()); + ASSERT_TRUE(dynamic_cast(dynamic_cast(s)->front()) == nullptr); + + LLVMCodeGen cg(s, {a, c_buf}); + + std::vector a_vec(128); 
+ std::vector c_vec(128); + for (auto i = 0; i < 128; ++i) { + a_vec[i] = raw_bitcast(1337.f); + } + std::vector args({a_vec.data(), c_vec.data()}); + ASSERT_EQ(cg.value(args), 0); + assertAllEqual(c_vec, 1337.f); +} + TEST(LLVM, MemcpyTest) { KernelScope kernel_scope; constexpr int N = 32; diff --git a/test/cpp/tensorexpr/test_type.cpp b/test/cpp/tensorexpr/test_type.cpp index 0c771733d935..71ad0f5149ac 100644 --- a/test/cpp/tensorexpr/test_type.cpp +++ b/test/cpp/tensorexpr/test_type.cpp @@ -1,5 +1,6 @@ #include +#include "torch/csrc/jit/tensorexpr/eval.h" #include "torch/csrc/jit/tensorexpr/ir.h" #include "torch/csrc/jit/tensorexpr/tensor.h" @@ -42,6 +43,115 @@ TEST(Type, Test01) { } } +TEST(Type, BitCasting) { + { + KernelScope kernel_scope; + VarHandle x("x", kFloat); + ExprHandle y = bitcast(x); + ASSERT_EQ(y.dtype(), kInt); + } + { + KernelScope kernel_scope; + VarHandle x("x", kInt); + ExprHandle y = bitcast(x); + ASSERT_EQ(y.dtype(), kFloat); + } + { + KernelScope kernel_scope; + VarHandle x("x", kShort); + ExprHandle y = bitcast(x); + ASSERT_EQ(y.dtype(), kHalf); + } + { + KernelScope kernel_scope; + VarHandle x("x", kHalf); + ExprHandle y = bitcast(x); + ASSERT_EQ(y.dtype(), kShort); + } + + constexpr int16_t ref16 = 1337; + constexpr int32_t ref32 = 1337; + constexpr int64_t ref64 = 1337; + at::Half reff16 = 1337.0f; + constexpr float reff32 = 1337.0f; + constexpr double reff64 = 1337.0f; + using SimpleIRExprEval = ExprEval; + // this is broken + /*{ + KernelScope kernel_scope; + at::Half k_; + at::Half* k = &k_; + *reinterpret_cast(k) = ref16; + auto a = HalfImm::make(*k); + auto b = BitCast::make(kShort, a); + SimpleIRExprEval cg(b); + ASSERT_EQ(cg.value(), ref16); + }*/ + + { + KernelScope kernel_scope; + float k = raw_bitcast(ref32); + auto a = FloatImm::make(k); + auto b = BitCast::make(kInt, a); + SimpleIRExprEval cg(b); + ASSERT_EQ(cg.value(), ref32); + } + + { + KernelScope kernel_scope; + double k = raw_bitcast(ref64); + auto a = DoubleImm::make(k); + auto b = BitCast::make(kLong, a); + SimpleIRExprEval cg(b); + ASSERT_EQ(cg.value(), ref64); + } + + { + KernelScope kernel_scope; + int64_t k = raw_bitcast(reff64); + auto a = LongImm::make(k); + auto b = BitCast::make(kDouble, a); + SimpleIRExprEval cg(b); + ASSERT_EQ(cg.value(), reff64); + } + + { + KernelScope kernel_scope; + int32_t k = raw_bitcast(reff32); + auto a = IntImm::make(k); + auto b = BitCast::make(kFloat, a); + SimpleIRExprEval cg(b); + ASSERT_EQ(cg.value(), reff32); + } + + // This segfaults :( + /*{ + KernelScope kernel_scope; + VarHandle x("x", kDouble); + ASSERT_ANY_THROW(ExprHandle y = bitcast(x)); + } + { + KernelScope kernel_scope; + VarHandle x("x", kFloat); + ASSERT_ANY_THROW(ExprHandle y = bitcast(x)); + } + { + KernelScope kernel_scope; + VarHandle x("x", kLong); + ASSERT_ANY_THROW(ExprHandle y = bitcast(x)); + } + { + KernelScope kernel_scope; + VarHandle x("x", kShort); + ASSERT_ANY_THROW(ExprHandle y = bitcast(x)); + } + { + KernelScope kernel_scope; + VarHandle x("x", kInt); + ASSERT_ANY_THROW(ExprHandle y = bitcast(x)); + }*/ +} + TEST(Type, Propagation) { // Same types: { diff --git a/torch/csrc/jit/tensorexpr/eval.h b/torch/csrc/jit/tensorexpr/eval.h index 7b8a4c194782..e7fbd376d563 100644 --- a/torch/csrc/jit/tensorexpr/eval.h +++ b/torch/csrc/jit/tensorexpr/eval.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include @@ -124,6 +125,14 @@ inline c10::Half div_value(c10::Half lhs, c10::Half rhs) { return lhs / rhs; } +template +To raw_bitcast(const From& src) { + 
TORCH_CHECK(sizeof(To) == sizeof(From), "Invalid bitcast invocation"); + To storage; + std::memcpy(&storage, &src, sizeof(From)); + return reinterpret_cast(storage); +} + class SimpleIREvaluator : public CodeGen, public IRVisitor { public: template @@ -573,6 +582,57 @@ class SimpleIREvaluator : public CodeGen, public IRVisitor { } } + template + std::vector bitcastValues(const Dtype& src_dtype, const Value& v) { + const std::vector& src_values = v.as_vec(); + std::vector dst_values(src_values.size()); + for (int i = 0; i < src_dtype.lanes(); ++i) { + dst_values[i] = raw_bitcast(src_values[i]); + } + return dst_values; + } + + template + void doBitCastFromSrc( + const Dtype& src_dtype, + const Dtype& dst_dtype, + const Value& v) { + switch (dst_dtype.scalar_type()) { +#define DST_TYPE_CASE(Type, Name) \ + case ScalarType::Name: \ + this->value_ = Value(bitcastValues(src_dtype, v)); \ + break; + // bool/half not supported + AT_FORALL_SCALAR_TYPES(DST_TYPE_CASE); +#undef DST_TYPE_CASE + default: + throw unsupported_dtype(); + } + } + + TORCH_API void visit(const BitCast* v) override { + const Expr* src_value = v->src_value(); + src_value->accept(this); + Dtype dst_dtype = v->dtype(); + Dtype src_dtype = src_value->dtype(); + if (src_dtype.byte_size() != dst_dtype.byte_size()) { + throw malformed_input("lane mismatch in Cast", v); + } + if (src_dtype != dst_dtype) { + switch (src_dtype.scalar_type()) { +#define SRC_TYPE_CASE(Type, Name) \ + case ScalarType::Name: \ + doBitCastFromSrc(src_dtype, dst_dtype, value_); \ + break; + // bool/half not supported + AT_FORALL_SCALAR_TYPES(SRC_TYPE_CASE); +#undef SRC_TYPE_CASE + default: + throw unsupported_dtype(); + } + } + } + TORCH_API void visit(const For* v) override { const Expr* var_node = v->var(); v->start()->accept(this); diff --git a/torch/csrc/jit/tensorexpr/expr.h b/torch/csrc/jit/tensorexpr/expr.h index 9b8dd23db0b1..cd05333656c0 100644 --- a/torch/csrc/jit/tensorexpr/expr.h +++ b/torch/csrc/jit/tensorexpr/expr.h @@ -31,6 +31,7 @@ enum IRNodeType { kCompareSelect, kLet, kCast, + kBitCast, kBroadcast, kRamp, kPolynomial, diff --git a/torch/csrc/jit/tensorexpr/ir.h b/torch/csrc/jit/tensorexpr/ir.h index 7eeea564a6a7..6fe4bf0e2ebd 100644 --- a/torch/csrc/jit/tensorexpr/ir.h +++ b/torch/csrc/jit/tensorexpr/ir.h @@ -28,6 +28,7 @@ inline int getPrecedence(IRNodeType ty) { case kPrimitive: return 0; case kCast: + case kBitCast: return 2; case kAdd: case kSub: @@ -81,6 +82,34 @@ ExprHandle cast(const ExprHandle& src_value) { return Cast::make(Dtype(ToDtype(), src_value.dtype().lanes()), src_value); } +// This is a bitwise cast, akin to bitcast in LLVM +class BitCast : public ExprNode { + public: + const Expr* src_value() const { + return src_value_; + } + static ExprHandle make(Dtype dtype, const ExprHandle& src_value) { + return ExprHandle(new BitCast(dtype, src_value.node())); + } + BitCast(Dtype dtype, const Expr* src_value) + : ExprNodeBase(dtype, kBitCast), src_value_(src_value) { + TORCH_CHECK(src_value_->dtype().byte_size() == dtype.byte_size()); + } + + bool isConstant() const override { + return src_value_->isConstant(); + } + + private: + const Expr* src_value_; +}; + +template +ExprHandle bitcast(const ExprHandle& src_value) { + return BitCast::make( + Dtype(ToDtype(), src_value.dtype().lanes()), src_value); +} + // Represent the expression node for binary operators. // A CRTP pattern to share common code among the operators. 
template diff --git a/torch/csrc/jit/tensorexpr/ir_mutator.cpp b/torch/csrc/jit/tensorexpr/ir_mutator.cpp index 5f0889842b1e..ddbe88bb2c8f 100644 --- a/torch/csrc/jit/tensorexpr/ir_mutator.cpp +++ b/torch/csrc/jit/tensorexpr/ir_mutator.cpp @@ -139,6 +139,15 @@ const Expr* IRMutator::mutate(const Cast* v) { return new Cast(v->dtype(), src_value_new); } +const Expr* IRMutator::mutate(const BitCast* v) { + const Expr* src_value = v->src_value(); + const Expr* src_value_new = src_value->accept_mutator(this); + if (src_value_new == v->src_value()) { + return v; + } + return new BitCast(v->dtype(), src_value_new); +} + const Expr* IRMutator::mutate(const Var* v) { return v; } diff --git a/torch/csrc/jit/tensorexpr/ir_mutator.h b/torch/csrc/jit/tensorexpr/ir_mutator.h index 0913da0e972d..773920cb52fa 100644 --- a/torch/csrc/jit/tensorexpr/ir_mutator.h +++ b/torch/csrc/jit/tensorexpr/ir_mutator.h @@ -26,6 +26,7 @@ AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, IMM_DECLARE); #undef IMM_DECLARE class Cast; +class BitCast; class Var; class Buf; class Ramp; @@ -75,6 +76,7 @@ class TORCH_API IRMutator { AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, IMM_MUTATE_DECLARE); #undef IMM_MUTATE_DECLARE virtual const Expr* mutate(const Cast* v); + virtual const Expr* mutate(const BitCast* v); virtual const Expr* mutate(const Var* v); virtual const Expr* mutate(const Buf* v); virtual const Expr* mutate(const Ramp* v); diff --git a/torch/csrc/jit/tensorexpr/ir_visitor.cpp b/torch/csrc/jit/tensorexpr/ir_visitor.cpp index ae97a6200d8b..772a28c77add 100644 --- a/torch/csrc/jit/tensorexpr/ir_visitor.cpp +++ b/torch/csrc/jit/tensorexpr/ir_visitor.cpp @@ -79,6 +79,9 @@ AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, IMM_VISIT); void IRVisitor::visit(const Cast* v) { v->src_value()->accept(this); } +void IRVisitor::visit(const BitCast* v) { + v->src_value()->accept(this); +} void IRVisitor::visit(const Var* v) {} void IRVisitor::visit(const Ramp* v) { diff --git a/torch/csrc/jit/tensorexpr/ir_visitor.h b/torch/csrc/jit/tensorexpr/ir_visitor.h index 3f5f05229c16..8353da680edb 100644 --- a/torch/csrc/jit/tensorexpr/ir_visitor.h +++ b/torch/csrc/jit/tensorexpr/ir_visitor.h @@ -26,6 +26,7 @@ AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, IMM_DECLARE) #undef IMM_DECLARE class Cast; +class BitCast; class Var; class Buf; class Ramp; @@ -74,6 +75,7 @@ class TORCH_API IRVisitor { #undef IMM_PRINT_VISIT virtual void visit(const Cast* v); + virtual void visit(const BitCast* v); virtual void visit(const Var* v); virtual void visit(const Buf* v); virtual void visit(const Ramp* v); diff --git a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp index cb14b9ef4c07..d469a39cf69d 100644 --- a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp +++ b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp @@ -164,6 +164,7 @@ class LLVMCodeGenImpl : public IRVisitor { #undef IMM_VISIT_DECLARE void visit(const Cast* v) override; + void visit(const BitCast* v) override; void visit(const Var* v) override; void visit(const Ramp* v) override; void visit(const Load* v) override; @@ -888,6 +889,25 @@ void LLVMCodeGenImpl::visit(const Cast* v) { } } +void LLVMCodeGenImpl::visit(const BitCast* v) { + v->src_value()->accept(this); + + llvm::Type* dstType = dtypeToLLVM(v->dtype()); + if (v->dtype().lanes() > 1) { + dstType = llvm::VectorType::get(dstType, ElementCount(v->dtype().lanes())); + } + llvm::Type* srcType = dtypeToLLVM(v->src_value()->dtype()); + + if (srcType == dstType) { + // do nothing. 
+ return; + } + + TORCH_CHECK(llvm::CastInst::isBitCastable( + srcType->getScalarType(), dstType->getScalarType())); + value_ = irb_.CreateBitOrPointerCast(value_, dstType); +} + void LLVMCodeGenImpl::visit(const Var* v) { if (varToArg_.count(v)) { auto idx = varToArg_.at(v); diff --git a/torch/csrc/jit/tensorexpr/loopnest.cpp b/torch/csrc/jit/tensorexpr/loopnest.cpp index 0bff2dbf75c7..1598a92ac68c 100644 --- a/torch/csrc/jit/tensorexpr/loopnest.cpp +++ b/torch/csrc/jit/tensorexpr/loopnest.cpp @@ -154,6 +154,14 @@ class Vectorizer : public IRMutator { }); } + const Expr* mutate(const BitCast* v) override { + std::vector inputs = {v->src_value()}; + return try_vectorize(v, inputs, [&]() { + return BitCast::make( + Dtype(v->dtype().scalar_type(), lanes_), ExprHandle(inputs[0])); + }); + } + const Expr* mutate(const Cast* v) override { std::vector inputs = {v->src_value()}; return try_vectorize(v, inputs, [&]() { From 21c38e17997171415a44c6ba578f621037d8ef30 Mon Sep 17 00:00:00 2001 From: Pritam Damania Date: Fri, 11 Dec 2020 17:20:51 -0800 Subject: [PATCH 199/250] Additional validation for DistributedSampler. (#48865) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48865 If DistributedSampler was provided an invalid rank (ex: https://discuss.pytorch.org/t/distributed-datasets-on-multi-machines/105113), it failed with a cryptic assertion failure. To fix this issue, I've added an additional check to DistributedSampler to validate we provide a valid rank. ghstack-source-id: 117906769 Test Plan: 1) waitforbuildbot 2) Unit test added. Reviewed By: malfet Differential Revision: D25344945 fbshipit-source-id: 7685e00c8b2c200efbd2949fb32ee32ea7232a08 --- test/test_dataloader.py | 9 +++++++++ torch/utils/data/distributed.py | 4 ++++ 2 files changed, 13 insertions(+) diff --git a/test/test_dataloader.py b/test/test_dataloader.py index a1afc216d42a..047297c438b7 100644 --- a/test/test_dataloader.py +++ b/test/test_dataloader.py @@ -1454,6 +1454,15 @@ def test_random_sampler_len_with_replacement(self): self.assertEqual(int(math.ceil(float(num_samples) / batch_size)), count_num_samples_in_data_loader) + def test_distributed_sampler_invalid_rank(self): + from torch.utils.data.distributed import DistributedSampler + dataset = torch.IntTensor(range(10)) + with self.assertRaisesRegex(ValueError, "Invalid rank"): + sampler = DistributedSampler(dataset, 3, 3) + + with self.assertRaisesRegex(ValueError, "Invalid rank"): + sampler = DistributedSampler(dataset, 3, -1) + def test_duplicating_data_with_drop_last(self): from torch.utils.data.distributed import DistributedSampler diff --git a/torch/utils/data/distributed.py b/torch/utils/data/distributed.py index cb67625df518..e048b54a462c 100644 --- a/torch/utils/data/distributed.py +++ b/torch/utils/data/distributed.py @@ -67,6 +67,10 @@ def __init__(self, dataset: Dataset, num_replicas: Optional[int] = None, if not dist.is_available(): raise RuntimeError("Requires distributed package to be available") rank = dist.get_rank() + if rank >= num_replicas or rank < 0: + raise ValueError( + "Invalid rank {}, rank should be in the interval" + " [0, {}]".format(rank, num_replicas - 1)) self.dataset = dataset self.num_replicas = num_replicas self.rank = rank From 29f0fa36b1f78117a60378a8e5df5c284e1e346d Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Fri, 11 Dec 2020 17:43:59 -0800 Subject: [PATCH 200/250] [Gradient Compression] Minor update of the comments on PowerSGD. 
(#49246) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49246 Previously the comment on matrix_approximation_rank was in PowerSGD_hook function. Now move it into PowerSGDState, because the function arg is already moved to this state as an attribute. Original PR issue: Investigate Applying PowerSGD to Communication Hook for Gradient Compression #47202 ghstack-source-id: 118414247 Test Plan: N/A Reviewed By: rohan-varma Differential Revision: D25501091 fbshipit-source-id: 701e3109a9a3f2a5f9d18d5bf6d0a266518ee8ea --- torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py b/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py index 99ba72cc5868..bbcef98d4214 100644 --- a/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py +++ b/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py @@ -47,6 +47,8 @@ def __init__( random_seed=0, ): self.process_group = process_group + # The low rank for matrix approximation. + # Typically only 1 or 2 is used. See https://arxiv.org/pdf/1905.13727.pdf. self.matrix_approximation_rank = matrix_approximation_rank # Error feedback is usually crucial for both for convergence and generalization, # because PowerSGD is a biased compressor, @@ -97,8 +99,6 @@ def powerSGD_hook( bucket (dist._GradBucket): Bucket that stores a 1D flattened gradient tensor that batches multiple per-variable tensors. Note that since DDP comm hook only supports single process single device mode at this time, only exactly one tensor is stored in this bucket. - matrix_approximation_rank (int): The low rank for matrix approximation. - Typically only 1 or 2 is used. See https://arxiv.org/pdf/1905.13727.pdf. Returns: Future handler of the communication, which updates the gradients in place. From 76d41c801eca14dbe9ba12399d27ef78ed0b642f Mon Sep 17 00:00:00 2001 From: James Reed Date: Fri, 11 Dec 2020 17:53:04 -0800 Subject: [PATCH 201/250] [JIT] Fix toIValue handling of AttributeError when casting ClassType (#49188) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49188 Test Plan: Imported from OSS Reviewed By: pbelevich Differential Revision: D25476573 Pulled By: jamesr66a fbshipit-source-id: cec296fae71cc0cdf36bde60417d7d3b1aa84198 --- test/jit/test_class_type.py | 20 ++++++++++++++++++++ test/jit/test_torchbind.py | 4 ++++ torch/csrc/jit/python/pybind_utils.h | 9 +++++++++ 3 files changed, 33 insertions(+) diff --git a/test/jit/test_class_type.py b/test/jit/test_class_type.py index b4075dba14c8..a80670f0d22b 100644 --- a/test/jit/test_class_type.py +++ b/test/jit/test_class_type.py @@ -959,6 +959,26 @@ def forward(self, x): # Make sure class constant is accessible from module self.assertEqual(m.w, m_loaded.w) + def test_py_class_to_ivalue_missing_attribute(self): + global Foo # see [local resolution in python] + + class Foo(object): + i : int + f : float + + def __init__(self, i : int, f : float): + self.i = i + self.f = f + + @torch.jit.script + def test_fn(x : Foo) -> float: + return x.i + x.f + + test_fn(Foo(3, 4.0)) + + with self.assertRaisesRegex(RuntimeError, 'missing attribute i'): + test_fn(torch.rand(3, 4)) + def test_unused_method(self): """ Test unused methods on scripted classes. 
diff --git a/test/jit/test_torchbind.py b/test/jit/test_torchbind.py index af7897e159b3..31eec81d480a 100644 --- a/test/jit/test_torchbind.py +++ b/test/jit/test_torchbind.py @@ -240,6 +240,10 @@ def forward(self): traced = torch.jit.trace(TryTracing(), ()) self.assertEqual(torch.zeros(4, 4), traced()) + def test_torchbind_pass_wrong_type(self): + with self.assertRaisesRegex(RuntimeError, 'missing attribute capsule'): + torch.ops._TorchScriptTesting.take_an_instance(torch.rand(3, 4)) + def test_torchbind_tracing_nested(self): class TryTracingNest(torch.nn.Module): def __init__(self): diff --git a/torch/csrc/jit/python/pybind_utils.h b/torch/csrc/jit/python/pybind_utils.h index dc3b3b13adef..34ca7585be67 100644 --- a/torch/csrc/jit/python/pybind_utils.h +++ b/torch/csrc/jit/python/pybind_utils.h @@ -713,6 +713,15 @@ inline IValue toIValue( const auto& attrType = classType->getAttribute(slot); const auto& attrName = classType->getAttributeName(slot); + if (!py::hasattr(obj, attrName.c_str())) { + throw py::cast_error(c10::str( + "Tried to cast object to type ", + type->repr_str(), + " but object", + " was missing attribute ", + attrName)); + } + const auto& contained = py::getattr(obj, attrName.c_str()); userObj->setSlot(slot, toIValue(contained, attrType)); } From 635f1cd1a57d10d381ff043689281ea578445744 Mon Sep 17 00:00:00 2001 From: Venkata Chintapalli Date: Fri, 11 Dec 2020 17:56:45 -0800 Subject: [PATCH 202/250] Enable LayerNorm test cases Summary: Remove Skip from test defs. Test Plan: https://our.intern.facebook.com/intern/testinfra/testrun/1407375060598951 Reviewed By: hyuen Differential Revision: D25513174 fbshipit-source-id: 0ddfd1713cf7b9daf25f6e62df92d682cade350f --- caffe2/contrib/fakelowp/test/test_layernorm_nnpi_fp16.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/caffe2/contrib/fakelowp/test/test_layernorm_nnpi_fp16.py b/caffe2/contrib/fakelowp/test/test_layernorm_nnpi_fp16.py index 36d6ba73e0c3..f992c6f9e1fc 100644 --- a/caffe2/contrib/fakelowp/test/test_layernorm_nnpi_fp16.py +++ b/caffe2/contrib/fakelowp/test/test_layernorm_nnpi_fp16.py @@ -27,7 +27,7 @@ class LayerNorm(serial.SerializedTestCase): epsilon=st.floats(min_value=1e-4, max_value=1e-3), elementwise_affine=st.booleans()) @settings(deadline=datetime.timedelta(seconds=10)) - def Skip_test_layernorm(self, seed, batch_size, size, epsilon, elementwise_affine): + def test_layernorm(self, seed, batch_size, size, epsilon, elementwise_affine): np.random.seed(seed) # Reset the workspace workspace.ResetWorkspace() @@ -142,7 +142,7 @@ def _layernorm_transform(self, X): elementwise_affine=st.booleans()) @settings(deadline=datetime.timedelta(seconds=10)) # re-enable when T74553975 gets fixed - def Skip_test_fused_ln_quantize(self, seed, batch_size, size, epsilon, elementwise_affine): + def test_fused_ln_quantize(self, seed, batch_size, size, epsilon, elementwise_affine): np.random.seed(seed) # Reset the workspace From 8d58362f59edb149fcee691ffca03ecdd94066fe Mon Sep 17 00:00:00 2001 From: Martin Yuan Date: Fri, 11 Dec 2020 18:49:27 -0800 Subject: [PATCH 203/250] [PyTorch] Remove native::zeros reference in TensorIndexing (#49117) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49117 Try to resolve part of the github issue of https://github.com/pytorch/pytorch/issues/48684 . It essentially calls the same functionality inside at::native::zeros(). After this diff, all references to aten::native symbols are removed. 
ghstack-source-id: 118261305 Test Plan: CI Reviewed By: dhruvbird Differential Revision: D25444940 fbshipit-source-id: 7f782680daa3aedd1b7301cb08576da2ec70c188 --- aten/src/ATen/TensorIndexing.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aten/src/ATen/TensorIndexing.h b/aten/src/ATen/TensorIndexing.h index a4c0a0b31c34..162efd1c6c8a 100644 --- a/aten/src/ATen/TensorIndexing.h +++ b/aten/src/ATen/TensorIndexing.h @@ -227,7 +227,7 @@ static inline Tensor applySelect( static inline Tensor boolToIndexingTensorCPUOrCUDA(const Tensor& self, bool value) { // booleans add a dimension of size 1. true indexes this dimension as if 0:, false as empty. if (value) { - return at::native::zeros({1}, {}, self.options().dtype(kLong)); + return at::empty({1}, {}, self.options().dtype(kLong)).fill_(0.); } else { return at::empty({0}, {}, self.options().dtype(kLong)); } From 693e9086561e6badadcb0aeda712a7300c876983 Mon Sep 17 00:00:00 2001 From: Chunli Fu Date: Fri, 11 Dec 2020 19:38:47 -0800 Subject: [PATCH 204/250] [shape inference] fix ConstantFill Test Plan: unit test Reviewed By: yinghai Differential Revision: D25326529 fbshipit-source-id: 1322635567f6661637cde90cadaac0197975e133 --- caffe2/opt/bound_shape_inference_test.cc | 24 ++++++++++++++++++++++++ caffe2/opt/bound_shape_inferencer.cc | 6 ++++++ 2 files changed, 30 insertions(+) diff --git a/caffe2/opt/bound_shape_inference_test.cc b/caffe2/opt/bound_shape_inference_test.cc index 95302ca5ccc4..f9c9b6acf034 100644 --- a/caffe2/opt/bound_shape_inference_test.cc +++ b/caffe2/opt/bound_shape_inference_test.cc @@ -270,6 +270,30 @@ TEST(BoundShapeInference, LengthsRangeFill) { TensorProto_DataType_INT32); } + +TEST(BoundShapeInference, ConstantFill) { + NetDef net; + net.add_op()->CopyFrom( + CreateOperatorDef("ConstantFill", "", {"X"}, {"Y"}, {})); + ShapeInfoMap shape_map; + BoundShapeSpec spec(20, 1000); + BoundShapeInferencer eng(spec); + shape_map.emplace( + "X", + makeTensorInfo( + {TensorBoundShape_DimType_BATCH, + TensorBoundShape_DimType_CONSTANT}, + {20, 1024})); + eng.InferBoundShapeAndType(net, shape_map, nullptr); + const auto& out_shape = eng.shape_info(); + verifyShapeInfo( + out_shape, + "Y", + {TensorBoundShape_DimType_BATCH, TensorBoundShape_DimType_CONSTANT}, + {20, 1024}, + TensorProto_DataType_FLOAT); +} + // https://github.com/pytorch/pytorch/issues/40861 TEST(BoundShapeInference, DISABLED_ON_WINDOWS(Reshape)) { NetDef net; diff --git a/caffe2/opt/bound_shape_inferencer.cc b/caffe2/opt/bound_shape_inferencer.cc index c513c1a37b01..8ef5de06b02e 100644 --- a/caffe2/opt/bound_shape_inferencer.cc +++ b/caffe2/opt/bound_shape_inferencer.cc @@ -322,6 +322,12 @@ void BoundShapeInferencer::InferGivenTensorFill(const OperatorDef& op) { if (it != shape_info_.end()) { it->second.setDimType(std::vector( it->second.shape.dims_size(), TensorBoundShape_DimType_CONSTANT)); + if (op.type() == "ConstantFill" && op.input_size() >= 1) { + auto it_input = shape_info_.find(op.input(0)); + if (it_input != shape_info_.end()) { + it->second.setDimType(it_input->second.getDimType()); + } + } } } From b5b8fe98765412a2844dd481e76bc7f4bf5c334a Mon Sep 17 00:00:00 2001 From: Xiaodong Wang Date: Fri, 11 Dec 2020 21:08:36 -0800 Subject: [PATCH 205/250] Revert D25434956: [JIT] Use `is_buffer` in `BufferPolicy::valid` Test Plan: revert-hammer Differential Revision: D25434956 (https://github.com/pytorch/pytorch/commit/a480ca53028658ec32ee18183a40cda60304663a) Original commit changeset: ff2229058abb fbshipit-source-id: 
faba801e9b5e9fa0117624350518592868856eec --- torch/csrc/jit/api/module.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/csrc/jit/api/module.h b/torch/csrc/jit/api/module.h index 1d96931f25fe..4b68a85c6696 100644 --- a/torch/csrc/jit/api/module.h +++ b/torch/csrc/jit/api/module.h @@ -508,7 +508,7 @@ struct TORCH_API BufferPolicy { } static bool valid(const ClassTypePtr& typ, size_t i, const IValue& v) { return typ->getAttribute(i)->isSubtypeOf(TensorType::get()) && - typ->is_buffer(i); + !typ->is_parameter(i); } static CONSTEXPR_EXCEPT_WIN_CUDA bool all_slots = false; }; From 8999915a8655ed9a7e08e04c3593ce2bf2614710 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Fri, 11 Dec 2020 22:16:50 -0800 Subject: [PATCH 206/250] Fix "Missing return statement" mypy error (#49276) Summary: Adds `return None` after `assert_never` in the inner `get_one` function Without it, TestTypeHints.test_run_mypy_strict using mypy 0.770 fails with the above mentioned error, see https://app.circleci.com/pipelines/github/pytorch/pytorch/249909/workflows/597d8e34-ff04-4efa-9dde-9e28fbded341/jobs/9557705 Pull Request resolved: https://github.com/pytorch/pytorch/pull/49276 Reviewed By: jamesr66a Differential Revision: D25513658 Pulled By: malfet fbshipit-source-id: 318eaff7e0534b10eafe46c0b834b7f7cefea757 --- tools/codegen/gen.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/codegen/gen.py b/tools/codegen/gen.py index 8c22c1fe702c..9ad4099d9196 100644 --- a/tools/codegen/gen.py +++ b/tools/codegen/gen.py @@ -468,6 +468,8 @@ def gen_one(f: NativeFunction) -> Optional[str]: return f'm.impl("{f.func.name}", {payload});' else: assert_never(self.target) + # Silence mypy's "Missing return statement" error + return None return list(mapMaybe(gen_one, g.functions())) From ae88d25c2393888cd0df04d880d08d2a4ee5c492 Mon Sep 17 00:00:00 2001 From: Bert Maher Date: Fri, 11 Dec 2020 22:34:54 -0800 Subject: [PATCH 207/250] [te] Fix clamp with uint8 args (#49143) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49143 Riddle me this, batman: how could `torch.clamp(torch.tensor([0], dtype=torch.uint8), -10, 10)` equal `10`? The answer: the min/max args are first cast to the dtype of the input, giving min=246 and max 10. Then you have to apply Min and Max in the right order: `Min(Max(in, min), max)`. Differ in any way and you're doomed. Hooray. This PR makes TE match eager mode for this operator, plus fixes a major facepalm in the llvm min/max codegen where we were always generating signed comparisons. 
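For readers unfamiliar with the pattern, here is a simplified sketch (illustrative only; the enum members and return values are made up, the real code is the `gen_one` closure in `tools/codegen/gen.py`). Older mypy releases such as 0.770 do not treat a trailing `assert_never` call as terminating the branch, so a function annotated to return a value appears to fall off the end under `--strict`; appending an explicit `return None` silences the error without changing behavior.

```
from enum import Enum
from typing import NoReturn, Optional

def assert_never(x: NoReturn) -> NoReturn:
    # Stand-in for the helper used by the codegen tooling.
    raise AssertionError(f"Unhandled case: {x!r}")

class Target(Enum):  # made-up members, for illustration only
    DEFINITION = 1
    REGISTRATION = 2

def gen_one(target: Target) -> Optional[str]:
    if target is Target.DEFINITION:
        return "definition"
    elif target is Target.REGISTRATION:
        return "registration"
    else:
        assert_never(target)
    # Silence mypy 0.770's "Missing return statement" error
    return None
```
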
ghstack-source-id: 118415318 Test Plan: `buck test //caffe2/test:{jit,tensorexpr}` Reviewed By: robieta Differential Revision: D25456366 fbshipit-source-id: dde3c26c2134bdbe803227601fa3d23eaac750fb --- test/test_jit_fuser_te.py | 3 +- torch/csrc/jit/tensorexpr/kernel.cpp | 32 ++++++++++++++-------- torch/csrc/jit/tensorexpr/kernel.h | 3 +- torch/csrc/jit/tensorexpr/llvm_codegen.cpp | 6 ++-- torch/csrc/jit/tensorexpr/types.cpp | 11 ++++++++ torch/csrc/jit/tensorexpr/types.h | 4 +++ 6 files changed, 42 insertions(+), 17 deletions(-) diff --git a/test/test_jit_fuser_te.py b/test/test_jit_fuser_te.py index e125f473f6d8..956a115e6d56 100644 --- a/test/test_jit_fuser_te.py +++ b/test/test_jit_fuser_te.py @@ -1259,8 +1259,7 @@ def apply(fn): torch.trunc, torch.frac, lambda x: torch.threshold(x, 0, -10), - # FIXME: fails on cpu with dtype=uint8 - # lambda x: torch.clamp(x, -10, 10), + lambda x: torch.clamp(x, -10, 10), ] sizes = [(1,), (2,), (4, 4)] for dtype, op, device, size in product(dtypes, unary_ops, self.devices, sizes): diff --git a/torch/csrc/jit/tensorexpr/kernel.cpp b/torch/csrc/jit/tensorexpr/kernel.cpp index 88cf5761cfa1..f80bfc6745fa 100644 --- a/torch/csrc/jit/tensorexpr/kernel.cpp +++ b/torch/csrc/jit/tensorexpr/kernel.cpp @@ -726,7 +726,8 @@ Tensor* TensorExprKernel::computeThreeOperand( const torch::jit::Value* v, const std::function< ExprHandle(const ExprHandle&, const ExprHandle&, const ExprHandle&)>& - innerExpr) { + innerExpr, + bool promote_inputs) { auto const& n = v->node(); std::vector> shapes; for (size_t idx = 0; idx < 3; idx++) { @@ -737,7 +738,7 @@ Tensor* TensorExprKernel::computeThreeOperand( return Compute( name, c10::fmap(shape), - [this, v, innerExpr](const std::vector& axes) { + [this, v, innerExpr, promote_inputs](const std::vector& axes) { auto const& n = v->node(); std::vector indices(axes.begin(), axes.end()); std::vector inputs = { @@ -746,7 +747,9 @@ Tensor* TensorExprKernel::computeThreeOperand( tensorOrConstant(n->inputs()[2], indices), }; - promoteInputs(inputs); + if (promote_inputs) { + promoteInputs(inputs); + } ExprHandle compute = innerExpr(inputs[0], inputs[1], inputs[2]); return demoteOutput(compute, n->output()); }); @@ -976,21 +979,26 @@ Tensor* TensorExprKernel::computeValue(const torch::jit::Value* v) { const ExprHandle& in, const ExprHandle& min, const ExprHandle& max) { + auto cast = [&](const ExprHandle& e) { + return Cast::make(in.dtype(), e); + }; + if (noMin && noMax) { return in; } else if (noMin) { - return CompareSelect::make(in, max, max, in, kGT); + auto cmax = cast(max); + return CompareSelect::make(in, cmax, cmax, in, kGT); } else if (noMax) { - return CompareSelect::make(in, min, min, in, kLT); + auto cmin = cast(min); + return CompareSelect::make(in, cmin, cmin, in, kLT); } else { - return CompareSelect::make( - in, - min, - min, - CompareSelect::make(in, max, max, in, kGT), - kLT); + auto cmax = cast(max); + auto cmin = cast(min); + auto mm = CompareSelect::make(in, cmin, cmin, in, kLT); + return CompareSelect::make(mm, cmax, cmax, mm, kGT); } - }); + }, + false /* promote_inputs */); } break; case aten::sigmoid: { diff --git a/torch/csrc/jit/tensorexpr/kernel.h b/torch/csrc/jit/tensorexpr/kernel.h index 8fcce23717d3..c969669e63d3 100644 --- a/torch/csrc/jit/tensorexpr/kernel.h +++ b/torch/csrc/jit/tensorexpr/kernel.h @@ -116,7 +116,8 @@ class TORCH_API TensorExprKernel { const torch::jit::Value* v, const std::function< ExprHandle(const ExprHandle&, const ExprHandle&, const ExprHandle&)>& - innerExpr); + innerExpr, + bool 
promote_inputs = true); Tensor* computeConditionWithTwoOperand( const std::string& name, diff --git a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp index d469a39cf69d..8dccb66b52f4 100644 --- a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp +++ b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp @@ -721,7 +721,8 @@ void LLVMCodeGenImpl::visit(const Max* v) { auto rhs = this->value_; if (v->dtype().is_integral()) { - auto icmp = irb_.CreateICmpSGT(lhs, rhs); + auto icmp = v->dtype().is_signed() ? irb_.CreateICmpSGT(lhs, rhs) + : irb_.CreateICmpUGT(lhs, rhs); value_ = irb_.CreateSelect(icmp, lhs, rhs); return; } @@ -742,7 +743,8 @@ void LLVMCodeGenImpl::visit(const Min* v) { v->rhs()->accept(this); auto rhs = this->value_; if (v->dtype().is_integral()) { - auto icmp = irb_.CreateICmpSLT(lhs, rhs); + auto icmp = v->dtype().is_signed() ? irb_.CreateICmpSLT(lhs, rhs) + : irb_.CreateICmpULT(lhs, rhs); value_ = irb_.CreateSelect(icmp, lhs, rhs); return; } diff --git a/torch/csrc/jit/tensorexpr/types.cpp b/torch/csrc/jit/tensorexpr/types.cpp index f7aa96be4c45..0298950cbf37 100644 --- a/torch/csrc/jit/tensorexpr/types.cpp +++ b/torch/csrc/jit/tensorexpr/types.cpp @@ -9,6 +9,10 @@ namespace torch { namespace jit { namespace tensorexpr { +static bool is_c10_type(const ScalarType& type) { + return type < ScalarType::Undefined; +} + bool is_integral(const ScalarType& type) { switch (type) { case ScalarType::Bool: @@ -38,6 +42,13 @@ bool is_floating_point(const ScalarType& type) { return false; } +bool is_signed(const ScalarType& type) { + if (is_c10_type(type)) { + return c10::isSignedType(static_cast(type)); + } + return false; +} + Dtype Dtype::scalar_dtype() const { return ToDtype(scalar_type_); } diff --git a/torch/csrc/jit/tensorexpr/types.h b/torch/csrc/jit/tensorexpr/types.h index 3e8ec36ec2f3..29ccf06ef035 100644 --- a/torch/csrc/jit/tensorexpr/types.h +++ b/torch/csrc/jit/tensorexpr/types.h @@ -37,6 +37,7 @@ TORCH_API std::ostream& operator<<( TORCH_API bool is_integral(const ScalarType& type); TORCH_API bool is_floating_point(const ScalarType& type); +TORCH_API bool is_signed(const ScalarType& type); // Data types for scalar and vector elements. class TORCH_API Dtype { @@ -75,6 +76,9 @@ class TORCH_API Dtype { bool is_floating_point() const { return tensorexpr::is_floating_point(scalar_type_); } + bool is_signed() const { + return tensorexpr::is_signed(scalar_type_); + } Dtype cloneWithScalarType(ScalarType nt) const { return Dtype(nt, lanes_); From eaac28192c2bd5d6a05e794d38c45d9bbdf4e61a Mon Sep 17 00:00:00 2001 From: Bert Maher Date: Fri, 11 Dec 2020 22:34:54 -0800 Subject: [PATCH 208/250] [te] Use Dtype::is_signed instead of an ad hoc local predicate. (#49147) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49147 D25456366 adds Dtype::is_signed (which is backed by c10::isSignedType), so use that instead of this one-off. 
ghstack-source-id: 118415315 Test Plan: ``` buck test //caffe2/test{:jit,:tensorexpr,/cpp/tensorexpr:tensorexpr} ``` Reviewed By: robieta Differential Revision: D25456683 fbshipit-source-id: 428f1e8bff21ea05730690226a44984995c4c138 --- torch/csrc/jit/tensorexpr/llvm_codegen.cpp | 30 +++++++--------------- 1 file changed, 9 insertions(+), 21 deletions(-) diff --git a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp index 8dccb66b52f4..f4fd647be3af 100644 --- a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp +++ b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp @@ -48,18 +48,6 @@ namespace jit { namespace tensorexpr { namespace { -bool is_unsigned_integral(const ScalarType& type) { - switch (type) { - case ScalarType::Bool: - case ScalarType::Byte: - return true; - default: - return false; - } - - return false; -} - llvm::CmpInst::Predicate llvm_comparison_predicate( CompareSelectOperation compare_op, const ScalarType& type) { @@ -69,17 +57,17 @@ llvm::CmpInst::Predicate llvm_comparison_predicate( case CompareSelectOperation::kNE: return llvm::ICmpInst::ICMP_NE; case CompareSelectOperation::kGT: - return is_unsigned_integral(type) ? llvm::ICmpInst::ICMP_UGT - : llvm::ICmpInst::ICMP_SGT; + return is_signed(type) ? llvm::ICmpInst::ICMP_SGT + : llvm::ICmpInst::ICMP_UGT; case CompareSelectOperation::kGE: - return is_unsigned_integral(type) ? llvm::ICmpInst::ICMP_UGE - : llvm::ICmpInst::ICMP_SGE; + return is_signed(type) ? llvm::ICmpInst::ICMP_SGE + : llvm::ICmpInst::ICMP_UGE; case CompareSelectOperation::kLT: - return is_unsigned_integral(type) ? llvm::ICmpInst::ICMP_ULT - : llvm::ICmpInst::ICMP_SLT; + return is_signed(type) ? llvm::ICmpInst::ICMP_SLT + : llvm::ICmpInst::ICMP_ULT; case CompareSelectOperation::kLE: - return is_unsigned_integral(type) ? llvm::ICmpInst::ICMP_ULE - : llvm::ICmpInst::ICMP_SLE; + return is_signed(type) ? llvm::ICmpInst::ICMP_SLE + : llvm::ICmpInst::ICMP_ULE; default: // TODO: change to a proper error report throw std::runtime_error("invalid operator type"); @@ -1673,7 +1661,7 @@ void LLVMCodeGenImpl::visit(const Intrinsics* v) { } else if (v->dtype().is_integral() && v->op_type() == kFabs) { // abs is only intrinsic defined for integer inputs in pytorch eager v->params().front()->accept(this); - if (is_unsigned_integral(v->dtype().scalar_type())) { + if (!v->dtype().is_signed()) { return; } // TODO: use llvm.abs intrinsic for LLVM 12 From dc92f25b383ee4480ca1a87780482d5c7e8ac926 Mon Sep 17 00:00:00 2001 From: Bert Maher Date: Fri, 11 Dec 2020 22:34:54 -0800 Subject: [PATCH 209/250] [te] Use c10::ScalarType utility functions in te::Dtype (#49148) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49148 Instead of defining our own variants. I'm pretty sure this fixes a bug too, in that Bfloat16 wasn't being considered FP. Otoh, I don't think it's possible to create TEs with Bfloat so... 
ghstack-source-id: 118415314 Test Plan: `buck test //caffe2/test:jit` Reviewed By: robieta Differential Revision: D25456767 fbshipit-source-id: bd5822114b76c4fde82f566308909bd2a55f4f21 --- torch/csrc/jit/tensorexpr/types.cpp | 36 ++++++++--------------------- 1 file changed, 9 insertions(+), 27 deletions(-) diff --git a/torch/csrc/jit/tensorexpr/types.cpp b/torch/csrc/jit/tensorexpr/types.cpp index 0298950cbf37..ae9bdcf1986c 100644 --- a/torch/csrc/jit/tensorexpr/types.cpp +++ b/torch/csrc/jit/tensorexpr/types.cpp @@ -14,39 +14,21 @@ static bool is_c10_type(const ScalarType& type) { } bool is_integral(const ScalarType& type) { - switch (type) { - case ScalarType::Bool: - case ScalarType::Byte: - case ScalarType::Char: - case ScalarType::Short: - case ScalarType::Int: - case ScalarType::Long: - return true; - default: - return false; - } - - return false; + return is_c10_type(type) + ? c10::isIntegralType(static_cast(type), true) + : false; } bool is_floating_point(const ScalarType& type) { - switch (type) { - case ScalarType::Half: - case ScalarType::Float: - case ScalarType::Double: - return true; - default: - return false; - } - - return false; + return is_c10_type(type) + ? c10::isFloatingType(static_cast(type)) + : false; } bool is_signed(const ScalarType& type) { - if (is_c10_type(type)) { - return c10::isSignedType(static_cast(type)); - } - return false; + return is_c10_type(type) + ? c10::isSignedType(static_cast(type)) + : false; } Dtype Dtype::scalar_dtype() const { From 717f31d9846f9b7707f9b1ab6076e1353f399119 Mon Sep 17 00:00:00 2001 From: Chen Lai Date: Fri, 11 Dec 2020 23:41:55 -0800 Subject: [PATCH 210/250] Remove unused reconstruct_scopes function (#48822) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48822 Test Plan: Imported from OSS Reviewed By: ZolotukhinM Differential Revision: D25325012 Pulled By: cccclai fbshipit-source-id: 86ea4c0b2926257c0f82aa05cbcd83278b1b67f7 --- test/test_jit.py | 50 ----- tools/build_variables.bzl | 1 - torch/csrc/jit/passes/reconstruct_scopes.cpp | 206 ------------------ torch/csrc/jit/passes/reconstruct_scopes.h | 37 ---- torch/csrc/jit/python/init.cpp | 11 - .../csrc/jit/serialization/export_module.cpp | 1 - 6 files changed, 306 deletions(-) delete mode 100644 torch/csrc/jit/passes/reconstruct_scopes.cpp delete mode 100644 torch/csrc/jit/passes/reconstruct_scopes.h diff --git a/test/test_jit.py b/test/test_jit.py index 239e4660674b..3a3e87d49e82 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -1248,56 +1248,6 @@ def forward(self, x): FileCheck().check("my::matched_conv_bn").run(m._c._get_method("forward").graph) - def test_reconstruct_scopes(self): - class SubModule(torch.nn.Module): - def __init__(self): - super(SubModule, self).__init__() - - def bar(self, x): - return x + x - - def forward(self, x): - return x * self.bar(x) - - class MyModule(torch.nn.Module): - def __init__(self): - super(MyModule, self).__init__() - self.sub = SubModule() - - def forward(self, x): - return self.sub(x) + x - - traced = torch.jit.trace(MyModule(), torch.zeros(1)) - g = traced.graph - torch._C._jit_pass_inline(g) - torch._C._jit_pass_reconstruct_scopes(traced._c, g) - FileCheck().check("scope: top(MyModule).sub(SubModule).forward").run(g) - - def test_reconstruct_scopes_duplicated_class_types(self): - class SubModule(torch.nn.Module): - def __init__(self): - super(SubModule, self).__init__() - - def forward(self, x): - return x + 2 - - class MyModule(torch.nn.Module): - def __init__(self): - super(MyModule, 
self).__init__() - self.sub1 = SubModule() - self.sub2 = SubModule() - - def forward(self, x): - return self.sub1(x) + self.sub2(x) - - traced = torch.jit.trace(MyModule(), torch.zeros(1)) - g = traced.graph - torch._C._jit_pass_inline(g) - torch._C._jit_pass_reconstruct_scopes(traced._c, g) - FileCheck().check_dag("scope: top(MyModule).sub1(SubModule).forward") \ - .check_dag("scope: top(MyModule).sub2(SubModule).forward") \ - .run(g) - def test_expand_quantlint(self): pass diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index 8b6374e9d71c..6c9ad0d5d6e1 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -176,7 +176,6 @@ core_sources_full_mobile = [ "torch/csrc/jit/passes/erase_number_types.cpp", "torch/csrc/jit/passes/fixup_trace_scope_blocks.cpp", "torch/csrc/jit/passes/freeze_module.cpp", - "torch/csrc/jit/passes/reconstruct_scopes.cpp", "torch/csrc/jit/passes/fuse_linear.cpp", "torch/csrc/jit/passes/fuse_relu.cpp", "torch/csrc/jit/passes/graph_fuser.cpp", diff --git a/torch/csrc/jit/passes/reconstruct_scopes.cpp b/torch/csrc/jit/passes/reconstruct_scopes.cpp deleted file mode 100644 index 15aa5863fbf1..000000000000 --- a/torch/csrc/jit/passes/reconstruct_scopes.cpp +++ /dev/null @@ -1,206 +0,0 @@ -#include -#include - -namespace torch { -namespace jit { - -class ReconstructScopesPass { - public: - ReconstructScopesPass(const Module& m, Graph& g, std::string p) - : root_module_(m), - graph_(g), - prefix_(std::move(p)), - class_types_are_not_unique_(false){}; - void run(); - - private: - const Module& root_module_; - Graph& graph_; - std::string prefix_; - - // This boolean indicates whether there are two submodules of the same - // class type. This issue may occur in a scripted module and make it - // difficult to exactly track module information corresponding to each - // Node* after inlining the graph. Consider the following example: - - // class A(nn.Module): - // def __init__(self): - // super(A, self).__init__() - - // def forward(self, x): - // return x + 1 - - // class B(nn.Module): - // def __init__(self): - // super(B, self).__init__() - // self.A0 = A() - // self.A1 = A() - - // def forward(self, x): - // return self.A0(x) + self.A1(x) - - // m_traced = torch.jit.trace(B(), torch.Tensor([1])) - // m_scripted = torch.jit.script(B()) - - // In m_traced, self.A0 and self.A1 have different class types, but in - // m_scripted, self.A0 and self.A1 have the same class types. Therefore, - // it is difficult to distinguish 'A0' and 'A1' in the module hierarchy - // after the graph is inlined. In this case, we add a warning to let - // users know that the debugging information may be incomplete. 
- bool class_types_are_not_unique_; - - std::unordered_map func_to_module_; - std::unordered_map module_names_; - - void visitBlock(Block* b, const std::string& root_scope_string); - void visitNode(Node* n, const std::string& root_scope_string); - - std::string getModuleTypeName( - const Module& module, - const std::string& prefix); - void constructFunctionToModuleMap(const Module& module); - void constructRelativeNamesForModules( - const Module& module, - const std::string& prefix); - - std::string getScopeString(const InlinedCallStackEntry& frame) const; - - void appendSourceRangeInfo( - std::string& scopeString, - const InlinedCallStackEntry& frame) const; -}; - -void ReconstructScopesPass::constructFunctionToModuleMap(const Module& module) { - for (const auto& method : module.get_methods()) { - Function* func_ptr = &method.function(); - if (!class_types_are_not_unique_ && - func_to_module_.find(func_ptr) != func_to_module_.end()) { - class_types_are_not_unique_ = true; - } - func_to_module_[func_ptr] = module._ivalue(); - } - for (const Module& m : module.children()) { - constructFunctionToModuleMap(m); - } -} - -std::string ReconstructScopesPass::getModuleTypeName( - const Module& module, - const std::string& prefix) { - std::string moduleType = module.type()->str(); - size_t lastDotIndex = moduleType.rfind('.'); - if (lastDotIndex != std::string::npos) { - moduleType = moduleType.substr(lastDotIndex + 1); - } - return prefix + "(" + moduleType + ")"; -} - -void ReconstructScopesPass::constructRelativeNamesForModules( - const Module& module, - const std::string& prefix) { - module_names_[module._ivalue()] = getModuleTypeName(module, prefix); - for (const NameModule& s : module.named_children()) { - constructRelativeNamesForModules( - s.value, module_names_[module._ivalue()] + "." + s.name); - } -} - -void ReconstructScopesPass::appendSourceRangeInfo( - std::string& scopeString, - const InlinedCallStackEntry& frame) const { - SourceRange r = std::get<1>(frame); - if (r.source()) { - if (auto orig = r.source()->findSourceRangeThatGenerated(r)) { - r = *orig; - } - } - if (auto file_line_col = r.file_line_col()) { - std::string filename; - size_t line, col; - std::tie(filename, line, col) = *file_line_col; - scopeString += "<" + filename + ":" + c10::to_string(line) + ":" + - c10::to_string(col) + ">"; - } -} - -std::string ReconstructScopesPass::getScopeString( - const InlinedCallStackEntry& frame) const { - Function* f = std::get<0>(frame); - if (!func_to_module_.count(f)) { - return ""; - } - auto m = func_to_module_.at(f); - if (!module_names_.count(m)) { - return ""; - } - std::string scopeString = module_names_.at(m) + "." + f->name(); - - // When class types are not unique, the module information may be - // incomplele. In this case, we add source range information, - // which can be helpful for debugging purposes. 
- if (class_types_are_not_unique_) { - appendSourceRangeInfo(scopeString, frame); - } - return scopeString; -} - -void ReconstructScopesPass::visitNode( - Node* n, - const std::string& root_scope_string) { - for (Block* b : n->blocks()) { - visitBlock(b, root_scope_string); - } - ScopePtr sc = c10::make_intrusive(); - if (!n->callstack()) { - sc = sc->push(Symbol::scope(root_scope_string)); - } else { - for (const auto& frame : (*n->callstack())->vec()) { - auto name = getScopeString(frame); - GRAPH_UPDATE("Adding a scope ", name, " for node ", *n); - sc = sc->push(Symbol::scope(name)); - } - } - n->setScope(sc); - GRAPH_UPDATE("Updated node: ", *n); -} - -void ReconstructScopesPass::visitBlock( - Block* b, - const std::string& root_scope_string) { - for (Node* n : b->nodes()) { - visitNode(n, root_scope_string); - } -} - -void ReconstructScopesPass::run() { - GRAPH_DUMP("Graph before reconstructing scope", &graph_); - func_to_module_.clear(); - module_names_.clear(); - - constructFunctionToModuleMap(root_module_); - constructRelativeNamesForModules(root_module_, prefix_); - - if (class_types_are_not_unique_) { - TORCH_WARN( - "It seems that the module contain two instances of the same class type.\n", - "The current debugging program has not provided support for distinguishing ", - "the two instances of the same class type.\n", - "The module debugging information may be incomplete."); - } - - std::string root_scope_string = - getModuleTypeName(root_module_, prefix_) + ".forward"; - visitBlock(graph_.block(), root_scope_string); - GRAPH_DUMP("Graph after reconstructing scope", &graph_); -} - -void ReconstructScopes( - const Module& module, - Graph& g, - const std::string& prefix = "top") { - ReconstructScopesPass p(module, g, prefix); - p.run(); -} - -} // namespace jit -} // namespace torch diff --git a/torch/csrc/jit/passes/reconstruct_scopes.h b/torch/csrc/jit/passes/reconstruct_scopes.h deleted file mode 100644 index b08655cb3741..000000000000 --- a/torch/csrc/jit/passes/reconstruct_scopes.h +++ /dev/null @@ -1,37 +0,0 @@ -/** \brief A pass to reconstruct scopes of nodes from their inline callstacks. - * - * The pass takes the root module and a graph and for every graph node with - * non-empty inline call-stack it computes the scope from this callstack. - * - * Callstack can be thought of as a stack of pointers to Function, and Function - * in a general case may not be a part of any module. That's why this pass - * requires a root module to be passed in - we can traverse all methods of the - * module and its submodules and then recognize these methods in callstacks. - * - * Scope can be thought of as a stack of strings, so we basically converting a - * pointer to Function to a string, or in other words trying to find a name for - * a function in this module hierarchy. - * - * The produced scopes look like: - * top.submod1.function1/top.submod1.subsubmod1.function2 - * - * 'top' is the name we use for the root module itself, and it can be customized - * with an optional third argument of the pass. - * - * The pass would not change anything if inlining has not been run on the graph. 
- */ -#pragma once - -#include -#include - -namespace torch { -namespace jit { - -TORCH_API void ReconstructScopes( - const Module& module, - Graph& g, - const std::string& prefix); - -} // namespace jit -} // namespace torch diff --git a/torch/csrc/jit/python/init.cpp b/torch/csrc/jit/python/init.cpp index 5f88a8a6c79d..663c9ab06a52 100644 --- a/torch/csrc/jit/python/init.cpp +++ b/torch/csrc/jit/python/init.cpp @@ -54,7 +54,6 @@ #include #include #include -#include #include #include #include @@ -340,16 +339,6 @@ void initJITBindings(PyObject* module) { subgraph_rewriter.RegisterRewritePattern(pattern, fused_node_name); subgraph_rewriter.runOnGraph(g); }) - .def( - "_jit_pass_reconstruct_scopes", - [](script::Module& module, - std::shared_ptr& g, - const std::string& prefix) { - ReconstructScopes(module, *g, prefix); - }, - py::arg("module"), - py::arg("graph"), - py::arg("prefix") = "top") .def( "_jit_pass_remove_inplace_ops", [](const std::shared_ptr& g) { return RemoveInplaceOps(g); }) diff --git a/torch/csrc/jit/serialization/export_module.cpp b/torch/csrc/jit/serialization/export_module.cpp index e9f9d27bf166..3291285b90dd 100644 --- a/torch/csrc/jit/serialization/export_module.cpp +++ b/torch/csrc/jit/serialization/export_module.cpp @@ -5,7 +5,6 @@ #include #include #include -#include #include #include #include From cd927875e0c9e7c4b84b81cefbe2a209ce0a673c Mon Sep 17 00:00:00 2001 From: Hao Lu Date: Sat, 12 Dec 2020 00:49:52 -0800 Subject: [PATCH 211/250] [pt] Replace size(dim) with sizes()[dim] (#49255) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49255 - Replace `size(dim)` with `sizes()[dim]` because `sizes()` does not go through the dispatcher and is marginally better. - Remove unnecessary `size(dim)` and `sizes()` calls by saving the return value of `sizes()` to a temporary var. 
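To make the intent concrete, a minimal sketch of the before/after pattern (the helper name and shapes are made up for illustration; the real changes are in `qembeddingbag.cpp` below):

```
// Illustrative only: call sizes() once and index the cached result instead
// of repeated size(dim) calls, which each go through the dispatcher.
#include <ATen/ATen.h>

int64_t num_weight_elements(const at::Tensor& weight) {
  // Before: return weight.size(0) * weight.size(1);
  const auto weight_sizes = weight.sizes();  // single, non-dispatched call
  return weight_sizes[0] * weight_sizes[1];
}
```
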
Reviewed By: radkris-git Differential Revision: D25488129 fbshipit-source-id: 4039e0609df20d5888666a71ad93b15e9a2182c5 --- .../native/quantized/cpu/qembeddingbag.cpp | 25 +++++++++++-------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp b/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp index 28f4c6a6eceb..c4e92dd039e2 100644 --- a/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp +++ b/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp @@ -45,11 +45,12 @@ at::Tensor embedding_bag_4bit_impl( } } - const int64_t N = weight.size(0); - const int64_t weight_size = weight.size(1); + const auto weight_sizes = weight.sizes(); + const int64_t N = weight_sizes[0]; + const int64_t weight_size = weight_sizes[1]; const int64_t D = (weight_size - 4) * 2; // NB: 2-byte fp16 scale and 2-byte zero_offset - const int64_t M = offsets.size(0); + const int64_t M = offsets.sizes()[0]; int64_t output_size = M - 1; std::vector offsets_include_last_val; @@ -231,9 +232,10 @@ at::Tensor embedding_bag_byte_impl( } } - const int64_t N = weight.size(0); - const int64_t D = weight.size(1) - 8; // NB: -8 to account for scale and bias - const int64_t M = offsets.size(0); + const auto weight_sizes = weight.sizes(); + const int64_t N = weight_sizes[0]; + const int64_t D = weight_sizes[1] - 8; // NB: -8 to account for scale and bias + const int64_t M = offsets.sizes()[0]; int64_t output_size = M - 1; std::vector offsets_include_last_val; @@ -254,7 +256,8 @@ at::Tensor embedding_bag_byte_impl( } std::vector shape; if (indices.dim() == 2 && is_embedding_op) { - shape = {indices.size(0), indices.size(1), D}; + const auto indices_sizes = indices.sizes(); + shape = {indices_sizes[0], indices_sizes[1], D}; } else { shape = {output_size, D}; } @@ -350,8 +353,8 @@ at::Tensor embedding_bag_byte_helper( !offsets_in.has_value(), "embedding_bag_byte operator: input is 2D, then offsets has to be None, as input is treated is a mini-batch of fixed length sequences."); - offsets = - at::arange(0, indices.numel(), indices.size(1), indices.scalar_type()); + offsets = at::arange( + 0, indices.numel(), indices.sizes()[1], indices.scalar_type()); } else { TORCH_CHECK( offsets_in.has_value(), @@ -443,8 +446,8 @@ at::Tensor embedding_bag_4bit_helper( !offsets_in.has_value(), "embedding_bag_4bit operator: input is 2D, then offsets has to be None, as input is treated is a mini-batch of fixed length sequences."); - offsets = - at::arange(0, indices.numel(), indices.size(1), indices.scalar_type()); + offsets = at::arange( + 0, indices.numel(), indices.sizes()[1], indices.scalar_type()); } else { TORCH_CHECK( offsets_in.has_value(), From 33b7970d9e5ccd7f9a097ac2d951eecfb82a58cc Mon Sep 17 00:00:00 2001 From: Alexander Golynski Date: Sat, 12 Dec 2020 06:46:55 -0800 Subject: [PATCH 212/250] fix slow windows test (#49258) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49258 Tested by adding `time.sleep(3) ` in SubProcess.run and see test print "test_inherit_tensor: SubProcess too slow" Sample failure: https://app.circleci.com/pipelines/github/pytorch/pytorch/249756/workflows/3605479e-1020-4325-9a4c-8bde5ae38262/jobs/9550663 Test Plan: Imported from OSS Reviewed By: supriyar Differential Revision: D25507209 Pulled By: agolynski fbshipit-source-id: ec808f0f658d0fb4c8447f68ec5ceba2aa66b1b5 --- test/test_multiprocessing.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/test/test_multiprocessing.py b/test/test_multiprocessing.py 
index 49e0a3cb45c0..75b486043c42 100644 --- a/test/test_multiprocessing.py +++ b/test/test_multiprocessing.py @@ -368,8 +368,11 @@ def test_inherit_tensor(self): t = torch.zeros(5, 5) p = SubProcess(t.share_memory_()) p.start() - p.join(1) - self.assertEqual(t, torch.ones(5, 5) * 3, atol=0, rtol=0) + p.join(2) + if p.exitcode is None: + print("test_inherit_tensor: SubProcess too slow") + else: + self.assertEqual(t, torch.ones(5, 5) * 3, atol=0, rtol=0) @unittest.skipIf(IS_WINDOWS, "Test needs to use fork multiprocessing") def test_autograd_errors(self): From dc4db95540da06623c747bf0f2bf9f4af3d2925a Mon Sep 17 00:00:00 2001 From: Pritam Damania Date: Sat, 12 Dec 2020 17:10:50 -0800 Subject: [PATCH 213/250] Update pipeline API to accept arbitrary sequence of Tensors and not just Tuple (#48467) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48467 The current API's forward method only accepted a Tensor or a Tuple of Tensors, making this more generic by accepting any Sequence of Tensors. ghstack-source-id: 118436340 Test Plan: waitforbuildbot Reviewed By: rohan-varma Differential Revision: D25181944 fbshipit-source-id: 4db251dad52c01abc69f3d327788f2e4289e6c9d --- test/distributed/_pipeline/sync/test_pipe.py | 54 ++++++++++++++++++- .../_pipeline/sync/balance/__init__.py | 4 +- .../_pipeline/sync/balance/profile.py | 4 +- .../distributed/_pipeline/sync/checkpoint.py | 13 ++++- torch/distributed/_pipeline/sync/copy.py | 4 +- .../distributed/_pipeline/sync/microbatch.py | 11 ++-- torch/distributed/_pipeline/sync/pipe.py | 8 +-- torch/distributed/_pipeline/sync/pipeline.py | 4 +- .../_pipeline/sync/skip/skippable.py | 3 +- 9 files changed, 84 insertions(+), 21 deletions(-) diff --git a/test/distributed/_pipeline/sync/test_pipe.py b/test/distributed/_pipeline/sync/test_pipe.py index 8b87fa3d31f6..ad00d9ffc0d8 100644 --- a/test/distributed/_pipeline/sync/test_pipe.py +++ b/test/distributed/_pipeline/sync/test_pipe.py @@ -258,6 +258,31 @@ def forward(self, x): assert counter == 2 +def test_nested_input(setup_rpc): + class NestedInput(nn.Module): + def __init__(self): + super().__init__() + self.fc_a = nn.Linear(1, 1) + self.fc_b = nn.Linear(1, 1) + + def forward(self, inp): + return inp + + model = nn.Sequential(NestedInput()) + model = Pipe(model, chunks=2) + + a = torch.rand(10, 1, requires_grad=True) + b = torch.rand(10, 1, requires_grad=True) + + # TypeError: expected Tensor, but got tuple + with pytest.raises(TypeError): + model((a, (a, b))).local_value() + + # TypeError: expected Tensor, but got list + with pytest.raises(TypeError): + model((a, [a, b])).local_value() + + def test_input_pair(setup_rpc): class Two(nn.Module): def __init__(self): @@ -282,6 +307,17 @@ def forward(self, a_and_b): assert a.grad is not None assert b.grad is not None + # Test with list. 
+ a.grad = None + b.grad = None + a_out, b_out = model([a, b]).local_value() + loss = (a_out + b_out).mean() + loss.backward() + + assert a.grad is not None + assert b.grad is not None + + def test_input_singleton(setup_rpc): class One(nn.Module): @@ -305,6 +341,18 @@ def forward(self, only_a): assert all(p.grad is not None for p in model.parameters()) assert a.grad is not None + # Test with list + a.grad = None + for p in model.parameters(): + p.grad = None + + (a_out,) = model([a]).local_value() + loss = a_out.mean() + loss.backward() + + assert all(p.grad is not None for p in model.parameters()) + assert a.grad is not None + def test_input_varargs(setup_rpc): model = nn.Sequential(nn.Linear(1, 1)) @@ -336,7 +384,7 @@ def forward(self, _): model("hello") -def test_non_tensor_tuple(setup_rpc): +def test_non_tensor_sequence(setup_rpc): class NonTensorTuple(nn.Module): def forward(self, x): return (x, "hello") @@ -353,6 +401,10 @@ def forward(self, x): with pytest.raises(TypeError): model((x, "hello")) + # TypeError: expected Tensor to scatter, but got str + with pytest.raises(TypeError): + model([x, "hello"]) + @pytest.mark.parametrize("checkpoint", ["never", "always", "except_last"]) def test_deferred_batch_norm(checkpoint, setup_rpc): diff --git a/torch/distributed/_pipeline/sync/balance/__init__.py b/torch/distributed/_pipeline/sync/balance/__init__.py index 15aa53bc1a2c..8c6da586657f 100644 --- a/torch/distributed/_pipeline/sync/balance/__init__.py +++ b/torch/distributed/_pipeline/sync/balance/__init__.py @@ -18,7 +18,7 @@ pipe = Pipe(model, balance, chunks=8) """ -from typing import List, Tuple, Union +from typing import List, Union, Sequence import torch from torch import Tensor @@ -32,7 +32,7 @@ Device = Union[torch.device, int, str] -Tensors = Tuple[Tensor, ...] +Tensors = Sequence[Tensor] TensorOrTensors = Union[Tensor, Tensors] diff --git a/torch/distributed/_pipeline/sync/balance/profile.py b/torch/distributed/_pipeline/sync/balance/profile.py index 737dda60f6fa..382da988e808 100644 --- a/torch/distributed/_pipeline/sync/balance/profile.py +++ b/torch/distributed/_pipeline/sync/balance/profile.py @@ -7,7 +7,7 @@ """Per-layer profilers.""" import copy import time -from typing import Generator, List, Tuple, Union +from typing import Generator, List, Union, Sequence import torch from torch import Tensor @@ -20,7 +20,7 @@ Device = Union[torch.device, int, str] -Tensors = Tuple[Tensor, ...] +Tensors = Sequence[Tensor] TensorOrTensors = Union[Tensor, Tensors] diff --git a/torch/distributed/_pipeline/sync/checkpoint.py b/torch/distributed/_pipeline/sync/checkpoint.py index bad5eec19469..3f9240793183 100644 --- a/torch/distributed/_pipeline/sync/checkpoint.py +++ b/torch/distributed/_pipeline/sync/checkpoint.py @@ -27,7 +27,16 @@ from collections import deque from contextlib import contextmanager import threading -from typing import TYPE_CHECKING, Deque, Generator, List, Optional, Tuple, Union +from typing import ( + TYPE_CHECKING, + Deque, + Generator, + List, + Optional, + Union, + Sequence, + Tuple +) import torch from torch import Tensor @@ -40,7 +49,7 @@ __all__ = ["is_checkpointing", "is_recomputing"] -Tensors = Tuple[Tensor, ...] +Tensors = Sequence[Tensor] TensorOrTensors = Union[Tensor, Tensors] # Types for shared memory between Checkpoint and Recompute. 
diff --git a/torch/distributed/_pipeline/sync/copy.py b/torch/distributed/_pipeline/sync/copy.py index 3d330f59eeee..07e71a87ce08 100644 --- a/torch/distributed/_pipeline/sync/copy.py +++ b/torch/distributed/_pipeline/sync/copy.py @@ -8,7 +8,7 @@ and computation on the same GPU. """ from collections import deque -from typing import Deque, List, Optional, Tuple +from typing import Deque, List, Optional, Tuple, Sequence import torch from torch import Tensor @@ -18,7 +18,7 @@ __all__: List[str] = [] -Tensors = Tuple[Tensor, ...] +Tensors = Sequence[Tensor] # Common interface between :class:`Copy` and :class:`Wait`. diff --git a/torch/distributed/_pipeline/sync/microbatch.py b/torch/distributed/_pipeline/sync/microbatch.py index d38cb6d3b85c..fc4daf7a9b42 100644 --- a/torch/distributed/_pipeline/sync/microbatch.py +++ b/torch/distributed/_pipeline/sync/microbatch.py @@ -6,7 +6,7 @@ # LICENSE file in the root directory of this source tree. """Manipulation of micro-batches.""" import typing -from typing import Callable, Iterable, Iterator, List, Tuple, Union, cast +from typing import Callable, Iterable, Iterator, List, Union, cast, Sequence import torch from torch import Tensor @@ -15,7 +15,7 @@ __all__: List[str] = [] -Tensors = Tuple[Tensor, ...] +Tensors = Sequence[Tensor] TensorOrTensors = Union[Tensor, Tensors] Function = Callable[[TensorOrTensors], TensorOrTensors] @@ -110,7 +110,7 @@ def __setitem__(self, index: Union[int, slice], value: TensorOrTensors) -> None: def _setitem_by_index(self, index: int, value: Tensor) -> None: if not self.atomic: i = index - self.value = self.value[:i] + (value,) + self.value[i + 1 :] + self.value = self.value[:i] + (value,) + self.value[i + 1 :] # type: ignore return if index != 0: @@ -139,9 +139,10 @@ def check(input: TensorOrTensors) -> None: TypeError: input is not a tensor or tensors. """ - if isinstance(input, tuple): + if isinstance(input, Sequence): for x in input: - check(x) + if not isinstance(x, Tensor): + raise TypeError(f"expected Tensor, but got {input.__class__.__name__}") return if not isinstance(input, Tensor): diff --git a/torch/distributed/_pipeline/sync/pipe.py b/torch/distributed/_pipeline/sync/pipe.py index a097e8aa1a9e..82db93060d91 100644 --- a/torch/distributed/_pipeline/sync/pipe.py +++ b/torch/distributed/_pipeline/sync/pipe.py @@ -6,7 +6,7 @@ # LICENSE file in the root directory of this source tree. """The Pipe interface.""" from collections import OrderedDict -from typing import TYPE_CHECKING, Any, Iterable, List, Optional, Tuple, Union, cast +from typing import TYPE_CHECKING, Any, Iterable, List, Optional, Tuple, Union, cast, Sequence import torch from torch import Tensor, nn @@ -27,7 +27,7 @@ Device = Union[torch.device, int, str] Devices = Union[Iterable[Device], List[Device]] -Tensors = Tuple[Tensor, ...] +Tensors = Sequence[Tensor] TensorOrTensors = Union[Tensor, Tensors] if TYPE_CHECKING: @@ -310,11 +310,11 @@ def forward(self, input: TensorOrTensors) -> RRef[TensorOrTensors]: # type: ign """:class:`Pipe` is a fairly transparent module wrapper. It doesn't modify the input and output signature of the underlying module. But there's type restriction. Input and output have to be a - :class:`~torch.Tensor` or a tuple of tensors. This restriction is + :class:`~torch.Tensor` or a sequence of tensors. This restriction is applied at partition boundaries too. 
Args: - input (torch.Tensor or Tuple[torch.Tensor, ...]): input mini-batch + input (torch.Tensor or Sequence[torch.Tensor]): input mini-batch Returns: :class:`~torch.distributed.rpc.RRef` to the output of the mini-batch diff --git a/torch/distributed/_pipeline/sync/pipeline.py b/torch/distributed/_pipeline/sync/pipeline.py index 86c8dfddebeb..72c04c6f28d0 100644 --- a/torch/distributed/_pipeline/sync/pipeline.py +++ b/torch/distributed/_pipeline/sync/pipeline.py @@ -7,7 +7,7 @@ """The pipeline parallelism of Pipe.""" from queue import Queue from types import TracebackType -from typing import TYPE_CHECKING, Iterable, List, Optional, Tuple, Type, Union, cast +from typing import TYPE_CHECKING, Iterable, List, Optional, Tuple, Type, Union, cast, Sequence import torch from torch import Tensor, nn @@ -25,7 +25,7 @@ __all__: List[str] = [] -Tensors = Tuple[Tensor, ...] +Tensors = Sequence[Tensor] TensorOrTensors = Union[Tensor, Tensors] ExcInfo = Tuple[Type[BaseException], BaseException, TracebackType] diff --git a/torch/distributed/_pipeline/sync/skip/skippable.py b/torch/distributed/_pipeline/sync/skip/skippable.py index 9bb258382b9b..e0b0dae584a2 100644 --- a/torch/distributed/_pipeline/sync/skip/skippable.py +++ b/torch/distributed/_pipeline/sync/skip/skippable.py @@ -17,6 +17,7 @@ List, Optional, Set, + Sequence, Tuple, Type, TypeVar, @@ -33,7 +34,7 @@ __all__ = ["skippable", "stash", "pop", "verify_skippables"] -Tensors = Tuple[Tensor, ...] +Tensors = Sequence[Tensor] TensorOrTensors = Union[Tensor, Tensors] StashPop = Union["stash", "pop"] From f2ba3c1621f798f7c91ae7604ca24704fff2a815 Mon Sep 17 00:00:00 2001 From: Pritam Damania Date: Sun, 13 Dec 2020 17:52:07 -0800 Subject: [PATCH 214/250] Use group.WORLD appropriately in process group initialization. (#48767) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48767 As part of investigating https://github.com/pytorch/pytorch/issues/48464, I realized some weird inconsistency in how we use `_default_pg` and `group.WORLD`. `group.WORLD` apparently was an `object()` and never changed despite `_default_pg` changing. In this sense, `group.WORLD` was being used a constant to refer to the default pg, but wasn't of type PG at all. In fact the passed in group is also compared via `==` to `group.WORLD` in many places, and it just worked since the default argument was `group.WORLD`. To clean this up, I got rid of `_default_pg` completely and instead used `group.WORLD` as the default pg throughout the codebase. This also fixes the documentation issues mentioned in https://github.com/pytorch/pytorch/issues/48464. 
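A small usage sketch (illustrative, not part of this patch) of the resulting behavior, assuming init_process_group has already been called:

    import torch
    import torch.distributed as dist

    def demo(tensor: torch.Tensor) -> None:
        # group=None (the new default) resolves to the default process group.
        dist.all_reduce(tensor)
        # group.WORLD now points at that same ProcessGroup, not a sentinel object.
        dist.all_reduce(tensor, group=dist.group.WORLD)
        assert dist.group.WORLD is dist.distributed_c10d._get_default_group()

The assert relies on the internal _get_default_group() helper, the same one exercised by the new test_pass_default_pg test below.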
#Closes: https://github.com/pytorch/pytorch/issues/48464 ghstack-source-id: 118459779 Test Plan: waitforbuildbot Reviewed By: rohan-varma Differential Revision: D25292893 fbshipit-source-id: 9a1703c71610aee2591683ab60b010332e05e412 --- test/distributed/test_c10d.py | 14 ++ torch/distributed/distributed_c10d.py | 290 ++++++++++++++------------ 2 files changed, 166 insertions(+), 138 deletions(-) diff --git a/test/distributed/test_c10d.py b/test/distributed/test_c10d.py index 3b25be6e49c1..4b3e962d835f 100644 --- a/test/distributed/test_c10d.py +++ b/test/distributed/test_c10d.py @@ -3268,6 +3268,20 @@ def forward(self, x): loss = criterion(output, target) loss.backward() + @requires_nccl() + @skip_if_not_multigpu + def test_pass_default_pg(self): + dist.init_process_group( + "nccl", + init_method=f"file://{self.file_name}", + world_size=self.world_size, + rank=self.rank, + ) + + default_pg = c10d.distributed_c10d._get_default_group() + dist.destroy_process_group(default_pg) + self.assertFalse(dist.is_initialized()) + @requires_nccl() @skip_if_not_multigpu def test_save_load_checkpoint(self): diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py index 83260ec8dbdf..387da70403b0 100644 --- a/torch/distributed/distributed_c10d.py +++ b/torch/distributed/distributed_c10d.py @@ -147,7 +147,8 @@ def __getattribute__(self, key): class group(object): - WORLD = object() + # Points to the default PG once initialized. + WORLD: Optional[ProcessGroup] = None class GroupMember(object): @@ -166,7 +167,6 @@ class GroupMember(object): _pg_group_ranks: Dict[ProcessGroup, Dict[int, int]] = {} # Default process group state -_default_pg: Optional[ProcessGroup] = None _default_pg_init_method = None # Process group count for default naming @@ -177,7 +177,7 @@ def _rank_not_in_group(group: ProcessGroup): """ Helper that checks if the current process's rank is not in a given group. """ - if group == GroupMember.WORLD: + if group is None: return False return group == GroupMember.NON_GROUP_MEMBER @@ -214,23 +214,12 @@ def _get_global_rank(group, group_rank): raise RuntimeError("The group rank is not part of the group") -def _check_default_pg() -> ProcessGroup: - """ - Helper that checks if the default ProcessGroup has been initialized, with - assertion. - """ - if _default_pg is not None: - return _default_pg - else: - raise RuntimeError("Default process group is not initialized") - - def _get_group_size(group): """ Helper that gets a given group's world size. 
""" - if group is GroupMember.WORLD: - default_pg = _check_default_pg() + if group is GroupMember.WORLD or group is None: + default_pg = _get_default_group() return default_pg.size() if group not in _pg_group_ranks: raise RuntimeError("The given group does not exist") @@ -306,7 +295,7 @@ def is_initialized(): """ Checking if the default process group has been initialized """ - return _default_pg is not None + return GroupMember.WORLD is not None def _get_default_group(): @@ -316,7 +305,7 @@ def _get_default_group(): if not is_initialized(): raise RuntimeError("Default process group has not been initialized, " "please make sure to call init_process_group.") - return _default_pg + return GroupMember.WORLD def _get_default_store(): @@ -326,12 +315,15 @@ def _get_default_store(): if not is_initialized(): raise RuntimeError("Default process group has not been initialized, " "please make sure to call init_process_group.") - default_pg = _check_default_pg() + default_pg = _get_default_group() _, default_store = _pg_map[default_pg] return default_store +def _update_default_pg(pg): + GroupMember.WORLD = group.WORLD = pg + -def get_backend(group=group.WORLD): +def get_backend(group=None): """ Returns the backend of the given process group. @@ -344,8 +336,8 @@ def get_backend(group=group.WORLD): The backend of the given process group as a lower case string. """ - if group == GroupMember.WORLD: - pg = _check_default_pg() + if group is None: + pg = _get_default_group() else: pg = group if _rank_not_in_group(pg): @@ -421,14 +413,13 @@ def init_process_group(backend, """ global _pg_group_ranks global _backend - global _default_pg global _default_pg_init_method if not isinstance(timeout, timedelta): raise RuntimeError("Expected timeout argument to be of type" "datetime.timedelta") - if _default_pg is not None: + if GroupMember.WORLD is not None: raise RuntimeError("trying to initialize the default process group " "twice!") @@ -450,14 +441,14 @@ def init_process_group(backend, "are ignored since they are assigned by the " "MPI runtime.".format(world_size, rank)) - _default_pg = _new_process_group_helper( + _update_default_pg(_new_process_group_helper( -1, -1, [], Backend.MPI, None, group_name=group_name, - timeout=timeout) + timeout=timeout)) else: # backward compatible API if store is None: @@ -467,17 +458,17 @@ def init_process_group(backend, store, rank, world_size = next(rendezvous_iterator) store.set_timeout(timeout) - _default_pg = _new_process_group_helper( + _update_default_pg(_new_process_group_helper( world_size, rank, [], backend, store, group_name=group_name, - timeout=timeout) + timeout=timeout)) - _pg_group_ranks[_default_pg] = {i: i for i in range(_default_pg.size())} - _backend = _pg_map[_default_pg][0] + _pg_group_ranks[GroupMember.WORLD] = {i: i for i in range(GroupMember.WORLD.size())} # type: ignore + _backend = _pg_map[GroupMember.WORLD][0] # type: ignore _default_pg_init_method = init_method # barrier at the end to ensure that once we return from this method, all @@ -537,7 +528,7 @@ def _new_process_group_helper(world_size, # If this is a subgroup (which means group_ranks is specified), # we check if the current process is a member of the new group. 
if not is_default_group: - global_rank = _check_default_pg().rank() + global_rank = _get_default_group().rank() if global_rank not in group_ranks: return GroupMember.NON_GROUP_MEMBER @@ -576,7 +567,7 @@ def _new_process_group_helper(world_size, return pg -def destroy_process_group(group=group.WORLD): +def destroy_process_group(group=None): """ Destroy a given process group, and deinitialize the distributed package @@ -589,15 +580,14 @@ def destroy_process_group(group=group.WORLD): global _pg_map global _pg_names global _pg_group_ranks - global _default_pg global _default_pg_init_method global _group_count if group == GroupMember.NON_GROUP_MEMBER: return - if group == GroupMember.WORLD: - pg = _default_pg + if group is None: + pg = GroupMember.WORLD else: pg = group @@ -605,8 +595,8 @@ def destroy_process_group(group=group.WORLD): if _pg_map.get(pg, None) is None: raise RuntimeError("Invalid process group specified") - if group == GroupMember.WORLD: - _default_pg = None + if group is None or group == GroupMember.WORLD: + _update_default_pg(None) _default_pg_init_method = None _pg_map.clear() _pg_names.clear() @@ -627,7 +617,7 @@ def destroy_process_group(group=group.WORLD): del _pg_group_ranks[pg] -def get_rank(group=group.WORLD): +def get_rank(group=None): """ Returns the rank of current process group @@ -636,7 +626,8 @@ def get_rank(group=group.WORLD): ``world_size``. Arguments: - group (ProcessGroup, optional): The process group to work on + group (ProcessGroup, optional): The process group to work on. If None, + the default process group will be used. Returns: The rank of the process group @@ -646,19 +637,20 @@ def get_rank(group=group.WORLD): if _rank_not_in_group(group): return -1 - default_pg = _check_default_pg() - if group == GroupMember.WORLD: + default_pg = _get_default_group() + if group is None or group is GroupMember.WORLD: return default_pg.rank() return _get_group_rank(group, default_pg.rank()) -def get_world_size(group=group.WORLD): +def get_world_size(group=None): """ Returns the number of processes in the current process group Arguments: - group (ProcessGroup, optional): The process group to work on + group (ProcessGroup, optional): The process group to work on. If None, + the default process group will be used. Returns: The world size of the process group @@ -673,7 +665,7 @@ def get_world_size(group=group.WORLD): def isend(tensor, dst, - group=group.WORLD, + group=None, tag=0): """ Sends a tensor asynchronously. @@ -681,7 +673,8 @@ def isend(tensor, Arguments: tensor (Tensor): Tensor to send. dst (int): Destination rank. - group (ProcessGroup, optional): The process group to work on + group (ProcessGroup, optional): The process group to work on. If None, + the default process group will be used. tag (int, optional): Tag to match send with remote recv Returns: @@ -693,8 +686,8 @@ def isend(tensor, if _rank_not_in_group(group): return - if group == GroupMember.WORLD: - default_pg = _check_default_pg() + if group is None or group is GroupMember.WORLD: + default_pg = _get_default_group() return default_pg.send([tensor], dst, tag) else: group_dst_rank = _get_group_rank(group, dst) @@ -703,7 +696,7 @@ def isend(tensor, def irecv(tensor, src, - group=group.WORLD, + group=None, tag=0): """ Receives a tensor asynchronously. @@ -711,7 +704,8 @@ def irecv(tensor, Arguments: tensor (Tensor): Tensor to fill with received data. src (int): Source rank. - group (ProcessGroup, optional): The process group to work on + group (ProcessGroup, optional): The process group to work on. 
If None, + the default process group will be used. tag (int, optional): Tag to match recv with remote send Returns: @@ -723,8 +717,8 @@ def irecv(tensor, if _rank_not_in_group(group): return - if group == GroupMember.WORLD: - default_pg = _check_default_pg() + if group is None or group is GroupMember.WORLD: + default_pg = _get_default_group() return default_pg.recv([tensor], src, tag) else: group_src_rank = _get_group_rank(group, src) @@ -733,7 +727,7 @@ def irecv(tensor, def send(tensor, dst, - group=group.WORLD, + group=None, tag=0): """ Sends a tensor synchronously. @@ -741,7 +735,8 @@ def send(tensor, Arguments: tensor (Tensor): Tensor to send. dst (int): Destination rank. - group (ProcessGroup, optional): The process group to work on + group (ProcessGroup, optional): The process group to work on. If None, + the default process group will be used. tag (int, optional): Tag to match send with remote recv """ @@ -749,8 +744,8 @@ def send(tensor, if _rank_not_in_group(group): return - if group == GroupMember.WORLD: - default_pg = _check_default_pg() + if group is None or group is GroupMember.WORLD: + default_pg = _get_default_group() default_pg.send([tensor], dst, tag).wait() else: group_dst_rank = _get_group_rank(group, dst) @@ -759,7 +754,7 @@ def send(tensor, def recv(tensor, src=None, - group=group.WORLD, + group=None, tag=0): """ Receives a tensor synchronously. @@ -768,7 +763,8 @@ def recv(tensor, tensor (Tensor): Tensor to fill with received data. src (int, optional): Source rank. Will receive from any process if unspecified. - group (ProcessGroup, optional): The process group to work on + group (ProcessGroup, optional): The process group to work on. If None, + the default process group will be used. tag (int, optional): Tag to match recv with remote send Returns: @@ -780,8 +776,8 @@ def recv(tensor, if _rank_not_in_group(group): return -1 - if group == GroupMember.WORLD: - pg = _check_default_pg() + if group is None: + pg = _get_default_group() else: pg = group @@ -789,12 +785,12 @@ def recv(tensor, work = pg.recv_anysource([tensor], tag) work.wait() src_rank = work._source_rank() - if group == GroupMember.WORLD: + if group is None or group is GroupMember.WORLD: return src_rank else: return _get_global_rank(pg, src_rank) else: - if group == GroupMember.WORLD: + if group is None or group is GroupMember.WORLD: pg.recv([tensor], src, tag).wait() else: group_src_rank = _get_group_rank(pg, src) @@ -816,17 +812,18 @@ class P2POp(object): ``torch.distributed.irecv``. tensor (Tensor): Tensor to send or receive. peer (int): Destination or source rank. - group (ProcessGroup, optional): The process group to work on. + group (ProcessGroup, optional): The process group to work on. If None, + the default process group will be used. tag (int, optional): Tag to match send with recv. """ - def __init__(self, op, tensor, peer, group=group.WORLD, tag=0): + def __init__(self, op, tensor, peer, group=None, tag=0): self.op = op self.tensor = tensor self.peer = peer self.group = group self.tag = tag - def __new__(cls, op, tensor, peer, group=group.WORLD, tag=0): + def __new__(cls, op, tensor, peer, group=None, tag=0): _check_op(op) _check_single_tensor(tensor, "tensor") return object.__new__(cls) @@ -896,7 +893,7 @@ def batch_isend_irecv(p2p_op_list): def broadcast_multigpu(tensor_list, src, - group=group.WORLD, + group=None, async_op=False, src_tensor=0): """ @@ -920,7 +917,8 @@ def broadcast_multigpu(tensor_list, for all the distributed processes calling this function. src (int): Source rank. 
- group (ProcessGroup, optional): The process group to work on + group (ProcessGroup, optional): The process group to work on. If None, + the default process group will be used. async_op (bool, optional): Whether this op should be an async op src_tensor (int, optional): Source tensor rank within ``tensor_list`` @@ -936,8 +934,8 @@ def broadcast_multigpu(tensor_list, opts.rootRank = src opts.rootTensor = src_tensor - if group == GroupMember.WORLD: - default_pg = _check_default_pg() + if group is None or group is GroupMember.WORLD: + default_pg = _get_default_group() work = default_pg.broadcast(tensor_list, opts) else: group_src_rank = _get_group_rank(group, src) @@ -951,7 +949,7 @@ def broadcast_multigpu(tensor_list, def broadcast(tensor, src, - group=group.WORLD, + group=None, async_op=False): """ Broadcasts the tensor to the whole group. @@ -963,7 +961,8 @@ def broadcast(tensor, tensor (Tensor): Data to be sent if ``src`` is the rank of current process, and tensor to be used to save received data otherwise. src (int): Source rank. - group (ProcessGroup, optional): The process group to work on + group (ProcessGroup, optional): The process group to work on. If None, + the default process group will be used. async_op (bool, optional): Whether this op should be an async op Returns: @@ -979,8 +978,8 @@ def broadcast(tensor, opts.rootRank = src opts.rootTensor = 0 - if group == GroupMember.WORLD: - default_pg = _check_default_pg() + if group is None or group is GroupMember.WORLD: + default_pg = _get_default_group() work = default_pg.broadcast([tensor], opts) else: group_src_rank = _get_group_rank(group, src) @@ -994,7 +993,7 @@ def broadcast(tensor, def all_reduce_multigpu(tensor_list, op=ReduceOp.SUM, - group=group.WORLD, + group=None, async_op=False): r""" Reduces the tensor data across all machines in such a way that all get @@ -1020,7 +1019,8 @@ def all_reduce_multigpu(tensor_list, op (optional): One of the values from ``torch.distributed.ReduceOp`` enum. Specifies an operation used for element-wise reductions. - group (ProcessGroup, optional): The process group to work on + group (ProcessGroup, optional): The process group to work on. If None, + the default process group will be used. async_op (bool, optional): Whether this op should be an async op Returns: @@ -1035,8 +1035,8 @@ def all_reduce_multigpu(tensor_list, opts = AllreduceOptions() opts.reduceOp = op - if group == GroupMember.WORLD: - default_pg = _check_default_pg() + if group is None: + default_pg = _get_default_group() work = default_pg.allreduce(tensor_list, opts) else: work = group.allreduce(tensor_list, opts) @@ -1049,7 +1049,7 @@ def all_reduce_multigpu(tensor_list, def all_reduce(tensor, op=ReduceOp.SUM, - group=group.WORLD, + group=None, async_op=False): """ Reduces the tensor data across all machines in such a way that all get @@ -1065,7 +1065,8 @@ def all_reduce(tensor, op (optional): One of the values from ``torch.distributed.ReduceOp`` enum. Specifies an operation used for element-wise reductions. - group (ProcessGroup, optional): The process group to work on + group (ProcessGroup, optional): The process group to work on. If None, + the default process group will be used. 
async_op (bool, optional): Whether this op should be an async op Returns: @@ -1107,8 +1108,8 @@ def all_reduce(tensor, opts = AllreduceOptions() opts.reduceOp = op - if group == GroupMember.WORLD: - default_pg = _check_default_pg() + if group is None: + default_pg = _get_default_group() work = default_pg.allreduce([tensor], opts) else: work = group.allreduce([tensor], opts) @@ -1121,7 +1122,7 @@ def all_reduce(tensor, def all_reduce_coalesced(tensors, op=ReduceOp.SUM, - group=group.WORLD, + group=None, async_op=False): """ WARNING: at this time individual shape checking is not implemented across nodes. @@ -1146,7 +1147,8 @@ def all_reduce_coalesced(tensors, op (Optional[ReduceOp]): One of the values from ``torch.distributed.ReduceOp`` enum. Specifies an operation used for element-wise reductions. - group (Optional[ProcessGroup]): The process group to work on. + group (ProcessGroup, optional): The process group to work on. If None, + the default process group will be used. async_op (Optional[bool]): Whether this op should be an async op. Returns: @@ -1165,8 +1167,8 @@ def all_reduce_coalesced(tensors, opts = AllreduceCoalescedOptions() opts.reduceOp = op - if group == GroupMember.WORLD: - default_pg = _check_default_pg() + if group is None: + default_pg = _get_default_group() work = default_pg.allreduce_coalesced(tensors, opts) else: work = group.allreduce_coalesced(tensors, opts) @@ -1180,7 +1182,7 @@ def all_reduce_coalesced(tensors, def reduce_multigpu(tensor_list, dst, op=ReduceOp.SUM, - group=group.WORLD, + group=None, async_op=False, dst_tensor=0): """ @@ -1202,7 +1204,8 @@ def reduce_multigpu(tensor_list, op (optional): One of the values from ``torch.distributed.ReduceOp`` enum. Specifies an operation used for element-wise reductions. - group (ProcessGroup, optional): The process group to work on + group (ProcessGroup, optional): The process group to work on. If None, + the default process group will be used. async_op (bool, optional): Whether this op should be an async op dst_tensor (int, optional): Destination tensor rank within ``tensor_list`` @@ -1220,8 +1223,8 @@ def reduce_multigpu(tensor_list, opts.rootRank = dst opts.rootTensor = dst_tensor - if group == GroupMember.WORLD: - default_pg = _check_default_pg() + if group is None or group is GroupMember.WORLD: + default_pg = _get_default_group() work = default_pg.reduce(tensor_list, opts) else: group_dst_rank = _get_group_rank(group, dst) @@ -1237,7 +1240,7 @@ def reduce_multigpu(tensor_list, def reduce(tensor, dst, op=ReduceOp.SUM, - group=group.WORLD, + group=None, async_op=False): """ Reduces the tensor data across all machines. @@ -1251,7 +1254,8 @@ def reduce(tensor, op (optional): One of the values from ``torch.distributed.ReduceOp`` enum. Specifies an operation used for element-wise reductions. - group (ProcessGroup, optional): The process group to work on + group (ProcessGroup, optional): The process group to work on. If None, + the default process group will be used. 
async_op (bool, optional): Whether this op should be an async op Returns: @@ -1267,8 +1271,8 @@ def reduce(tensor, opts.reduceOp = op opts.rootRank = dst - if group == GroupMember.WORLD: - default_pg = _check_default_pg() + if group is None or group is GroupMember.WORLD: + default_pg = _get_default_group() work = default_pg.reduce([tensor], opts) else: group_dst_rank = _get_group_rank(group, dst) @@ -1283,7 +1287,7 @@ def reduce(tensor, def all_gather_multigpu(output_tensor_lists, input_tensor_list, - group=group.WORLD, + group=None, async_op=False): """ Gathers tensors from the whole group in a list. @@ -1318,7 +1322,8 @@ def all_gather_multigpu(output_tensor_lists, Note that ``len(input_tensor_list)`` needs to be the same for all the distributed processes calling this function. - group (ProcessGroup, optional): The process group to work on + group (ProcessGroup, optional): The process group to work on. If None, + the default process group will be used. async_op (bool, optional): Whether this op should be an async op Returns: @@ -1332,8 +1337,8 @@ def all_gather_multigpu(output_tensor_lists, output_tensor_lists = [[t if not t.is_complex() else torch.view_as_real(t) for t in l] for l in output_tensor_lists] input_tensor_list = [t if not t.is_complex() else torch.view_as_real(t) for t in input_tensor_list] - if group == GroupMember.WORLD: - default_pg = _check_default_pg() + if group is None: + default_pg = _get_default_group() work = default_pg.allgather(output_tensor_lists, input_tensor_list) else: work = group.allgather(output_tensor_lists, input_tensor_list) @@ -1358,7 +1363,7 @@ def _tensor_to_object(tensor, tensor_size): return out -def all_gather_object(object_list, obj, group=group.WORLD): +def all_gather_object(object_list, obj, group=None): """ Gathers picklable objects from the whole group into a list. Similar to :func:`all_gather`, but Python objects can be passed in. Note that the object @@ -1427,7 +1432,7 @@ def all_gather_object(object_list, obj, group=group.WORLD): ] # Allgather tensor sizes all_gather(object_size_list, local_size, group=group) - max_object_size = int(max(object_size_list).item()) + max_object_size = int(max(object_size_list).item()) # type: ignore # Resize tensor to max size across all ranks. input_tensor.resize_(max_object_size) coalesced_output_tensor = torch.empty( @@ -1446,7 +1451,7 @@ def all_gather_object(object_list, obj, group=group.WORLD): object_list[i] = _tensor_to_object(tensor, tensor_size) -def gather_object(obj, object_gather_list=None, dst=0, group=group.WORLD): +def gather_object(obj, object_gather_list=None, dst=0, group=None): """ Gathers picklable objects from the whole group in a single process. Similar to :func:`gather`, but Python objects can be passed in. Note that the @@ -1518,7 +1523,7 @@ def gather_object(obj, object_gather_list=None, dst=0, group=group.WORLD): # gather, since each rank needs to broadcast a tensor of the same (maximal) # size. all_gather(object_size_list, local_size, group=group) - max_object_size = int(max(object_size_list).item()) + max_object_size = int(max(object_size_list).item()) # type: ignore # Resize tensor to max size across all ranks. input_tensor.resize_(max_object_size) # Avoid populating output tensors if the result won't be gathered on this rank. 
@@ -1546,7 +1551,7 @@ def gather_object(obj, object_gather_list=None, dst=0, group=group.WORLD): object_gather_list[i] = _tensor_to_object(tensor, tensor_size) -def broadcast_object_list(object_list, src, group=group.WORLD): +def broadcast_object_list(object_list, src, group=None): """ Broadcasts picklable objects in ``object_list`` to the whole group. Similar to :func:`broadcast`, but Python objects can be passed in. @@ -1739,7 +1744,7 @@ def scatter_object_list( def all_gather(tensor_list, tensor, - group=group.WORLD, + group=None, async_op=False): """ Gathers tensors from the whole group in a list. @@ -1750,7 +1755,8 @@ def all_gather(tensor_list, tensor_list (list[Tensor]): Output list. It should contain correctly-sized tensors to be used for output of the collective. tensor (Tensor): Tensor to be broadcast from current process. - group (ProcessGroup, optional): The process group to work on + group (ProcessGroup, optional): The process group to work on. If None, + the default process group will be used. async_op (bool, optional): Whether this op should be an async op Returns: @@ -1795,8 +1801,8 @@ def all_gather(tensor_list, tensor_list = [t if not t.is_complex() else torch.view_as_real(t) for t in tensor_list] tensor = tensor if not tensor.is_complex() else torch.view_as_real(tensor) - if group == GroupMember.WORLD: - default_pg = _check_default_pg() + if group is None: + default_pg = _get_default_group() work = default_pg.allgather([tensor_list], [tensor]) else: work = group.allgather([tensor_list], [tensor]) @@ -1808,7 +1814,7 @@ def all_gather(tensor_list, def all_gather_coalesced(output_tensor_lists, input_tensor_list, - group=group.WORLD, + group=None, async_op=False): """ Gathers input tensors from the whole group in a list in a coalesced manner. @@ -1820,7 +1826,8 @@ def all_gather_coalesced(output_tensor_lists, correctly-sized tensors to be used for output of the collective. input_tensor_list (list[Tensor]): Tensors to be broadcast from current process. At least one tensor has to be non empty. - group (ProcessGroup, optional): The process group to work on + group (ProcessGroup, optional): The process group to work on. If None, + the default process group will be used. async_op (bool, optional): Whether this op should be an async op. Returns: @@ -1866,8 +1873,8 @@ def all_gather_coalesced(output_tensor_lists, output_tensor_lists = [[t if not t.is_complex() else torch.view_as_real(t) for t in l] for l in output_tensor_lists] input_tensor_list = [t if not t.is_complex() else torch.view_as_real(t) for t in input_tensor_list] - if group == GroupMember.WORLD: - default_pg = _check_default_pg() + if group is None: + default_pg = _get_default_group() work = default_pg.allgather_coalesced( output_tensor_lists, input_tensor_list) else: @@ -1894,7 +1901,7 @@ def _validate_output_list_for_rank(my_rank, dst, gather_list): def gather(tensor, gather_list=None, dst=0, - group=group.WORLD, + group=None, async_op=False): """ Gathers a list of tensors in a single process. @@ -1905,7 +1912,8 @@ def gather(tensor, tensors to use for gathered data (default is None, must be specified on the destination rank) dst (int, optional): Destination rank (default is 0) - group (ProcessGroup, optional): The process group to work on + group (ProcessGroup, optional): The process group to work on. If None, + the default process group will be used. 
async_op (bool, optional): Whether this op should be an async op Returns: @@ -1932,8 +1940,8 @@ def gather(tensor, opts = GatherOptions() opts.rootRank = dst - if group == GroupMember.WORLD: - default_pg = _check_default_pg() + if group is None or group is GroupMember.WORLD: + default_pg = _get_default_group() work = default_pg.gather(output_tensors, input_tensors, opts) else: group_dst_rank = _get_group_rank(group, dst) @@ -1949,7 +1957,7 @@ def gather(tensor, def scatter(tensor, scatter_list=None, src=0, - group=group.WORLD, + group=None, async_op=False): """ Scatters a list of tensors to all processes in a group. @@ -1962,7 +1970,8 @@ def scatter(tensor, scatter_list (list[Tensor]): List of tensors to scatter (default is None, must be specified on the source rank) src (int): Source rank (default is 0) - group (ProcessGroup, optional): The process group to work on + group (ProcessGroup, optional): The process group to work on. If None, + the default process group will be used. async_op (bool, optional): Whether this op should be an async op Returns: @@ -1998,8 +2007,8 @@ def scatter(tensor, opts = ScatterOptions() opts.rootRank = src - if group == GroupMember.WORLD: - default_pg = _check_default_pg() + if group is None or group is GroupMember.WORLD: + default_pg = _get_default_group() work = default_pg.scatter(output_tensors, input_tensors, opts) else: group_src_rank = _get_group_rank(group, src) @@ -2015,7 +2024,7 @@ def scatter(tensor, def reduce_scatter_multigpu(output_tensor_list, input_tensor_lists, op=ReduceOp.SUM, - group=group.WORLD, + group=None, async_op=False): """ Reduce and scatter a list of tensors to the whole group. Only nccl backend @@ -2049,7 +2058,8 @@ def reduce_scatter_multigpu(output_tensor_list, therefore ``len(input_tensor_lists[i])``) need to be the same for all the distributed processes calling this function. - group (ProcessGroup, optional): The process group to work on. + group (ProcessGroup, optional): The process group to work on. If None, + the default process group will be used. async_op (bool, optional): Whether this op should be an async op. Returns: @@ -2063,8 +2073,8 @@ def reduce_scatter_multigpu(output_tensor_list, opts = ReduceScatterOptions() opts.reduceOp = op - if group == GroupMember.WORLD: - default_pg = _check_default_pg() + if group is None: + default_pg = _get_default_group() work = default_pg.reduce_scatter( output_tensor_list, input_tensor_lists, @@ -2086,7 +2096,7 @@ def reduce_scatter_multigpu(output_tensor_list, def reduce_scatter(output, input_list, op=ReduceOp.SUM, - group=group.WORLD, + group=None, async_op=False): """ Reduces, then scatters a list of tensors to all processes in a group. @@ -2094,7 +2104,8 @@ def reduce_scatter(output, Arguments: output (Tensor): Output tensor. input_list (list[Tensor]): List of tensors to reduce and scatter. - group (ProcessGroup, optional): The process group to work on. + group (ProcessGroup, optional): The process group to work on. If None, + the default process group will be used. async_op (bool, optional): Whether this op should be an async op. 
Returns: @@ -2110,8 +2121,8 @@ def reduce_scatter(output, opts = ReduceScatterOptions() opts.reduceOp = op - if group == GroupMember.WORLD: - default_pg = _check_default_pg() + if group is None: + default_pg = _get_default_group() work = default_pg.reduce_scatter([output], [input_list], opts) else: work = group.reduce_scatter([output], [input_list], opts) @@ -2126,7 +2137,7 @@ def all_to_all_single(output, input, output_split_sizes=None, input_split_sizes=None, - group=group.WORLD, + group=None, async_op=False): """ Each process splits input tensor and then scatters the split list @@ -2142,7 +2153,8 @@ def all_to_all_single(output, input_split_sizes: (list[Int], optional): Input split sizes for dim 0 if specified None or empty, dim 0 of ``input`` tensor must divide equally by ``world_size``. - group (ProcessGroup, optional): The process group to work on. + group (ProcessGroup, optional): The process group to work on. If None, + the default process group will be used. async_op (bool, optional): Whether this op should be an async op. Returns: @@ -2206,8 +2218,8 @@ def all_to_all_single(output, output_split_sizes = [] if output_split_sizes is None else output_split_sizes input_split_sizes = [] if input_split_sizes is None else input_split_sizes - if group == GroupMember.WORLD: - default_pg = _check_default_pg() + if group is None: + default_pg = _get_default_group() work = default_pg.alltoall_base(output, input, output_split_sizes, input_split_sizes, opts) else: work = group.alltoall_base(output, input, output_split_sizes, input_split_sizes, opts) @@ -2219,7 +2231,7 @@ def all_to_all_single(output, def all_to_all(output_tensor_list, input_tensor_list, - group=group.WORLD, + group=None, async_op=False): """ Each process scatters list of input tensors to all processes in a group and @@ -2229,7 +2241,8 @@ def all_to_all(output_tensor_list, output_tensor_list (list[Tensor]): List of tensors to be gathered one per rank. input_tensor_list (list[Tensor]): List of tensors to scatter one per rank. - group (ProcessGroup, optional): The process group to work on. + group (ProcessGroup, optional): The process group to work on. If None, + the default process group will be used. async_op (bool, optional): Whether this op should be an async op. Returns: @@ -2297,8 +2310,8 @@ def all_to_all(output_tensor_list, _check_tensor_list(output_tensor_list, "output_tensor_list") _check_tensor_list(input_tensor_list, "input_tensor_list") - if group == GroupMember.WORLD: - default_pg = _check_default_pg() + if group is None: + default_pg = _get_default_group() work = default_pg.alltoall(output_tensor_list, input_tensor_list, opts) else: work = group.alltoall(output_tensor_list, input_tensor_list, opts) @@ -2309,7 +2322,7 @@ def all_to_all(output_tensor_list, work.wait() -def barrier(group=group.WORLD, +def barrier(group=GroupMember.WORLD, async_op=False): """ Synchronizes all processes. @@ -2318,7 +2331,8 @@ def barrier(group=group.WORLD, if async_op is False, or if async work handle is called on wait(). Arguments: - group (ProcessGroup, optional): The process group to work on + group (ProcessGroup, optional): The process group to work on. If None, + the default process group will be used. 
async_op (bool, optional): Whether this op should be an async op Returns: @@ -2328,8 +2342,8 @@ def barrier(group=group.WORLD, if _rank_not_in_group(group): return - if group == GroupMember.WORLD: - default_pg = _check_default_pg() + if group is None: + default_pg = _get_default_group() work = default_pg.barrier() else: work = group.barrier() @@ -2380,7 +2394,7 @@ def new_group(ranks=None, timeout=default_pg_timeout, backend=None): global _pg_group_ranks - default_pg = _check_default_pg() + default_pg = _get_default_group() default_backend, default_store = _pg_map[default_pg] global_rank = default_pg.rank() global_world_size = default_pg.size() From 38ed398580208a79e92368096d87674793c0cd4b Mon Sep 17 00:00:00 2001 From: Jordan Fix Date: Sun, 13 Dec 2020 18:04:13 -0800 Subject: [PATCH 215/250] [fx] Add constant folding pass (#48443) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48443 Add a constant folding pass in FX: - Iterate over an input graph and tag what nodes are fully constant, i.e. either `get_attr` nodes, or nodes with all inputs that are either `get_attr` or constant - Use `model_transform.split_by_tags()` to split the graph into two - Look for the `output` node in the constant graph to get names of attrs that will be folded - Iterate over the non-constant graph and replace placeholders that are using the same name as the attrs with a `get_attr` as well as a dummy attr on the module - Return these two graphs in a new `FoldedGraphModule`, which is a normal GraphModule but also stores the constant graph on the side along with a `run_folding()` method that will run const folding and update the dummy parameters with the actual folded parameters Test Plan: Added a couple tests Reviewed By: 842974287 Differential Revision: D25033996 fbshipit-source-id: 589c036751ea91bb8155d9be98af7dbc0552ea19 --- test/fx/test_fx_const_fold.py | 274 ++++++++++++++++++++++++++++ torch/fx/experimental/const_fold.py | 269 +++++++++++++++++++++++++++ torch/fx/graph.py | 2 +- 3 files changed, 544 insertions(+), 1 deletion(-) create mode 100644 test/fx/test_fx_const_fold.py create mode 100644 torch/fx/experimental/const_fold.py diff --git a/test/fx/test_fx_const_fold.py b/test/fx/test_fx_const_fold.py new file mode 100644 index 000000000000..db06663285da --- /dev/null +++ b/test/fx/test_fx_const_fold.py @@ -0,0 +1,274 @@ +import unittest + +import torch +from torch.fx.experimental import const_fold + + +class TestConstFold(unittest.TestCase): + def _verify_const_fold_mod(self, mod_folded: const_fold.FoldedGraphModule): + self.assertTrue(mod_folded.const_subgraph_module is not None) + + # Check that the constants are attributes in the main subgraph. + num_folded_attrs = 0 + for node in mod_folded.graph.nodes: + if node.op == "get_attr" and (node.target in mod_folded.const_output_names): + num_folded_attrs += 1 + self.assertEqual(num_folded_attrs, len(mod_folded.const_output_names)) + + def test_const_fold_basic_one_attr_no_name_collision(self): + r""" + Perform constant folding conversion, from original mod to split constant folding + module with two split subgraphs, where there's a single attr to fold and + a single output attr result to replace. 
+ + attr1 attr1 + | | | | + x add add + \ / | + sub y output (becomes attr add_1) + \ / ==> -------+------- (const/base subgraph split) + mul attr2 x / (input from previous subgraph + \ / \ / is attr) + add sub y + | \ / + output mul attr2 + \ / + add + | + output + """ + + class ConstFoldTestModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.attr_1 = torch.nn.Parameter(torch.tensor([[-0.9]])) + self.attr_2 = torch.nn.Parameter(torch.tensor([[17.1]])) + + def forward(self, x, y): + a = self.attr_1 + self.attr_1 + x = x - a + return x * y + self.attr_2 + + mod = ConstFoldTestModule() + mod_folded: const_fold.FoldedGraphModule = const_fold.split_const_subgraphs(mod) + self._verify_const_fold_mod(mod_folded) + + # Now run both folded and non-folded to check results equal. + in_x, in_y = torch.tensor([[-0.45]]), torch.tensor([0.9]) + base_result = mod(in_x, in_y) + fold_result = mod_folded(in_x, in_y) + self.assertTrue(torch.equal(fold_result, base_result)) + + def test_const_fold_basic_one_attr_name_collision(self): + r""" + Perform constant folding conversion, from original mod to split constant folding + module with two split subgraphs, where there's a single attr to fold and + a single output attr result to replace. Name the attrs such that they will + collide by name with folded attrs. + + add_1 add_1 + | | | | + x add add + \ / | + sub y output (becomes attr add_1) + \ / ==> -------+------- (const/base subgraph split) + mul add_2 x / (input from previous subgraph + \ / \ / is attr) + add sub y + | \ / + output mul add_2 + \ / + add + | + output + """ + + class ConstFoldTestModule(torch.nn.Module): + def __init__(self): + super().__init__() + # Note: Named as such to result in name collision. + self.add_1__CF = torch.nn.Parameter(torch.tensor([[1.0]])) + self.add_2__CF = torch.nn.Parameter(torch.tensor([[17.1]])) + + def forward(self, x, y): + a = self.add_1__CF + self.add_1__CF + x = x - a + return x * y + self.add_2__CF + + mod = ConstFoldTestModule() + mod_folded: const_fold.FoldedGraphModule = const_fold.split_const_subgraphs(mod) + self._verify_const_fold_mod(mod_folded) + + # Now run both folded and non-folded to check results equal. + in_x, in_y = torch.tensor([[5.0]]), torch.tensor([4.0]) + base_result = mod(in_x, in_y) + fold_result = mod_folded(in_x, in_y) + self.assertTrue(torch.equal(fold_result, base_result)) + + def test_const_fold_noop(self): + r""" + Check that a graph with no constant folding is handled correctly. + + x attr1 + \ / + sub + | + output + """ + + class ConstFoldTestModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.attr1 = torch.nn.Parameter(torch.tensor([[-0.9]])) + + def forward(self, x): + return x - self.attr1 + + mod = ConstFoldTestModule() + mod_folded: const_fold.FoldedGraphModule = const_fold.split_const_subgraphs(mod) + + # Check that the folded graph module is None, since there was no folding to do. + self.assertTrue(mod_folded.const_subgraph_module is None) + + # Now run both folded and non-folded to check results equal. + in_x = torch.tensor([[-0.45]]) + base_result = mod(in_x) + fold_result = mod_folded(in_x) + self.assertTrue(torch.equal(fold_result, base_result)) + + def test_const_fold_basic_two_attr_three_input(self): + r""" + Perform constant folding conversion, from original mod to split constant + folding module with two split subgraphs, where there are two attrs to + fold into a single output, and there are three placeholder inputs. 
+ + attr1 attr2 attr1 attr2 + \ / \ / + x add add + \ / | + sub y output (becomes attr add_1) + \ / ==> -------+------- (const/base subgraph split) + mul z x / (input from previous subgraph + \ / \ / is attr) + div sub y + | \ / + output mul z + \ / + div + | + output + """ + + class ConstFoldTestModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.attr1 = torch.nn.Parameter(torch.tensor([[-0.9]])) + self.attr1 = torch.nn.Parameter(torch.tensor([[1.32]])) + + def forward(self, x, y, z): + a = self.attr1 + self.attr1 + sub = x - a + mul = sub * y + return mul / z + + mod = ConstFoldTestModule() + mod_folded: const_fold.FoldedGraphModule = const_fold.split_const_subgraphs(mod) + self._verify_const_fold_mod(mod_folded) + + # Now run both folded and non-folded to check results equal. + in_x, in_y, in_z = ( + torch.tensor([[-0.45]]), + torch.tensor([0.9]), + torch.tensor([1.1]), + ) + base_result = mod(in_x, in_y, in_z) + fold_result = mod_folded(in_x, in_y, in_z) + self.assertTrue(torch.equal(fold_result, base_result)) + + def test_const_fold_basic_two_attr(self): + r""" + Perform constant folding conversion, from original mod to split constant + folding module with two split subgraphs, where there are two attrs to + fold into a single output. + + attr1 attr2 attr1 attr2 + \ / \ / + x add add (becomes attr add_1) + \ / ==> -------+------- (const/base subgraph split) + sub x | (input from previous subgraph is attr) + | \ / + output sub + | + output + """ + + class ConstFoldTestModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.attr1 = torch.nn.Parameter(torch.randn(2, 3)) + self.attr2 = torch.nn.Parameter(torch.randn(2, 3)) + + def forward(self, x): + y = self.attr1 + self.attr2 + return x + y + + mod = ConstFoldTestModule() + mod_folded: const_fold.FoldedGraphModule = const_fold.split_const_subgraphs(mod) + self._verify_const_fold_mod(mod_folded) + + # Now run both folded and non-folded to check results equal. + in_x = torch.randn(2, 3) + fold_result = mod_folded(in_x) + base_result = mod(in_x) + self.assertTrue(torch.equal(fold_result, base_result)) + + def test_const_fold_multi_const_folded_attrs(self): + r""" + Perform constant folding conversion, from original mod to split constant + folding module with two split subgraphs, where there are two attrs to + fold into two new attrs. + + attr1 attr2 attr1 attr2 + / \ | / \ | + permute | sum permute | sum + \ / / \ / | + x add y / add | + \ / \ / | | + sub add output output (become attrs add_1 and mul_1) + \ / ==> --------+-------+------ (const/base subgraph split) + \ / x | y | (inputs from previous subgraph + add \ / \ / are attrs) + | sub add + linear \ / + | add + sigmoid | + | linear + output | + sigmoid + | + output + """ + + class ConstFoldTestModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.attr1 = torch.nn.Parameter(torch.randn(4, 4)) + self.attr2 = torch.nn.Parameter(torch.randn(4, 4)) + self.lin = torch.nn.Linear(4, 4) + + def forward(self, x, y): + a = self.attr1 + self.attr1.permute(1, 0) + x = x - a + amax = torch.sum(self.attr2, dim=1) + y = y + amax + return torch.sigmoid(self.lin(x + y)) + + mod = ConstFoldTestModule() + mod_folded: const_fold.FoldedGraphModule = const_fold.split_const_subgraphs(mod) + self._verify_const_fold_mod(mod_folded) + + # Now run both folded and non-folded to check results equal. 
+ in_x, in_y = torch.randn(4, 4), torch.randn(4) + fold_result = mod_folded(in_x, in_y) + base_result = mod(in_x, in_y) + self.assertTrue(torch.equal(fold_result, base_result)) diff --git a/torch/fx/experimental/const_fold.py b/torch/fx/experimental/const_fold.py new file mode 100644 index 000000000000..4b5ea52c5c1f --- /dev/null +++ b/torch/fx/experimental/const_fold.py @@ -0,0 +1,269 @@ +import operator +from typing import Dict, Set, List, Optional + +import torch.fx +from torch.fx.experimental.subgraph_creation_example import split_module +import re + + +def _make_tuple(x): + """ + Helper to convert x into a one item tuple if it's not a tuple already. + """ + return x if isinstance(x, tuple) else (x,) + + +class FoldedGraphModule(torch.fx.GraphModule): + """ + FoldedGraphModule is a GraphModule which also contains another + `const_subgraph_module` representing a subgraph which has all const attr + inputs and which can be run once before running the main standard + `graph`. The `const_output_names` are the ordered list names of attrs which + represent what each respective output from the const_subgraph should be set + on which attrs. + """ + + def __init__( + self, + root: torch.nn.Module, + graph: torch.fx.Graph, + const_subgraph: Optional[torch.fx.Graph] = None, + const_output_names: Optional[List[str]] = None, + ): + super().__init__(root, graph) + self.const_subgraph_module = ( + None + if const_subgraph is None + else torch.fx.GraphModule(root, const_subgraph) + ) + self.const_output_names = const_output_names + self.has_folding_been_run = False + + def __call__(self, *args, **kwargs): + if not self.has_folding_been_run: + self.run_folding() + return super().__call__(*args) + + def run_folding(self): + # If there's no const subgraph module or attr output names to use, return + # early as there is no const folding to perform. + if self.const_subgraph_module is None or self.const_output_names is None: + return + + assert not self.has_folding_been_run + self.has_folding_been_run = True + + # Actually run const folding subgraph. We _make_tuple here because + # single attr const fold subgraphs output a single Tensor while + # multiple outputs are returned as Tuple[Tensor,]. + folded_attrs = _make_tuple(self.const_subgraph_module()) + + # Look for output node from const folding subgraph and set attrs on the + # module with the results. + for i in range(len(folded_attrs)): + setattr( + self, self.const_output_names[i], torch.nn.Parameter(folded_attrs[i]) + ) + + +def split_const_subgraphs( + module: torch.nn.Module, +) -> FoldedGraphModule: + """ + Looks through `module` for any nodes that have all constant attribute inputs + and separates them out into their own constant subgraph, and returns a + FoldedGraphModule which runs that constant subgraph on the first run to set + attributes on the module prior to running the non-constant portion of the + graph. + """ + mod_traced = torch.fx.symbolic_trace(module) + + # Build up a list of const_nodes, defined as nodes that are themselves + # get_attrs, or have all get_attr or other constant node inputs. + const_nodes: Set[torch.fx.Node] = set() + found_const_folding = False + for node in mod_traced.graph.nodes: + # Skip over placeholders/outputs because they can't be const folded and + # we don't want to add tags to them. + if node.op in {"placeholder", "output"}: + continue + + # If the node itself is constant, or all of its inputs are constant, + # then tag it as constant. 
+ if node.op == "get_attr" or set(node.all_input_nodes).issubset(const_nodes): + const_nodes.add(node) + if node.op != "get_attr": + found_const_folding = True + + # If we did not find any const folding then return early without a const fold subgraph. + if not found_const_folding: + return FoldedGraphModule(mod_traced, mod_traced.graph) + + # Partition the module into two: submod_0 for constant folding subgraph, and + # submod_1 for the rest. + def mod_partition(node: torch.fx.Node): + return 0 if node in const_nodes else 1 + + split = split_module(mod_traced, module, mod_partition) + + # Gather all names that are output from the const folding subgraph, which we + # will need to set dummy params on the module. + const_output_names: List[str] = [] + for node in split.submod_0.graph.nodes: + if node.op == "output": + # Note: we _make_tuple here because the output Node either contains + # a single output Node, or Tuple[Node], so this simplifies things. + const_output_names = [o.name for o in _make_tuple(node.args[0])] + break + + # Make sure the attr name we want to use is uniquely named in the module. + for i in range(len(const_output_names)): + # Add a suffix to make it easier to tell these were the result of const folding. + name = const_output_names[i] + "__CF" + # Delete all characters that are illegal in a Python identifier. + name = re.sub("[^0-9a-zA-Z_]+", "_", name) + if name[0].isdigit(): + name = f"_{name}" + # Now make sure it is in fact unique to the module by incrementing suffix value. + while hasattr(mod_traced, name): + match = re.match(r"(.*)_(\d+)$", name) + if match is None: + name = name + "_1" + else: + base, num = match.group(1, 2) + name = f"{base}_{int(num) + 1}" + const_output_names[i] = name + + # Now track the const_output_names to what name is used in the parent graph + # from the split via call_function getitem, to see what order it is passed + # into the non-const subgraph submod_1. First look to the parent module + # containing/calling into the const/non-const submodules to determine what + # the inputs are to each. Note if submod_0 had a single output then there is + # no getitem, and we can simply use the output from the call to submoid_0. + call_submod_0_args, call_submod_1_args = None, None + orig_ph_targets: List[str] = [] + for node in split.graph.nodes: + if node.op == "placeholder": + orig_ph_targets.append(node.target) + + if node.op == "call_module": + if node.target == "submod_0": + call_submod_0_args = node.args + continue + elif node.target == "submod_1": + call_submod_1_args = node.args + continue + assert call_submod_0_args is not None and call_submod_1_args is not None + + # Look through the args for the call into submod_1, and find the args that + # come from submod_0. Also look for get_attrs fed directly from the parent + # split into submod_1, i.e. those attrs that are not constant folded. + submod_1_input_idx_to_folded_attr_name: Dict[int, str] = {} + submod_1_input_idx_to_unfolded_attr_name: Dict[int, str] = {} + for i, node in enumerate(call_submod_1_args): + const_output_name = None + # If we only had a single output from submod_0 then we simply look for + # the call_module into it. + if len(const_output_names) == 1: + if node.op == "call_module" and node.target == "submod_0": + const_output_name = const_output_names[0] + + # Else we had multiple outputs from submod_0, so we need to look for all + # getitems from the call to it. 
+ else: + if ( + node.op == "call_function" + and node.target == operator.__getitem__ + and node.args[0].target == "submod_0" + ): + const_output_name = const_output_names[node.args[1]] + + # Now map from the index of the constant into calling submod_1 and map + # to the constant output name, which we use for swapping in getattrs + # instead of placeholders in submod_1. + if const_output_name is not None: + submod_1_input_idx_to_folded_attr_name[i] = const_output_name + elif node.op == "get_attr": + submod_1_input_idx_to_unfolded_attr_name[i] = node.target + + assert len(submod_1_input_idx_to_folded_attr_name) == len(const_output_names) + + # Now we have a mapping from const output names to the index they are passed + # into submod_1, so swap in getattrs for placeholders. + ph_idx = 0 + for node in split.submod_1.graph.nodes: + if node.op != "placeholder": + continue + is_folded_attr = ph_idx in submod_1_input_idx_to_folded_attr_name.keys() + is_unfolded_attr = ph_idx in submod_1_input_idx_to_unfolded_attr_name.keys() + if not is_folded_attr and not is_unfolded_attr: + ph_idx += 1 + continue + + const_output_name = ( + submod_1_input_idx_to_folded_attr_name[ph_idx] + if is_folded_attr + else submod_1_input_idx_to_unfolded_attr_name[ph_idx] + ) + if is_folded_attr: + assert not hasattr(mod_traced, const_output_name) + # Use a dummy param, which will be overwritten when we run const folding. + setattr( + mod_traced, + const_output_name, + torch.nn.Parameter(torch.randn(1)), + ) + with split.submod_1.graph.inserting_before(node): + node.replace_all_uses_with(split.submod_1.graph.get_attr(const_output_name)) + split.submod_1.graph.erase_node(node) + ph_idx += 1 + + # We may need to reorder placeholders to ensure they have the same order as + # they do in the original split. + ph_idx = 0 + node = next(iter(split.submod_1.graph.nodes)) + while node.op != "root": + if node.op != "placeholder": + node = node.next + continue + + curr_orig_ph_target = orig_ph_targets[ph_idx] + ph_idx += 1 + # If this ph is in the correct position, nothing to do. + if curr_orig_ph_target == node.target: + node = node.next + continue + + # This ph is not in the correct order, so search the rest of the graph + # for the ph we expected and prepend it before the current ph. + later_node = node.next + while later_node.op != "root": + if ( + later_node.op == "placeholder" + and curr_orig_ph_target == later_node.target + ): + break + later_node = later_node.next + assert later_node.op != "root" + node.prepend(later_node) + # Note we do not increment node here, as it still may be in the wrong + # place (we just prepended the ph that should have come before it). + + # split_module currently does not use get_attrs for attrs. Instead it passes + # them in as args from the parent module, which used get_attrs. Here we set + # them as get_attrs inside submod_0, allowing for running folding without + # somehow a priori knowing the attrs that should be passed as args. We can + # unconditionally do this for all placeholders because we know all + # placeholders to submod_0 must be constants accessible via get_attr. 
+ for node in split.submod_0.graph.nodes: + if node.op != "placeholder": + continue + in_node = next(n for n in call_submod_0_args if n.name == node.target) + assert in_node.op == "get_attr" + with split.submod_0.graph.inserting_before(node): + node.replace_all_uses_with(split.submod_0.graph.get_attr(in_node.target)) + split.submod_0.graph.erase_node(node) + + return FoldedGraphModule( + mod_traced, split.submod_1.graph, split.submod_0.graph, const_output_names + ) diff --git a/torch/fx/graph.py b/torch/fx/graph.py index f8bc96b73c40..e6fc19a1394e 100644 --- a/torch/fx/graph.py +++ b/torch/fx/graph.py @@ -549,7 +549,7 @@ def illegal_shadowing_name(name : str) -> bool: _shadows_builtin_name(name) while candidate in self._used_names or illegal_shadowing_name(candidate): - match = re.match(r"(.*)_(\d+)", candidate) + match = re.match(r"(.*)_(\d+)$", candidate) if match is None: candidate = candidate + '_1' else: From fdadfb6e5de2c1d9494c0ed64b0d863e5129b727 Mon Sep 17 00:00:00 2001 From: Kurt Mohler Date: Sun, 13 Dec 2020 19:48:56 -0800 Subject: [PATCH 216/250] Fix formatting error in `set_deterministic` documentation (#49136) Summary: Fixes formatting error that was preventing a bulleted list from being displayed properly Pull Request resolved: https://github.com/pytorch/pytorch/pull/49136 Reviewed By: zou3519 Differential Revision: D25493130 Pulled By: mruberry fbshipit-source-id: 7fc21e0e2cfa9465a60d2d43b805164316375f01 --- torch/__init__.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/torch/__init__.py b/torch/__init__.py index 403c192b47e9..30c328c1da6f 100644 --- a/torch/__init__.py +++ b/torch/__init__.py @@ -359,11 +359,13 @@ def set_deterministic(d): * :class:`torch.nn.FractionalMaxPool2d` when called on a CUDA tensor that requires grad * :class:`torch.nn.FractionalMaxPool3d` when called on a CUDA tensor that requires grad * :func:`torch.nn.functional.interpolate` when called on a CUDA tensor that requires grad - and one of the following modes is used: - - `linear` - - `bilinear` - - `bicubic` - - `trilinear` + and one of the following modes is used: + + - `linear` + - `bilinear` + - `bicubic` + - `trilinear` + * :class:`torch.nn.ReflectionPad1d` when called on a CUDA tensor that requires grad * :class:`torch.nn.ReflectionPad2d` when called on a CUDA tensor that requires grad * :class:`torch.nn.ReplicationPad1d` when called on a CUDA tensor that requires grad From 94a3d4b083c945050a93fefcdd20d7d336185f2d Mon Sep 17 00:00:00 2001 From: Peter Bell Date: Sun, 13 Dec 2020 20:27:06 -0800 Subject: [PATCH 217/250] Remove unused operator at::_fft_with_size (#48905) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48905 Test Plan: Imported from OSS Reviewed By: ngimel Differential Revision: D25480385 Pulled By: mruberry fbshipit-source-id: 192d04a1b7e33b4e408cda8a82679c3ae3490a7d --- aten/src/ATen/native/SpectralOps.cpp | 169 ------------------ aten/src/ATen/native/cuda/SpectralOps.cu | 107 ----------- aten/src/ATen/native/mkl/SpectralOps.cpp | 99 ---------- aten/src/ATen/native/native_functions.yaml | 11 -- .../check_backward_compatibility.py | 1 + tools/autograd/derivatives.yaml | 3 - torch/csrc/autograd/FunctionsManual.cpp | 88 --------- 7 files changed, 1 insertion(+), 477 deletions(-) diff --git a/aten/src/ATen/native/SpectralOps.cpp b/aten/src/ATen/native/SpectralOps.cpp index 65d67629fa9f..4ae2ee326b88 100644 --- a/aten/src/ATen/native/SpectralOps.cpp +++ b/aten/src/ATen/native/SpectralOps.cpp @@ -19,12 +19,6 @@ namespace at { 
namespace native { -// Common code for all FFT functions -static inline Tensor _fft( - const Tensor &self, int64_t signal_ndim, bool complex_input, - const bool complex_output, bool inverse, IntArrayRef signal_sizes, - fft_norm_mode normalization, bool onesided); - namespace { // Promote inputs to FFT functions @@ -416,139 +410,6 @@ Tensor fft_ifftshift(const Tensor& x, c10::optional dim_opt) { } -// This is a pass-through wrapper function that does the size check and -// inferences. The actual forward implementation function is called -// at::_fft_with_size which dispatches to _fft_cufft (CUDA) or _fft_mkl (CPU). -static inline Tensor _fft(const Tensor &self, const int64_t signal_ndim, - const bool complex_input, const bool complex_output, - const bool inverse, IntArrayRef signal_sizes, - const fft_norm_mode normalization, const bool onesided) { - - TORCH_CHECK(signal_ndim >= 1 && signal_ndim <= 3, - "Expected signal_ndim to be 1, 2, or 3, but got signal_ndim=", - signal_ndim); - TORCH_CHECK(at::isFloatingType(self.scalar_type()), - "Expected an input tensor of floating types, but got input=", - self.toString(), self.sizes()); - - auto signal_tensor_ndim = signal_ndim + static_cast(complex_input); // add complex dim - if (self.dim() < signal_tensor_ndim) { - std::ostringstream ss; - ss << "Given signal_ndim=" << signal_ndim << ", expected an input tensor " - << "of at least " << signal_tensor_ndim << "D"; - if (complex_input) { - ss << " (complex input adds an extra dimension)"; - } - ss << ", but got input=" << self.toString() << self.sizes(); - AT_ERROR(ss.str()); - } - - auto self_shape = self.sizes(); - auto batch_ndim = self.dim() - signal_tensor_ndim; - - Tensor input = self; - // flatten the batch dims - if (batch_ndim == 0) { - // slightly faster path for non-batch mode - input = input.unsqueeze(0); - } else if (batch_ndim > 1) { - std::vector flatten_input_shape(signal_tensor_ndim + 1); - std::copy(self_shape.begin() + batch_ndim, self_shape.end(), flatten_input_shape.begin() + 1); - flatten_input_shape[0] = -1; - input = input.reshape(flatten_input_shape); - - } - - // now we assume that input is batched as [ B x signal_dims... ] - - if (complex_input) { - TORCH_CHECK(input.size(signal_ndim + 1) == 2, - "Expected an input tensor with a last dimension of size 2 " - "representing real + imaginary components, but got input ", - self.toString(), self.sizes()); - } - - // build signal_sizes and output_size - TORCH_CHECK(signal_sizes.size() == 0 || static_cast(signal_sizes.size()) == signal_ndim, - "Expected signal_sizes to be empty (default) or of signal_ndim=", - signal_ndim, "D, but got signal_sizes=", signal_sizes); - std::vector output_sizes(signal_ndim + 1 + static_cast(complex_output)); - output_sizes[0] = input.size(0); // batch size - std::vector checked_signal_sizes(signal_ndim); - for (int64_t i = 0; i < signal_ndim; i++) { - int64_t input_size = input.size(i + 1); - if (i == signal_ndim - 1 && onesided && complex_input && !complex_output) { - // If last dim and complex-to-real onesided, input is only half of - // signal, and we need to infer basing on signal_sizes, if given - // See native/SpectralOpsUtils.h for detailed description. 
- int64_t inferred_size; - if (signal_sizes.size() > 0) { - inferred_size = infer_ft_complex_to_real_onesided_size(input_size, signal_sizes[i]); - } else { - inferred_size = infer_ft_complex_to_real_onesided_size(input_size); - } - checked_signal_sizes[i] = inferred_size; - output_sizes[i + 1] = inferred_size; - } else { - if (i == signal_ndim - 1 && onesided && !complex_input && complex_output) { - // if last dim and real-to-complex onesided, output should be only - // half of the signal, and we need to infer using input_size - output_sizes[i + 1] = infer_ft_real_to_complex_onesided_size(input_size); - } else { - output_sizes[i + 1] = input_size; - } - checked_signal_sizes[i] = input_size; - TORCH_CHECK(signal_sizes.size() == 0 || signal_sizes[i] == checked_signal_sizes[i], - "Expected given signal_sizes=", signal_sizes," to have same " - "shape with input at signal dimension ", i, ", but got " - "signal_sizes=", signal_sizes, " and input=", self.toString(), - self.sizes()); - } - } - if (complex_output) { - output_sizes[signal_ndim + 1] = 2; - } - - Tensor output = at::_fft_with_size(input, signal_ndim, complex_input, - complex_output, inverse, - checked_signal_sizes, - static_cast(normalization), - onesided, - output_sizes); - - // unflatten the batch dims - if (batch_ndim == 0) { - // slightly faster path for non-batch mode - output = output.squeeze(0); - } else if (batch_ndim > 1) { - auto output_ndim = self.dim() + static_cast(complex_output) - static_cast(complex_input); - std::vector unflatten_output_shape(output_ndim); - std::copy(self_shape.begin(), self_shape.begin() + batch_ndim, unflatten_output_shape.begin()); - std::copy(output_sizes.begin() + 1, output_sizes.end(), unflatten_output_shape.begin() + batch_ndim); - output = output.reshape(unflatten_output_shape); - } - return output; -} - -// Wrapper to preserve the historic signature of _fft_with_size -// NOTE: This is only used for torchscript backwards compatibility and the new -// signature with normalization modes should be used in all other cases -Tensor _fft_with_size(const Tensor& input, int64_t signal_ndim, - bool complex_input, bool complex_output, - bool inverse, IntArrayRef checked_signal_sizes, - bool normalized, bool onesided, - IntArrayRef output_sizes) { - fft_norm_mode norm; - if (normalized) { - norm = fft_norm_mode::by_root_n; - } else { - norm = inverse ? fft_norm_mode::by_n : fft_norm_mode::none; - } - return at::_fft_with_size( - input, signal_ndim, complex_input, complex_output, inverse, - checked_signal_sizes, static_cast(norm), onesided, output_sizes); -} - // We call the following methods via CUDA hooks because they are really only // valid when CUDA is available. See native/cuda/CuFFTPlanCache.h for more details. int64_t _cufft_get_plan_cache_max_size(int64_t device_index) { @@ -567,36 +428,6 @@ void _cufft_clear_plan_cache(int64_t device_index) { detail::getCUDAHooks().cuFFTClearPlanCache(device_index); } -static Tensor fft(const Tensor& self, const int64_t signal_ndim, const bool normalized) { - return _fft(self, signal_ndim, /* complex_input */ true, - /* complex_output */ true, /* inverse */ false, {}, - normalized ? fft_norm_mode::by_root_n : fft_norm_mode::none, - /* onesided */ false); -} - -static Tensor ifft(const Tensor& self, const int64_t signal_ndim, const bool normalized) { - return _fft(self, signal_ndim, /* complex_input */ true, - /* complex_output */ true, /* inverse */ true, {}, - normalized ? 
fft_norm_mode::by_root_n : fft_norm_mode::by_n, - /* onesided */ false); -} - -static Tensor rfft(const Tensor& self, const int64_t signal_ndim, const bool normalized, - const bool onesided) { - return _fft(self, signal_ndim, /* complex_input */ false, - /* complex_output */ true, /* inverse */ false, {}, - normalized ? fft_norm_mode::by_root_n : fft_norm_mode::none, - onesided); -} - -static Tensor irfft(const Tensor& self, const int64_t signal_ndim, const bool normalized, - const bool onesided, IntArrayRef signal_sizes) { - return _fft(self, signal_ndim, /* complex_input */ true, - /* complex_output */ false, /* inverse */ true, signal_sizes, - normalized ? fft_norm_mode::by_root_n : fft_norm_mode::by_n, - onesided); -} - template static Stream& write_opt(Stream& SS, const optional& value) { if (value) { diff --git a/aten/src/ATen/native/cuda/SpectralOps.cu b/aten/src/ATen/native/cuda/SpectralOps.cu index de807c8c5300..3ad0c06c69fc 100644 --- a/aten/src/ATen/native/cuda/SpectralOps.cu +++ b/aten/src/ATen/native/cuda/SpectralOps.cu @@ -589,112 +589,5 @@ Tensor _fft_c2c_cufft(const Tensor& self, IntArrayRef dim, int64_t normalization return output; } -// cuFFT -// Currently not utilizing multi GPUs so this can be potentially sped up. -Tensor _fft_cufft(const Tensor& self, int64_t signal_ndim, - bool complex_input, bool complex_output, bool inverse, - IntArrayRef checked_signal_sizes, int64_t normalization, bool onesided, - IntArrayRef output_sizes) { - - CuFFTParamsLRUCache& plan_cache = cufft_get_plan_cache(self.device().index()); - - Tensor input = self; - const auto fft_type = GetCuFFTTransformType(complex_input, complex_output); - - if (complex_input) { - TORCH_CHECK(input.size(-1) == 2, "Expected a complex (size 2) last dimension"); - } - - - // Slice when twosided complex-to-real. This is not always needed because we - // calculate the inembed. But it will benefit us in certain cases where we - // clone the input tensor. - // - // See NOTE [ cuFFT Embedded Strides ]. - // See NOTE [ Fourier Transform Conjugate Symmetry ] in native/SpectralOpsUtils.h. - if (fft_type == CuFFTTransformType::C2R && !onesided) { - auto onesided_size = infer_ft_real_to_complex_onesided_size(checked_signal_sizes[signal_ndim - 1]); - input = input.narrow(signal_ndim, 0, onesided_size); - } - - // cuFFT requires input and output data pointers to complex type aligned. - // Our newly allocated output tensor is always 512 bytes aligned so it is fine - // (see kRoundSmall and kRoundLarge in THCCachingAllocator.cpp), but we do - // need to check input tensor to make sure that it is not unaligned, e.g., - // from a slicing. - bool must_clone = false; - auto complex_size_bytes = 2 * input.element_size(); - if (reinterpret_cast(input.data_ptr()) % complex_size_bytes != 0) { - must_clone = true; - } - - if (complex_input) { - auto strides = input.strides(); - // Real/imag dimension must be like complex type. - must_clone |= strides.back() != 1; - // Strides of other dimensions needs to be aligned when viewed as complex - // type, i.e., multiples of 2. 
- must_clone |= std::any_of(strides.begin(), strides.end() - 1, - [&](int64_t stride) { return stride % 2 != 0; }); - - // Complex to real FFTs may overwrite the input buffer (gh-34551) - must_clone |= !complex_output; - } - - if (must_clone) { - input = input.clone(MemoryFormat::Contiguous); - } - - // Now that we have done error check and data_ptr checks, we delegate all - // further cuFFT parameter computation and plan creation to the helper class - // CuFFTConfig in CuFFTPlanCache.h. - - // If plan caching is enabled, we check the cache. Note that this accesses - // plan_cache.max_size() and thus makes this function less functional. - // However, integrating additional arguments into the "public" level c++ APIs, - // e.g., irfft, is difficult as we have a long call sequence looking like - // irfft --> _fft --> _fft_with_size --dispatching-to-> _fft_cufft - - DimVector in_strides(signal_ndim + 1); - auto input_strides = input.strides(); - for (int64_t i = signal_ndim; i >= 0; --i) { - in_strides[i] = complex_input ? input_strides[i] / 2 : input_strides[i]; - } - - DimVector out_strides(signal_ndim + 1); - out_strides[signal_ndim] = 1; - if (fft_type == CuFFTTransformType::R2C && onesided) { - out_strides[signal_ndim - 1] = checked_signal_sizes[signal_ndim - 1] / 2 + 1; - } else { - out_strides[signal_ndim - 1] = checked_signal_sizes[signal_ndim - 1]; - } - for (int64_t i = signal_ndim - 2; i >= 0; --i) { - out_strides[i] = out_strides[i + 1] * checked_signal_sizes[i]; - } - - DimVector full_sizes(signal_ndim + 1); - full_sizes[0] = self.size(0); - std::copy(checked_signal_sizes.begin(), checked_signal_sizes.end(), full_sizes.begin() + 1); - CuFFTParams Params(in_strides, out_strides, full_sizes, fft_type, - c10::toValueType(input.scalar_type())); - - // This read is not locked for perf reason. Shouldn't matter too much because - // we check again after acquiring the lock. 
- if (plan_cache.max_size() > 0) { - std::lock_guard guard(plan_cache.mutex); - if (plan_cache.max_size() > 0) { // check again after acquiring the lock - const CuFFTConfig &config = plan_cache.lookup(Params); - return _run_cufft(config, input, signal_ndim, complex_input, - complex_output, inverse, checked_signal_sizes, - static_cast(normalization), - onesided, output_sizes, must_clone); - } - } - CuFFTConfig config(Params); - return _run_cufft(config, input, signal_ndim, complex_input, - complex_output, inverse, checked_signal_sizes, - static_cast(normalization), - onesided, output_sizes, must_clone); -} }} // at::native diff --git a/aten/src/ATen/native/mkl/SpectralOps.cpp b/aten/src/ATen/native/mkl/SpectralOps.cpp index 9584fafcea4b..8fca9ad9ecdf 100644 --- a/aten/src/ATen/native/mkl/SpectralOps.cpp +++ b/aten/src/ATen/native/mkl/SpectralOps.cpp @@ -9,14 +9,6 @@ namespace at { namespace native { REGISTER_NO_CPU_DISPATCH(fft_fill_with_conjugate_symmetry_stub, fft_fill_with_conjugate_symmetry_fn); -Tensor _fft_mkl(const Tensor& input, int64_t signal_ndim, - bool complex_input, bool complex_output, - bool inverse, IntArrayRef checked_signal_sizes, - int64_t normalization, bool onesided, - IntArrayRef output_sizes) { - AT_ERROR("fft: ATen not compiled with MKL support"); -} - Tensor _fft_c2r_mkl(const Tensor& self, IntArrayRef dim, int64_t normalization, int64_t last_dim_size) { AT_ERROR("fft: ATen not compiled with MKL support"); } @@ -280,97 +272,6 @@ static DftiDescriptor _plan_mkl_fft( return descriptor; } -// MKL DFTI -Tensor _fft_mkl(const Tensor& self, int64_t signal_ndim, - bool complex_input, bool complex_output, - bool inverse, IntArrayRef checked_signal_sizes, - int64_t normalization, bool onesided, - IntArrayRef output_sizes) { - Tensor input = self; - bool need_contiguous = false; - // real/imag dimension must aligned when viewed as of complex type - if (complex_input) { - need_contiguous |= input.stride(-1) != 1; - for (int64_t i = 0; !need_contiguous && i <= signal_ndim; i++) { - need_contiguous |= input.stride(i) % 2 != 0; - } - } - - // check if we can use MKL because MKL_LONG is 32bit on some OS, e.g. Windows - // need to check input and output size and strides - // be careful about complex domain, where the stride needs to be divided by 2 - // only need to test upper bound MKL_LONG_MAX as these values are non-negative - if (sizeof(MKL_LONG) < sizeof(int64_t)) { - int64_t inumel = 1 /* istride if we contiguous-fy */, onumel = 1; - int64_t isize, osize, istride, ostride; - for (int64_t i = signal_ndim; i >= 0; i--) { - isize = input.size(i); - osize = output_sizes[i]; - istride = complex_input ? input.stride(i) >> 1 : input.stride(i); - ostride = onumel; - TORCH_CHECK(isize <= MKL_LONG_MAX && osize <= MKL_LONG_MAX && ostride <= MKL_LONG_MAX, - "MKL FFT: input signal numel exceeds allowed range [1 ~ ", MKL_LONG_MAX, "]"); - if (!need_contiguous && istride > MKL_LONG_MAX) { - // If we didn't plan to contiguous-fy but the `istride` exceeds bound, - // check if we can stride (equal to `inumel`) get back within bound if - // we contiguous-fy. If so, then we need to always check `inumel` - // instead for the remaining iterations. The iterations before this are - // fine as `inumel` is non-decreasing. 
- need_contiguous = true; - } - TORCH_CHECK(!need_contiguous || inumel <= MKL_LONG_MAX, - "MKL FFT: input signal numel exceeds allowed range [1 ~ ", MKL_LONG_MAX, "]"); - inumel *= isize; - onumel *= osize; - } - } - - if (need_contiguous) { - input = input.contiguous(); - } - - - Tensor output = at::empty(output_sizes, input.options()); - - DimVector full_sizes(signal_ndim + 1); - full_sizes[0] = self.size(0); - std::copy(checked_signal_sizes.cbegin(), checked_signal_sizes.cend(), full_sizes.begin() + 1); - - // If "complex" is true, convert strides from complex viewed as real to complex strides. - // Otherwise, returns a copy of strides if "complex" is false. - auto convert_strides = [signal_ndim](IntArrayRef strides, bool complex) { - DimVector res(signal_ndim + 1); - if (complex) { - for (int64_t i = 0; i < res.size(); ++i) { - res[i] = strides[i] / 2; - } - } else { - res.assign(strides.cbegin(), strides.cend()); - } - return res; - }; - const auto in_strides = convert_strides(input.strides(), complex_input); - const auto out_strides = convert_strides(output.strides(), complex_output); - - auto descriptor = _plan_mkl_fft( - in_strides, out_strides, full_sizes, complex_input, complex_output, - normalization, !inverse, input.scalar_type()); - - if (inverse) { - MKL_DFTI_CHECK(DftiComputeBackward(descriptor.get(), input.data_ptr(), output.data_ptr())); - } else { - MKL_DFTI_CHECK(DftiComputeForward(descriptor.get(), input.data_ptr(), output.data_ptr())); - } - // now if needed, fill out the other half using Hermitian symmetry dim - if (!complex_input && complex_output && !onesided) { - DimVector signal_dims(signal_ndim); - std::iota(signal_dims.begin(), signal_dims.end(), 1); - auto out_as_complex = at::view_as_complex(output); - at::native::_fft_fill_with_conjugate_symmetry_(out_as_complex, signal_dims); - } - return output; -} - // Execute a general fft operation (can be c2c, onesided r2c or onesided c2r) static Tensor& _exec_fft(Tensor& out, const Tensor& self, IntArrayRef out_sizes, IntArrayRef dim, int64_t normalization, bool forward) { diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 8a30507203ae..189d3d7c5cf7 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -2054,17 +2054,6 @@ dispatch: CPU, CUDA: native_group_norm_backward -- func: _fft_with_size(Tensor self, int signal_ndim, bool complex_input, bool complex_output, bool inverse, int[] checked_signal_sizes, bool normalized, bool onesided, int[] output_sizes) -> Tensor - use_c10_dispatcher: full - variants: function - -- func: _fft_with_size.norm_modes(Tensor self, int signal_ndim, bool complex_input, bool complex_output, bool inverse, int[] checked_signal_sizes, int normalization, bool onesided, int[] output_sizes) -> Tensor - use_c10_dispatcher: full - variants: function - dispatch: - CPU: _fft_mkl - CUDA: _fft_cufft - # Real to complex forward FFT - func: _fft_r2c(Tensor self, int[] dim, int normalization, bool onesided) -> Tensor use_c10_dispatcher: full diff --git a/test/backward_compatibility/check_backward_compatibility.py b/test/backward_compatibility/check_backward_compatibility.py index e155537d7b99..deb7a161e1d3 100644 --- a/test/backward_compatibility/check_backward_compatibility.py +++ b/test/backward_compatibility/check_backward_compatibility.py @@ -189,6 +189,7 @@ ("aten::rfft", datetime.date(2021, 1, 31)), ("aten::quantile", datetime.date(2021, 1, 31)), ("aten::nanquantile", datetime.date(2021, 1, 
31)), + ("aten::_fft_with_size", datetime.date(2021, 1, 31)), ] def allow_listed(schema, allow_list): diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index b88596c2b609..8791dfa7b095 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -1829,9 +1829,6 @@ grad_output, self, weight: _convolution_double_backward(grads[0], grads[1], grads[2], grad_output, weight, self, stride, padding, dilation, false, std::vector(padding.size(), 0), groups, false, false, false, false, grad_input_mask) # fft -- name: _fft_with_size.norm_modes(Tensor self, int signal_ndim, bool complex_input, bool complex_output, bool inverse, int[] checked_signal_sizes, int normalization, bool onesided, int[] output_sizes) -> Tensor - self: fft_backward(self, grad, signal_ndim, complex_input, complex_output, inverse, checked_signal_sizes, normalization, onesided, output_sizes) - - name: _fft_r2c(Tensor self, int[] dim, int normalization, bool onesided) -> Tensor self: fft_r2c_backward(grad, dim, normalization, onesided, self.size(dim.back())) diff --git a/torch/csrc/autograd/FunctionsManual.cpp b/torch/csrc/autograd/FunctionsManual.cpp index 6da1a7e5e934..ed08e541661b 100644 --- a/torch/csrc/autograd/FunctionsManual.cpp +++ b/torch/csrc/autograd/FunctionsManual.cpp @@ -2315,94 +2315,6 @@ std::tuple cholesky_solve_backward( return std::tuple{grad_self, grad_input2}; } -// Generally speaking, fft's backward is ifft. -Tensor fft_backward(const Tensor& self, const Tensor& grad, int64_t signal_ndim, - bool complex_input, bool complex_output, - bool inverse, IntArrayRef checked_signal_sizes, - int64_t normalization, bool onesided, - IntArrayRef output_sizes) { - Tensor gI; - if (!complex_input && complex_output) { - // Forward is R2C - // Do inverse C2C and project onto real plane because grad can be - // asymmetrical so C2R can't be used. - if (onesided) { - // Forward is R2C (onesided) - // Think of onesided R2C rfft as - // 1. view as complex numbers (fill complex dim with zeros) - // 2. C2C fft - // 3. discard half of results - // So backward is - // 1. fill the other half with zeros (with `zero_grad_shape` below) - // (C2C ifft only take twosided inputs so we need to fill here) - // 2. inverse C2C ifft - // 3. discard the complex dim - int64_t zero_length = checked_signal_sizes[signal_ndim - 1] - grad.size(signal_ndim); - auto complex_full_grad = grad; - if (zero_length > 0) { - std::vector zero_grad_shape(signal_ndim + 2); - zero_grad_shape[0] = self.size(0); - for (int64_t i = 1; i < signal_ndim; i++) { - zero_grad_shape[i] = checked_signal_sizes[i - 1]; - } - zero_grad_shape[signal_ndim] = zero_length; - zero_grad_shape[signal_ndim + 1] = 2; - complex_full_grad = at::cat({ grad, at::zeros(zero_grad_shape, grad.options()) }, signal_ndim); - } - gI = _fft_with_size(complex_full_grad, signal_ndim, - /* complex_input */ true, /* complex_output */ true, - !inverse, checked_signal_sizes, normalization, - /* onesided */ false, complex_full_grad.sizes()).select(-1, 0); - } else { - gI = _fft_with_size(grad, signal_ndim, /* complex_input */ true, - /* complex_output */ true, !inverse, - checked_signal_sizes, normalization, - /* onesided */ false, grad.sizes()).select(-1, 0); - } - } else if (complex_input && !complex_output && onesided) { - // Forward is C2R (onesided) - // Think of onesided C2R irfft as - // 1. fill the other half by conjugate symmetry - // 2. inverse C2C ifft - // 3. discard the complex dimension - // So backward is - // 1. 
R2C rfft (essentially add dummy complex dimension, and dft) - // 2. accumulate gradient by conjugate symmetry - // since rfft results follow conjugate symmetry, we only need to - // double some entries from onesided rfft results, i.e., the ones with - // their reflected indices also landing out of the onesided range. So - // consider the index of last dim: - // i. idx = 0. - // Reflected to (N - 0) % N = 0. Not doubled. - // ii 0 < idx < floor(N/2) (last). - // N > N - idx > ceil(N/2) - // Reflected to () - // iii. idx = floor(N/2) = N/2 (last) when N even. - // Reflected to (N - N/2) % N = N/2. Not doubled. - // iv. idx = floor(N/2) = (N-1)/2 (last) when N odd. - // Reflected to (N - (N-1)/2) % N = (N+1)/2. Doubled. - // Therefore, needs to double - // idx = 1, 2, ..., N/2 - 1 when N even - // idx = 1, 2, ..., (N-1)/2 when N odd - // that is - // idx = 1, 2, ..., N - (floor(N/2) + 1) - // = 1, 2, ..., N - onesided_length - gI = _fft_with_size(grad, signal_ndim, /* complex_input */ false, - /* complex_output */ true, /* inverse */ false, - checked_signal_sizes, normalization, /* onesided */ true, - self.sizes()); - int64_t double_length = checked_signal_sizes[signal_ndim - 1] - self.size(signal_ndim); - if (double_length > 0) { // also covers case when signal size is zero - gI.narrow(signal_ndim, 1, double_length).mul_(2); - } - } else { - gI = _fft_with_size(grad, signal_ndim, complex_output, complex_input, - !inverse, checked_signal_sizes, normalization, onesided, - self.sizes()); - } - return gI; -} - Tensor fft_c2r_backward(const Tensor& grad, IntArrayRef dim, int64_t normalization) { // Forward is C2R (onesided) // Think of onesided C2R irfft as From f54ab8fbfe0beedb70f378ba2f5254d8f37ef870 Mon Sep 17 00:00:00 2001 From: Brian Hirsh Date: Mon, 14 Dec 2020 07:31:50 -0800 Subject: [PATCH 218/250] Revert "Revert D25003113: make validate debug-only in Device copy ctr" (#49123) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49123 This reverts commit 7a4a2df2254b78d8c8d42b9f81b5b261a617466e. Test Plan: Imported from OSS Reviewed By: ezyang Differential Revision: D25463531 Pulled By: bdhirsh fbshipit-source-id: 7c7ecdc1d63ffd137b84a129887c424b2083a958 --- c10/core/Device.h | 8 ++++++-- test/test_torch.py | 4 ---- .../_internal/distributed/nn/api/remote_module_test.py | 10 ---------- 3 files changed, 6 insertions(+), 16 deletions(-) diff --git a/c10/core/Device.h b/c10/core/Device.h index 7827119bb0ac..04cd711c37b2 100644 --- a/c10/core/Device.h +++ b/c10/core/Device.h @@ -93,9 +93,13 @@ struct C10_API Device final { DeviceType type_; DeviceIndex index_ = -1; void validate() { - TORCH_CHECK(index_ == -1 || index_ >= 0, + // Removing these checks in release builds noticeably improves + // performance in micro-benchmarks. + // This is safe to do, because backends that use the DeviceIndex + // have a later check when we actually try to switch to that device. 
+ TORCH_INTERNAL_ASSERT_DEBUG_ONLY(index_ == -1 || index_ >= 0, "Device index must be -1 or non-negative, got ", (int)index_); - TORCH_CHECK(!is_cpu() || index_ <= 0, + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!is_cpu() || index_ <= 0, "CPU device index must be -1 or zero, got ", (int)index_); } }; diff --git a/test/test_torch.py b/test/test_torch.py index d2566a90f382..a8f87c6f2036 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -341,9 +341,6 @@ def test_device(self): self.assertEqual(90, cuda90.index) self.assertRaises(RuntimeError, lambda: torch.device('cpu:-1')) - self.assertRaises(RuntimeError, lambda: torch.device('cpu:1')) - self.assertRaises(RuntimeError, lambda: torch.device('cpu', -1)) - self.assertRaises(RuntimeError, lambda: torch.device('cpu', 1)) self.assertRaises(RuntimeError, lambda: torch.device('cuda:-1')) self.assertRaises(RuntimeError, lambda: torch.device('cuda:2 ')) self.assertRaises(RuntimeError, lambda: torch.device('cuda: 2')) @@ -356,7 +353,6 @@ def test_device(self): self.assertRaises(RuntimeError, lambda: torch.device('cuda:2 cuda:3')) self.assertRaises(RuntimeError, lambda: torch.device('cuda:2+cuda:3')) self.assertRaises(RuntimeError, lambda: torch.device('cuda:2cuda:3')) - self.assertRaises(RuntimeError, lambda: torch.device('cuda', -1)) self.assertRaises(RuntimeError, lambda: torch.device(-1)) self.assertRaises(RuntimeError, lambda: torch.device('other')) diff --git a/torch/testing/_internal/distributed/nn/api/remote_module_test.py b/torch/testing/_internal/distributed/nn/api/remote_module_test.py index 376fdb8049b9..4f14584af3b1 100644 --- a/torch/testing/_internal/distributed/nn/api/remote_module_test.py +++ b/torch/testing/_internal/distributed/nn/api/remote_module_test.py @@ -285,16 +285,6 @@ def test_invalid_devices(self): ) ) - with self.assertRaisesRegex( - RuntimeError, r"CPU device index must be -1 or zero, got 2" - ): - list( - self._create_remote_module_iter( - "{}/cpu:2".format(dst_worker_name), - modes=[ModuleCreationMode.MODULE_CTOR], - ) - ) - with self.assertRaisesRegex(RuntimeError, r"Device string must not be empty"): list( self._create_remote_module_iter( From eb051afa782d51e97496ac8856ef922fd72200e7 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Mon, 14 Dec 2020 08:07:45 -0800 Subject: [PATCH 219/250] [PyTorch] native_cpp_binding for size() and stride() (#49262) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49262 This uses the newly-added native_cpp_binding feature to avoid dispatcher overhead for `size()` and `stride()`. 
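
For readers skimming the patch: call sites do not change. `Tensor::size(dim)` and `Tensor::stride(dim)` become inline header methods that wrap the dim and index straight into `sizes()`/`strides()`, so the C++ call no longer goes through the dispatcher. Below is a minimal usage sketch of the preserved semantics, including negative-dim wrapping; it is not part of this patch and assumes the public libtorch C++ API with a hypothetical 2x3 contiguous tensor.

    #include <torch/torch.h>  // assumption: standalone libtorch example, not PyTorch-internal code

    int main() {
      at::Tensor t = at::zeros({2, 3});  // sizes [2, 3], strides [3, 1]
      // Negative dims wrap exactly as the previously dispatched overloads did.
      TORCH_CHECK(t.size(-1) == 3 && t.size(1) == 3);
      TORCH_CHECK(t.stride(-2) == 3 && t.stride(0) == 3);
      return 0;
    }
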
ghstack-source-id: 118480533 Test Plan: CI Reviewed By: bwasti Differential Revision: D25446275 fbshipit-source-id: 1215eaa530d5aa3d501f89da8c99d0a487d8c1b6 --- aten/src/ATen/native/TensorProperties.cpp | 9 ++------- aten/src/ATen/native/native_functions.yaml | 6 ++++-- aten/src/ATen/templates/Functions.h | 8 ++++++++ aten/src/ATen/templates/TensorBody.h | 13 +++++++++++++ 4 files changed, 27 insertions(+), 9 deletions(-) diff --git a/aten/src/ATen/native/TensorProperties.cpp b/aten/src/ATen/native/TensorProperties.cpp index 48dab43b2dc8..f395c6956da5 100644 --- a/aten/src/ATen/native/TensorProperties.cpp +++ b/aten/src/ATen/native/TensorProperties.cpp @@ -1,6 +1,5 @@ #include #include -#include #include #include #include @@ -14,15 +13,11 @@ bool is_same_size(const Tensor& self, const Tensor& other) { } int64_t size(const Tensor& self, int64_t dim) { - // false is passed to maybe_wrap_dim so behavior is identical to array access (but with wrapping) - dim = maybe_wrap_dim(dim, self.dim(), false); - return self.sizes()[dim]; + return self.size(dim); } int64_t stride(const Tensor& self, int64_t dim) { - // false is passed to maybe_wrap_dim so behavior is identical to array access (but with wrapping) - dim = maybe_wrap_dim(dim, self.dim(), false); - return self.strides()[dim]; + return self.stride(dim); } int64_t size(const Tensor& self, Dimname dim) { diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 189d3d7c5cf7..768ddf2fc17d 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -3561,8 +3561,9 @@ - func: size.int(Tensor self, int dim) -> int use_c10_dispatcher: full - variants: function, method + variants: function device_guard: False + manual_cpp_binding: True - func: size.Dimname(Tensor self, Dimname dim) -> int variants: function, method @@ -3724,8 +3725,9 @@ - func: stride.int(Tensor self, int dim) -> int use_c10_dispatcher: full - variants: function, method + variants: function device_guard: False + manual_cpp_binding: True - func: stride.Dimname(Tensor self, Dimname dim) -> int variants: function, method diff --git a/aten/src/ATen/templates/Functions.h b/aten/src/ATen/templates/Functions.h index 50623dc2dfed..8f5e35d3ea73 100644 --- a/aten/src/ATen/templates/Functions.h +++ b/aten/src/ATen/templates/Functions.h @@ -134,4 +134,12 @@ inline int64_t numel(const Tensor& tensor) { return tensor.numel(); } +inline int64_t size(const Tensor& tensor, int64_t dim) { + return tensor.size(dim); +} + +inline int64_t stride(const Tensor& tensor, int64_t dim) { + return tensor.stride(dim); +} + } diff --git a/aten/src/ATen/templates/TensorBody.h b/aten/src/ATen/templates/TensorBody.h index 850856b335ac..75f614bb6ea8 100644 --- a/aten/src/ATen/templates/TensorBody.h +++ b/aten/src/ATen/templates/TensorBody.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -123,6 +124,18 @@ class CAFFE2_API Tensor { } } + int64_t size(int64_t dim) const { + // false is passed to maybe_wrap_dim so behavior is identical to array access (but with wrapping) + dim = c10::maybe_wrap_dim(dim, this->dim(), false); + return sizes()[dim]; + } + + int64_t stride(int64_t dim) const { + // false is passed to maybe_wrap_dim so behavior is identical to array access (but with wrapping) + dim = c10::maybe_wrap_dim(dim, this->dim(), false); + return strides()[dim]; + } + TensorImpl * unsafeGetTensorImpl() const { return impl_.get(); } From 7d406b4a0751afdc2bd20d7be0920986178b41ae Mon Sep 
17 00:00:00 2001 From: Scott Wolchok Date: Mon, 14 Dec 2020 08:07:45 -0800 Subject: [PATCH 220/250] [PyTorch] Make TORCH_CHECK less likely to interfere with inlining (#49263) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49263 Now it is smaller and calls to an out-of-line function in case of failure. ghstack-source-id: 118480531 Test Plan: 1) Inspect perf profile of internal benchmark, much less time spent in (for example) `c10::impl::getDeviceImpl`, which calls TORCH_CHECK and should be inlined 2) Internal benchmarks Reviewed By: smessmer Differential Revision: D25481308 fbshipit-source-id: 0121ada779ca2518ca717f75920420957b3bb1aa --- c10/util/Exception.cpp | 8 ++++++++ c10/util/Exception.h | 46 +++++++++++++++++++++++++++++------------- 2 files changed, 40 insertions(+), 14 deletions(-) diff --git a/c10/util/Exception.cpp b/c10/util/Exception.cpp index 3f2c34ffae6c..1c6363326343 100644 --- a/c10/util/Exception.cpp +++ b/c10/util/Exception.cpp @@ -76,6 +76,14 @@ void Error::add_context(std::string new_msg) { refresh_what(); } +namespace detail { + +void torchCheckFail(const char *func, const char *file, uint32_t line, const std::string& msg) { + throw ::c10::Error({func, file, line}, msg); +} + +} // namespace detail + namespace Warning { namespace { diff --git a/c10/util/Exception.h b/c10/util/Exception.h index fed17a4cf526..ebd1e872251e 100644 --- a/c10/util/Exception.h +++ b/c10/util/Exception.h @@ -194,7 +194,7 @@ C10_API std::string GetExceptionString(const std::exception& e); namespace detail { // Return x if it is non-empty; otherwise return y. -inline std::string if_empty_then(std::string x, std::string y) { +inline std::string if_empty_then(const std::string& x, const std::string& y) { if (x.empty()) { return y; } else { @@ -324,27 +324,45 @@ inline std::string if_empty_then(std::string x, std::string y) { TORCH_CHECK_WITH_MSG(error_t, cond, "", __VA_ARGS__) #ifdef STRIP_ERROR_MESSAGES -#define TORCH_CHECK_WITH_MSG(error_t, cond, type, ...) \ - if (C10_UNLIKELY_OR_CONST(!(cond))) { \ - C10_THROW_ERROR(Error, \ - #cond #type " CHECK FAILED at " \ - C10_STRINGIZE(__FILE__) \ - ); \ +#define TORCH_CHECK_MSG(cond, type, ...) \ + (#cond #type " CHECK FAILED at " \ + C10_STRINGIZE(__FILE__)) +#define TORCH_CHECK_WITH_MSG(error_t, cond, type, ...) \ + if (C10_UNLIKELY_OR_CONST(!(cond))) { \ + C10_THROW_ERROR(Error, \ + TORCH_CHECK_MSG(cond, type, __VA_ARGS__) \ + ); \ } #else +#define TORCH_CHECK_MSG(cond, type, ...) \ + ::c10::detail::if_empty_then( \ + ::c10::str(__VA_ARGS__), \ + "Expected " #cond " to be true, but got false. " \ + "(Could this error message be improved? If so, " \ + "please report an enhancement request to PyTorch.)" \ + ) #define TORCH_CHECK_WITH_MSG(error_t, cond, type, ...) \ if (C10_UNLIKELY_OR_CONST(!(cond))) { \ C10_THROW_ERROR(error_t, \ - ::c10::detail::if_empty_then( \ - ::c10::str(__VA_ARGS__), \ - "Expected " #cond " to be true, but got false. " \ - "(Could this error message be improved? If so, " \ - "please report an enhancement request to PyTorch.)" \ - ) \ + TORCH_CHECK_MSG(cond, type, __VA_ARGS__) \ ); \ } #endif -#define TORCH_CHECK(cond, ...) TORCH_CHECK_WITH(Error, cond, __VA_ARGS__) + +namespace c10 { +namespace detail { + +[[noreturn]] C10_API void torchCheckFail(const char *func, const char *file, uint32_t line, const std::string& msg); + +} // namespace detail +} // namespace 10 + +#define TORCH_CHECK(cond, ...) 
\ + if (C10_UNLIKELY_OR_CONST(!(cond))) { \ + ::c10::detail::torchCheckFail( \ + __func__, __FILE__, static_cast(__LINE__), \ + TORCH_CHECK_MSG(cond, "", __VA_ARGS__)); \ + } // An utility macro that does what `TORCH_CHECK` does if compiled in the host code, // otherwise does nothing. Supposed to be used in the code shared between host and From bd322c8967b99551df67d479ebcc435c030980c9 Mon Sep 17 00:00:00 2001 From: Kyeongpil Kang Date: Mon, 14 Dec 2020 08:32:59 -0800 Subject: [PATCH 221/250] Update docstrings of torch.nn.modules.activation.MultiheadAttention (#48775) Summary: - Add the link to the original paper (Attention is All You Need) - Fix indentation Pull Request resolved: https://github.com/pytorch/pytorch/pull/48775 Reviewed By: H-Huang Differential Revision: D25465914 Pulled By: heitorschueroff fbshipit-source-id: bbc296ec1523326e323587023c126e820e90ad8d --- torch/nn/modules/activation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torch/nn/modules/activation.py b/torch/nn/modules/activation.py index d0cfa5f80512..4b07682b1af7 100644 --- a/torch/nn/modules/activation.py +++ b/torch/nn/modules/activation.py @@ -831,7 +831,7 @@ def extra_repr(self) -> str: class MultiheadAttention(Module): r"""Allows the model to jointly attend to information from different representation subspaces. - See reference: Attention Is All You Need + See `Attention Is All You Need `_ .. math:: \text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O @@ -849,7 +849,7 @@ class MultiheadAttention(Module): vdim: total number of features in value. Default: None. Note: if kdim and vdim are None, they will be set to embed_dim such that - query, key, and value have the same number of features. + query, key, and value have the same number of features. Examples:: From 8397a62a64e86e16e39e0cb33ef03d0bc4075219 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Mon, 14 Dec 2020 08:47:24 -0800 Subject: [PATCH 222/250] Fix cvtfp32_bf16 (#41280) Summary: For `Vec256::blendv()` operator to work correctly, float32 -nan (0xfffffffff) must be converted to bfloat16 -nan (0xffff). But cvtfp32_bf16 converts -nan to nan (0x7fc0) TODO: Fix float32 +-nan conversion: i.e. 
float32 nan (0x7fffffff) must be converted to bfloat16 (0x7fff) nan Closes https://github.com/pytorch/pytorch/issues/41238 Pull Request resolved: https://github.com/pytorch/pytorch/pull/41280 Reviewed By: mruberry Differential Revision: D23311585 Pulled By: malfet fbshipit-source-id: 79499ce19f1ec3f6c954a874f1cd47f4ece6bdb5 --- aten/src/ATen/cpu/vec256/vec256_bfloat16.h | 2 +- aten/src/ATen/native/cpu/UnaryOpsKernel.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/aten/src/ATen/cpu/vec256/vec256_bfloat16.h b/aten/src/ATen/cpu/vec256/vec256_bfloat16.h index 58a677dc5de0..43389fe61583 100644 --- a/aten/src/ATen/cpu/vec256/vec256_bfloat16.h +++ b/aten/src/ATen/cpu/vec256/vec256_bfloat16.h @@ -25,7 +25,7 @@ static inline void cvtbf16_fp32(const __m256i& a, __m256& o1, __m256& o2) { static inline __m256i cvtfp32_bf16(const __m256& a, const __m256& b) { __m256i lo = _mm256_castps_si256(a); __m256i hi = _mm256_castps_si256(b); - __m256i nan = _mm256_set1_epi32(0x7fc0); + __m256i nan = _mm256_set1_epi32(0xffff); __m256i mask_lo = _mm256_castps_si256(_mm256_cmp_ps(a, a, _CMP_ORD_Q)); __m256i mask_hi = _mm256_castps_si256(_mm256_cmp_ps(b, b, _CMP_ORD_Q)); __m256i ones = _mm256_set1_epi32(0x1); diff --git a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp index bfb136776333..f7c4f9c34613 100644 --- a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp @@ -277,7 +277,7 @@ static void sign_kernel(TensorIterator& iter){ [=](scalar_t a) -> scalar_t { return (0 < a) - (a < 0); }, [=](Vec256 self_vec){ - // Comparision operators returns bitmask. + // Comparison operators returns bitmask. auto left = Vec256::blendv(zero_vec, one_vec, zero_vec < self_vec); auto right = Vec256::blendv(zero_vec, one_vec, self_vec < zero_vec); From 690eaf9c43f3f71051d90e9f47f25b014ce84681 Mon Sep 17 00:00:00 2001 From: mingfeima Date: Mon, 14 Dec 2020 09:46:16 -0800 Subject: [PATCH 223/250] add channels last for AdaptiveAvgPool2d (#48916) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48916 optimize adaptive average pool2d forward path optimize adaptive average pool2d backward path remove unused headers minor change minor change rename the header; add adaptive max pooling in future. 
minor change loosen adapative_pool2d test on nhwc to both device cuda and cpu minor change Test Plan: Imported from OSS Reviewed By: ngimel Differential Revision: D25399469 Pulled By: VitalyFedyunin fbshipit-source-id: 86f9fda35194f21144bd4667b778c861c05a5bac --- .../ATen/native/AdaptiveAveragePooling.cpp | 322 +++--------------- aten/src/ATen/native/AdaptivePooling.h | 21 ++ .../ATen/native/cpu/AdaptiveAvgPoolKernel.cpp | 311 +++++++++++++++++ aten/src/ATen/native/cpu/UpSampleKernel.cpp | 26 +- aten/src/ATen/native/cpu/utils.h | 30 ++ test/test_nn.py | 76 +++-- 6 files changed, 452 insertions(+), 334 deletions(-) create mode 100644 aten/src/ATen/native/AdaptivePooling.h create mode 100644 aten/src/ATen/native/cpu/AdaptiveAvgPoolKernel.cpp create mode 100644 aten/src/ATen/native/cpu/utils.h diff --git a/aten/src/ATen/native/AdaptiveAveragePooling.cpp b/aten/src/ATen/native/AdaptiveAveragePooling.cpp index 9802797874b9..9778aa035cb1 100644 --- a/aten/src/ATen/native/AdaptiveAveragePooling.cpp +++ b/aten/src/ATen/native/AdaptiveAveragePooling.cpp @@ -1,7 +1,6 @@ #include #include -#include -#include +#include namespace at { @@ -9,295 +8,66 @@ namespace native { namespace { - inline int start_index(int a, int b, int c) { - return (int)std::floor((float)(a * c) / b); - } - - inline int end_index(int a, int b, int c) { - return (int)std::ceil((float)((a + 1) * c) / b); - } - - template - static void adaptive_avg_pool2d_single_out_frame( - scalar_t *input_p, - scalar_t *output_p, - int64_t sizeD, - int64_t isizeH, - int64_t isizeW, - int64_t osizeH, - int64_t osizeW, - int64_t istrideD, - int64_t istrideH, - int64_t istrideW) - { - at::parallel_for(0, sizeD, 0, [&](int64_t start, int64_t end) { - for (auto d = start; d < end; d++) - { - /* loop over output */ - int64_t oh, ow; - for(oh = 0; oh < osizeH; oh++) - { - int istartH = start_index(oh, osizeH, isizeH); - int iendH = end_index(oh, osizeH, isizeH); - int kH = iendH - istartH; - - for(ow = 0; ow < osizeW; ow++) - { - int istartW = start_index(ow, osizeW, isizeW); - int iendW = end_index(ow, osizeW, isizeW); - int kW = iendW - istartW; - - /* local pointers */ - scalar_t *ip = input_p + d*istrideD + istartH*istrideH + istartW*istrideW; - scalar_t *op = output_p + d*osizeH*osizeW + oh*osizeW + ow; - - /* compute local average: */ - scalar_t sum = 0; - int ih, iw; - for(ih = 0; ih < kH; ih++) - { - for(iw = 0; iw < kW; iw++) - { - scalar_t val = *(ip + ih*istrideH + iw*istrideW); - sum += val; - } - } - - /* set output to local average */ - *op = sum / kW / kH; - } - } - } - }); - } - - template - void adaptive_avg_pool2d_out_frame( - scalar_t *input_p, - scalar_t *output_p, - int64_t sizeB, - int64_t sizeD, - int64_t isizeH, - int64_t isizeW, - int64_t osizeH, - int64_t osizeW, - int64_t istrideB, - int64_t istrideD, - int64_t istrideH, - int64_t istrideW) - { - at::parallel_for(0, sizeB, 0, [&](int64_t start, int64_t end) { - for (auto b = start; b < end; b++) - { - adaptive_avg_pool2d_single_out_frame( - input_p + b * istrideB, - output_p + b * sizeD * osizeH * osizeW, - sizeD, - isizeH, isizeW, - osizeH, osizeW, - istrideD, - istrideH, istrideW); - } - }); - } - void adaptive_avg_pool2d_out_cpu_template( at::Tensor& output, at::Tensor const& input, IntArrayRef output_size) { TORCH_CHECK(output_size.size() == 2, "adaptive_avg_pool2d: output_size must be 2"); - for (int64_t i = 0; i < input.ndimension(); i++) { + int64_t ndim = input.ndimension(); + for (int64_t i = 0; i < ndim; i++) { TORCH_CHECK(input.size(i) > 0, 
"adaptive_avg_pooling2d(): expected input to have non-empty spatial dimensions, " "but input has sizes ", input.sizes(), " with dimension ", i, " being " "empty"); } - TORCH_CHECK((input.ndimension() == 3 || input.ndimension() == 4), + TORCH_CHECK((ndim == 3 || ndim == 4), "non-empty 3D or 4D (batch mode) tensor expected for input"); + TORCH_CHECK(input.dtype() == output.dtype(), + "expected dtype ", input.dtype(), " for `output` but got dtype ", output.dtype()); - /* sizes */ - int64_t sizeD = input.size(-3); - int64_t isizeH = input.size(-2); - int64_t isizeW = input.size(-1); - /* strides */ - int64_t istrideD = input.stride(-3); - int64_t istrideH = input.stride(-2); - int64_t istrideW = input.stride(-1); - - auto osizeH = output_size[0]; - auto osizeW = output_size[1]; - - /* resize output */ - if (input.ndimension() == 3 || input.size(-4) == 1) - { - if (input.ndimension() == 3) { - output.resize_({sizeD, osizeH, osizeW}); - } else { - output.resize_({1, sizeD, osizeH, osizeW}); - } - AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "adaptive_avg_pool2d_cpu", [&] { - auto input_data = input.data_ptr(); - auto output_data = output.data_ptr(); - adaptive_avg_pool2d_single_out_frame( - input_data, - output_data, - sizeD, - isizeH, isizeW, - osizeH, osizeW, - istrideD, - istrideH, istrideW); - } - ); - } - else - { - int64_t sizeB = input.size(-4); - output.resize_({sizeB, sizeD, osizeH, osizeW}); - int64_t istrideB = input.stride(-4); + int64_t channels = input.size(-3); + int64_t input_height = input.size(-2); + int64_t input_width = input.size(-1); + int64_t output_height = output_size[0]; + int64_t output_width = output_size[1]; - AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "adaptive_avg_pool2d_cpu", [&] { - auto input_data = input.data_ptr(); - auto output_data = output.data_ptr(); - adaptive_avg_pool2d_out_frame( - input_data, - output_data, - sizeB, - sizeD, - isizeH, isizeW, - osizeH, osizeW, - istrideB, - istrideD, - istrideH, istrideW); - }); + if (ndim == 3) { + output.resize_({channels, output_height, output_width}); + } else { + int64_t nbatch = input.size(0); + output.resize_({nbatch, channels, output_height, output_width}, input.suggest_memory_format()); } - } - - template - static void adaptive_avg_pool2d_backward_single_out_frame( - scalar_t *gradInput_p, - scalar_t *gradOutput_p, - int64_t sizeD, - int64_t isizeH, - int64_t isizeW, - int64_t osizeH, - int64_t osizeW) - { - at::parallel_for(0, sizeD, 0, [&](int64_t start, int64_t end) { - for (auto d = start; d < end; d++) - { - scalar_t *gradInput_p_d = gradInput_p + d*isizeW*isizeH; - scalar_t *gradOutput_p_d = gradOutput_p + d*osizeW*osizeH; - - /* calculate average */ - int64_t oh, ow; - for(oh = 0; oh < osizeH; oh++) - { - int istartH = start_index(oh, osizeH, isizeH); - int iendH = end_index(oh, osizeH, isizeH); - int kH = iendH - istartH; - for(ow = 0; ow < osizeW; ow++) - { - - int istartW = start_index(ow, osizeW, isizeW); - int iendW = end_index(ow, osizeW, isizeW); - int kW = iendW - istartW; - - scalar_t grad_delta = gradOutput_p_d[oh*osizeW +ow] / kH / kW; - - int ih, iw; - for(ih = istartH; ih < iendH; ih++) - { - for(iw = istartW; iw < iendW; iw++) - { - /* update gradient */ - gradInput_p_d[ih*isizeW + iw] += grad_delta; - } - } - } - } - } - }); - } - - template - void adaptive_avg_pool2d_backward_out_frame( - scalar_t *gradInput_p, - scalar_t *gradOutput_p, - int64_t sizeB, - int64_t sizeD, - int64_t isizeH, - int64_t isizeW, - int64_t osizeH, - int64_t osizeW) - { - 
at::parallel_for(0, sizeB, 0, [&](int64_t start, int64_t end) { - for (auto b = start; b < end; b++) - { - scalar_t *gradInput_p_d = gradInput_p + b * sizeD * isizeW * isizeH; - scalar_t *gradOutput_p_d = gradOutput_p + b * sizeD * osizeW * osizeH; - adaptive_avg_pool2d_backward_single_out_frame( - gradInput_p_d, - gradOutput_p_d, - sizeD, - isizeH, isizeW, - osizeH, osizeW); - } - }); + adaptive_avg_pool2d_kernel(kCPU, output, input, output_size); } Tensor& adaptive_avg_pool2d_backward_out_cpu_template( - Tensor& gradInput, - const Tensor& gradOutput_, + Tensor& grad_input, + const Tensor& grad_output, const Tensor& input) { - /* sizes */ - int sizeD = input.size(-3); - int isizeH = input.size(-2); - int isizeW = input.size(-1); - int osizeH = gradOutput_.size(-2); - int osizeW = gradOutput_.size(-1); - - /* get contiguous gradOutput */ - auto gradOutput = gradOutput_.contiguous(); + int64_t ndim = grad_output.ndimension(); + for (int64_t i = 0; i < ndim; i++) { + TORCH_CHECK(grad_output.size(i) > 0, + "adaptive_avg_pooling2d_backward(): expected grad_output to have non-empty spatial dimensions, " + "but grad_output has sizes ", grad_output.sizes(), " with dimension ", i, " being " + "empty"); + } - /* backprop */ - if (input.ndimension() == 3 || input.size(-4) == 1) - { - AT_DISPATCH_FLOATING_TYPES_AND_HALF( - input.scalar_type(), "adaptive_avg_pool2d_backward_cpu", [&] { - /* get raw pointers */ - scalar_t *gradInput_data = gradInput.data_ptr(); - scalar_t *gradOutput_data = gradOutput.data_ptr(); + TORCH_CHECK((ndim == 3 || ndim == 4), + "non-empty 3D or 4D (batch mode) tensor expected for grad_output"); + TORCH_CHECK(input.dtype() == grad_output.dtype(), + "expected dtype ", input.dtype(), " for `grad_output` but got dtype ", grad_output.dtype()); + TORCH_CHECK(input.dtype() == grad_input.dtype(), + "expected dtype ", input.dtype(), " for `grad_input` but got dtype ", grad_input.dtype()); - adaptive_avg_pool2d_backward_single_out_frame( - gradInput_data, gradOutput_data, - sizeD, - isizeH, isizeW, - osizeH, osizeW); - } - ); - } - else - { - AT_DISPATCH_FLOATING_TYPES_AND_HALF( - input.scalar_type(), "adaptive_avg_pool2d_backward_cpu", [&] { - /* get raw pointers */ - scalar_t *gradInput_data = gradInput.data_ptr(); - scalar_t *gradOutput_data = gradOutput.data_ptr(); - int64_t sizeB = input.size(-4); + grad_input.resize_(input.sizes(), input.suggest_memory_format()); + grad_input.zero_(); - adaptive_avg_pool2d_backward_out_frame( - gradInput_data, gradOutput_data, - sizeB, sizeD, - isizeH, isizeW, - osizeH, osizeW); - } - ); - } - return gradInput; + adaptive_avg_pool2d_backward_kernel(kCPU, grad_input, grad_output); + return grad_input; } } // namespace @@ -346,25 +116,27 @@ namespace { } Tensor& adaptive_avg_pool2d_backward_out_cpu( - Tensor& gradInput, - const Tensor& gradOutput, + Tensor& grad_input, + const Tensor& grad_output, const Tensor& input) { - gradInput.resize_as_(input); adaptive_avg_pool2d_backward_out_cpu_template( - gradInput, gradOutput, input); - return gradInput; + grad_input, grad_output, input); + return grad_input; } Tensor adaptive_avg_pool2d_backward_cpu( - const Tensor& gradOutput, + const Tensor& grad_output, const Tensor& input) { - auto gradInput = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + auto grad_input = at::empty({0}, input.options()); adaptive_avg_pool2d_backward_out_cpu_template( - gradInput, gradOutput, input); - return gradInput; + grad_input, grad_output, input); + return grad_input; } 
+DEFINE_DISPATCH(adaptive_avg_pool2d_kernel); +DEFINE_DISPATCH(adaptive_avg_pool2d_backward_kernel); + } // at::native } // at diff --git a/aten/src/ATen/native/AdaptivePooling.h b/aten/src/ATen/native/AdaptivePooling.h new file mode 100644 index 000000000000..29b2fd1c94c9 --- /dev/null +++ b/aten/src/ATen/native/AdaptivePooling.h @@ -0,0 +1,21 @@ +#pragma once + +#include +#include + +namespace at { namespace native { + +using adaptive_avg_pooling_fn = void(*)(Tensor& output, const Tensor& input, IntArrayRef output_size); +using adaptive_avg_pooling_backward_fn = void(*)(Tensor& grad_input, const Tensor& grad_output); +DECLARE_DISPATCH(adaptive_avg_pooling_fn, adaptive_avg_pool2d_kernel); +DECLARE_DISPATCH(adaptive_avg_pooling_backward_fn, adaptive_avg_pool2d_backward_kernel); + +static inline int64_t start_index(int64_t a, int64_t b, int64_t c) { + return (int64_t)std::floor((float)(a * c) / b); +} + +static inline int64_t end_index(int64_t a, int64_t b, int64_t c) { + return (int64_t)std::ceil((float)((a + 1) * c) / b); +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/cpu/AdaptiveAvgPoolKernel.cpp b/aten/src/ATen/native/cpu/AdaptiveAvgPoolKernel.cpp new file mode 100644 index 000000000000..b5ed77f6e400 --- /dev/null +++ b/aten/src/ATen/native/cpu/AdaptiveAvgPoolKernel.cpp @@ -0,0 +1,311 @@ +#include + +#include +#include +#include +#include +#include + +namespace at { namespace native { + +namespace { + +template +void cpu_adaptive_avg_pool( + Tensor& output_, + const Tensor& input_, + IntArrayRef output_size) { + auto input = input_.contiguous(); + auto output = output_.contiguous(); + + auto input_data = input.data_ptr(); + auto output_data = output.data_ptr(); + + int64_t ndim = input.ndimension(); + // treat batch size and channels as one dimension + int64_t channels = ndim == 3 ? 
input.size(0) : input.size(0) * input.size(1); + int64_t input_height = input.size(-2); + int64_t input_width = input.size(-1); + int64_t output_height = output_size[0]; + int64_t output_width = output_size[1]; + + // parallel on dim of N, C + at::parallel_for(0, channels, 0, [&](int64_t begin, int64_t end) { + for (int64_t c = begin; c < end; c++) { + scalar_t* input_ptr = input_data + c * input_height * input_width; + scalar_t* output_ptr = output_data + c * output_height * output_width; + + for (int64_t oh = 0; oh < output_height; oh++) { + int64_t ih0 = start_index(oh, output_height, input_height); + int64_t ih1 = end_index(oh, output_height, input_height); + int64_t kh = ih1 - ih0; + + for (int64_t ow = 0; ow < output_width; ow++) { + int64_t iw0 = start_index(ow, output_width, input_width); + int64_t iw1 = end_index(ow, output_width, input_width); + int64_t kw = iw1 - iw0; + + // compute local average + scalar_t sum = 0; + for (int64_t ih = ih0; ih < ih1; ih++) { + for (int64_t iw = iw0; iw < iw1; iw++) { + sum += input_ptr[ih * input_width + iw]; + } + } + output_ptr[oh * output_width + ow] = sum / kh / kw; + } + } + } + }); + + if (!output_.is_contiguous()) { + output_.copy_(output); + } +} + +template +void cpu_adaptive_avg_pool_channels_last( + Tensor& output_, + const Tensor& input_, + IntArrayRef output_size) { + auto memory_format = at::MemoryFormat::ChannelsLast; + auto input = input_.contiguous(memory_format); + auto output = output_.contiguous(memory_format); + + auto input_data = input.data_ptr(); + auto output_data = output.data_ptr(); + + int64_t nbatch = input.size(0); + int64_t channels = input.size(1); + int64_t input_height = input.size(2); + int64_t input_width = input.size(3); + int64_t output_height = output_size[0]; + int64_t output_width = output_size[1]; + + using Vec = vec256::Vec256; + // parallel on dim N, H, W + at::parallel_for(0, nbatch * output_height * output_width, 0, [&](int64_t begin, int64_t end) { + int64_t n = 0; + int64_t oh = 0; + int64_t ow = 0; + data_index_init(begin, n, nbatch, oh, output_height, ow, output_width); + + for (int64_t i = begin; i < end; i++) { + int64_t ih0 = start_index(oh, output_height, input_height); + int64_t ih1 = end_index(oh, output_height, input_height); + int64_t kh = ih1 - ih0; + + int64_t iw0 = start_index(ow, output_width, input_width); + int64_t iw1 = end_index(ow, output_width, input_width); + int64_t kw = iw1 - iw0; + + scalar_t* out = output_data + i * channels; + int64_t size = channels; + + // Note: For oridinary usage scenario, each out lane should + // fit in L1 cache; otherwise consider block dim C. 
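// [Clarifying note; not part of the original patch.] In channels-last layout
// the channel dimension is innermost and unit-stride, so this kernel
// parallelizes over (N, OH, OW) and reduces a full lane of C contiguous
// values per output location with Vec256 loads; the contiguous-layout kernel
// above instead parallelizes over N * C and walks the spatial window one
// channel at a time.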
+ // Pass I: zero the out lane + int64_t d1 = 0; + for (; d1 < size - (size % Vec::size()); d1 += Vec::size()) { + Vec out_vec = Vec(scalar_t(0)); + out_vec.store(out + d1); + } + for (; d1 < size; d1++) { + out[d1] = scalar_t(0); + } + // Pass II: compute local sum + for (int64_t ih = ih0; ih < ih1; ih++) { + for (int64_t iw = iw0; iw < iw1; iw++) { + scalar_t* in = input_data + n * input_height * input_width * channels + + ih * input_width * channels + iw * channels; + + int64_t d2 = 0; + for (; d2 < size - (size % Vec::size()); d2 += Vec::size()) { + Vec out_vec = Vec::loadu(out + d2) + Vec::loadu(in + d2); + out_vec.store(out + d2); + } + for (; d2 < size; d2++) { + out[d2] += in[d2]; + } + } + } + // Pass III: compute local average + int64_t d3 = 0; + for (; d3 < size - (size % Vec::size()); d3 += Vec::size()) { + Vec out_vec = Vec::loadu(out + d3) / Vec(scalar_t(kh * kw)); + out_vec.store(out + d3); + } + for (; d3 < size; d3++) { + out[d3] = out[d3] / kh / kw; + } + + // move on to next output index + data_index_step(n, nbatch, oh, output_height, ow, output_width); + } + }); + + if (!output_.is_contiguous(memory_format)) { + output_.copy_(output); + } +} + +template +void cpu_adaptive_avg_pool_backward( + Tensor& grad_input_, + const Tensor& grad_output_) { + auto grad_output = grad_output_.contiguous(); + auto grad_input = grad_input_.contiguous(); + + auto grad_output_data = grad_output.data_ptr(); + auto grad_input_data = grad_input.data_ptr(); + + int64_t ndim = grad_output.ndimension(); + // treat batch size and channels as one dimension + int64_t channels = ndim == 3 ? grad_output.size(0) : grad_output.size(0) * grad_output.size(1); + int64_t input_height = grad_input.size(-2); + int64_t input_width = grad_input.size(-1); + int64_t output_height = grad_output.size(-2); + int64_t output_width = grad_output.size(-1); + + // parallel on dim of N, C + at::parallel_for(0, channels, 0, [&](int64_t begin, int64_t end) { + for (int64_t c = begin; c < end; c++) { + scalar_t* grad_input_ptr = grad_input_data + c * input_height * input_width; + scalar_t* grad_output_ptr = grad_output_data + c * output_height * output_width; + + for (int64_t oh = 0; oh < output_height; oh++) { + int64_t ih0 = start_index(oh, output_height, input_height); + int64_t ih1 = end_index(oh, output_height, input_height); + int64_t kh = ih1 - ih0; + + for (int64_t ow = 0; ow < output_width; ow++) { + int64_t iw0 = start_index(ow, output_width, input_width); + int64_t iw1 = end_index(ow, output_width, input_width); + int64_t kw = iw1 - iw0; + + scalar_t grad_delta = grad_output_ptr[oh * output_width + ow] / kh / kw; + for (int64_t ih = ih0; ih < ih1; ih++) { + for (int64_t iw = iw0; iw < iw1; iw++) { + grad_input_ptr[ih * input_width + iw] += grad_delta; + } + } + } + } + } + }); + + if (!grad_input_.is_contiguous()) { + grad_input_.copy_(grad_input); + } +} + +template +void cpu_adaptive_avg_pool_backward_channels_last( + Tensor& grad_input_, + const Tensor& grad_output_) { + auto memory_format = at::MemoryFormat::ChannelsLast; + auto grad_input = grad_input_.contiguous(memory_format); + auto grad_output = grad_output_.contiguous(memory_format); + + auto grad_input_data = grad_input.data_ptr(); + auto grad_output_data = grad_output.data_ptr(); + + int64_t nbatch = grad_input.size(0); + int64_t channels = grad_input.size(1); + int64_t input_height = grad_input.size(2); + int64_t input_width = grad_input.size(3); + int64_t output_height = grad_output.size(2); + int64_t output_width = grad_output.size(3); + + using 
Vec = vec256::Vec256; + // parallel on dim N + at::parallel_for(0, nbatch, 0, [&](int64_t begin, int64_t end) { + for (int64_t n = begin; n < end; n++) { + scalar_t* grad_input_ptr = grad_input_data + n * input_height * input_width * channels; + scalar_t* grad_output_ptr = grad_output_data + n * output_height * output_width * channels; + + for (int64_t oh = 0; oh < output_height; oh++) { + int64_t ih0 = start_index(oh, output_height, input_height); + int64_t ih1 = end_index(oh, output_height, input_height); + int64_t kh = ih1 - ih0; + + for (int64_t ow = 0; ow < output_width; ow++) { + int64_t iw0 = start_index(ow, output_width, input_width); + int64_t iw1 = end_index(ow, output_width, input_width); + int64_t kw = iw1 - iw0; + + scalar_t* gout = grad_output_ptr + oh * output_width * channels + ow * channels; + int64_t size = channels; + for (int64_t ih = ih0; ih < ih1; ih++) { + for (int64_t iw = iw0; iw < iw1; iw++) { + scalar_t* gin = grad_input_ptr + ih * input_width * channels + iw * channels; + + int64_t d = 0; + for (; d < size - (size % Vec::size()); d += Vec::size()) { + Vec gin_vec = Vec::loadu(gin + d) + Vec::loadu(gout + d) / Vec(scalar_t(kh * kw)); + gin_vec.store(gin + d); + } + for (; d < size; d++) { + gin[d] += gout[d] / kw / kw; + } + } + } + } + } + } + }); + + if (!grad_input_.is_contiguous(memory_format)) { + grad_input_.copy_(grad_input); + } +} + +void adaptive_avg_pool2d_kernel_impl( + Tensor& output, + const Tensor& input, + IntArrayRef output_size) { + switch (input.suggest_memory_format()) { + case at::MemoryFormat::Contiguous: { + AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "adaptive_avg_pool2d", [&] { + cpu_adaptive_avg_pool(output, input, output_size); + }); + break; + } + case at::MemoryFormat::ChannelsLast: { + AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "adaptive_avg_pool2d_channels_last", [&]{ + cpu_adaptive_avg_pool_channels_last(output, input, output_size); + }); + break; + } + default: + TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous"); + } +} + +void adapative_avg_pool2d_backward_kernel_impl( + Tensor& grad_input, + const Tensor& grad_output) { + switch (grad_output.suggest_memory_format()) { + case at::MemoryFormat::Contiguous: { + AT_DISPATCH_FLOATING_TYPES(grad_output.scalar_type(), "adaptive_avg_pool2d_backward", [&] { + cpu_adaptive_avg_pool_backward(grad_input, grad_output); + }); + break; + } + case at::MemoryFormat::ChannelsLast: { + AT_DISPATCH_FLOATING_TYPES(grad_output.scalar_type(), "adaptive_avg_pool2d_backward_channels_last", [&]{ + cpu_adaptive_avg_pool_backward_channels_last(grad_input, grad_output); + }); + break; + } + default: + TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous"); + } +} + +} // anonymous namespace + +REGISTER_DISPATCH(adaptive_avg_pool2d_kernel, &adaptive_avg_pool2d_kernel_impl); +REGISTER_DISPATCH(adaptive_avg_pool2d_backward_kernel, &adapative_avg_pool2d_backward_kernel_impl); + +}} // at::native diff --git a/aten/src/ATen/native/cpu/UpSampleKernel.cpp b/aten/src/ATen/native/cpu/UpSampleKernel.cpp index aa6d57cdd2df..61e7877761d8 100644 --- a/aten/src/ATen/native/cpu/UpSampleKernel.cpp +++ b/aten/src/ATen/native/cpu/UpSampleKernel.cpp @@ -4,36 +4,12 @@ #include #include #include +#include namespace at { namespace native { namespace { -template -inline T data_index_init(T offset) { - return offset; -} - -template -inline T data_index_init(T offset, T &x, const T &X, Args &&... 
args) { - offset = data_index_init(offset, std::forward(args)...); - x = offset % X; - return offset / X; -} - -inline bool data_index_step() { - return true; -} - -template -inline bool data_index_step(T &x, const T &X, Args &&... args) { - if (data_index_step(std::forward(args)...)) { - x = ((x + 1) == X) ? 0 : (x + 1); - return x == 0; - } - return false; -} - static inline int64_t nearest_idx( int64_t output_index, int64_t input_size, diff --git a/aten/src/ATen/native/cpu/utils.h b/aten/src/ATen/native/cpu/utils.h new file mode 100644 index 000000000000..32d1de5adb51 --- /dev/null +++ b/aten/src/ATen/native/cpu/utils.h @@ -0,0 +1,30 @@ +#pragma once + +namespace at { namespace native { namespace { + +template +inline T data_index_init(T offset) { + return offset; +} + +template +inline T data_index_init(T offset, T &x, const T &X, Args &&... args) { + offset = data_index_init(offset, std::forward(args)...); + x = offset % X; + return offset / X; +} + +inline bool data_index_step() { + return true; +} + +template +inline bool data_index_step(T &x, const T &X, Args &&... args) { + if (data_index_step(std::forward(args)...)) { + x = ((x + 1) == X) ? 0 : (x + 1); + return x == 0; + } + return false; +} + +}}} // namespace at::native:: diff --git a/test/test_nn.py b/test/test_nn.py index 67412d54eed9..2291acbc957a 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -3587,49 +3587,57 @@ def test_adaptive_pooling_size_none(self): output = module(input) self.assertEqual(output.size(), (4,) + (2,) * (numel - 1) + (4,)) - @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") def test_adaptive_pooling_avg_nhwc(self): - input = torch.randint(1, 10, (4, 8, 8, 8), dtype=torch.float32, device="cuda") - input = input.contiguous(memory_format=torch.channels_last).requires_grad_() - grad = torch.randint(1, 10, (4, 8, 7, 7), dtype=torch.float32, device="cuda") - pool = torch.nn.AdaptiveAvgPool2d((7, 7)).cuda() + device_list = ['cpu'] + if TEST_CUDA: + device_list.append('cuda') - ref_input = input.detach().clone().contiguous().requires_grad_(True) - ref_grad = grad.detach().clone().contiguous() - ref_pool = torch.nn.AdaptiveAvgPool2d((7, 7)).cuda() + for device in device_list: + input = torch.randint(1, 10, (4, 8, 8, 8), dtype=torch.float32).to(device) + input = input.contiguous(memory_format=torch.channels_last).requires_grad_() + grad = torch.randint(1, 10, (4, 8, 7, 7), dtype=torch.float32).to(device) + pool = torch.nn.AdaptiveAvgPool2d((7, 7)).to(device) - out = pool(input) - out.backward(grad) - ref_out = ref_pool(ref_input) - ref_out.backward(ref_grad) + ref_input = input.detach().clone().contiguous().requires_grad_(True) + ref_grad = grad.detach().clone().contiguous() + ref_pool = torch.nn.AdaptiveAvgPool2d((7, 7)).to(device) - self.assertTrue(out.is_contiguous(memory_format=torch.channels_last)) - self.assertTrue(ref_out.is_contiguous()) - self.assertEqual(out, ref_out) - self.assertEqual(input.grad, ref_input.grad) + out = pool(input) + out.backward(grad) + ref_out = ref_pool(ref_input) + ref_out.backward(ref_grad) + + self.assertTrue(out.is_contiguous(memory_format=torch.channels_last)) + self.assertTrue(ref_out.is_contiguous()) + self.assertEqual(out, ref_out) + self.assertEqual(input.grad, ref_input.grad) - @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") def test_adaptive_pooling_avg_nhwc_non_contiguous(self): - input = torch.randint(1, 10, (4, 8, 8, 8), dtype=torch.float32, device="cuda") - input = input.contiguous(memory_format=torch.channels_last) - input = input[:, ::2, :, 
:].requires_grad_() - grad = torch.randint(1, 10, (4, 8, 7, 7), dtype=torch.float32, device="cuda") - grad = grad[:, ::2, :, :] - pool = torch.nn.AdaptiveAvgPool2d((7, 7)).cuda() + device_list = ['cpu'] + if TEST_CUDA: + device_list.append('cuda') - ref_input = input.detach().clone().contiguous().requires_grad_(True) - ref_grad = grad.detach().clone().contiguous() - ref_pool = torch.nn.AdaptiveAvgPool2d((7, 7)).cuda() + for device in device_list: + input = torch.randint(1, 10, (4, 8, 8, 8), dtype=torch.float32).to(device) + input = input.contiguous(memory_format=torch.channels_last) + input = input[:, ::2, :, :].requires_grad_() + grad = torch.randint(1, 10, (4, 8, 7, 7), dtype=torch.float32).to(device) + grad = grad[:, ::2, :, :] + pool = torch.nn.AdaptiveAvgPool2d((7, 7)).to(device) - out = pool(input) - out.backward(grad) - ref_out = ref_pool(ref_input) - ref_out.backward(ref_grad) + ref_input = input.detach().clone().contiguous().requires_grad_(True) + ref_grad = grad.detach().clone().contiguous() + ref_pool = torch.nn.AdaptiveAvgPool2d((7, 7)).to(device) - self.assertTrue(out.is_contiguous(memory_format=torch.channels_last)) - self.assertTrue(ref_out.is_contiguous()) - self.assertEqual(out, ref_out) - self.assertEqual(input.grad, ref_input.grad) + out = pool(input) + out.backward(grad) + ref_out = ref_pool(ref_input) + ref_out.backward(ref_grad) + + self.assertTrue(out.is_contiguous(memory_format=torch.channels_last)) + self.assertTrue(ref_out.is_contiguous()) + self.assertEqual(out, ref_out) + self.assertEqual(input.grad, ref_input.grad) @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") @largeTensorTest('12GB', device='cuda') From 87636c07bb95e12b37134f90c08f563ecb47dc0e Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Mon, 14 Dec 2020 09:53:59 -0800 Subject: [PATCH 224/250] CUDA BF16 sparse (#48807) Summary: Fixes #{issue number} Pull Request resolved: https://github.com/pytorch/pytorch/pull/48807 Reviewed By: mruberry Differential Revision: D25526752 Pulled By: ngimel fbshipit-source-id: 9ff8e637486cfd67d46daf0c05142bbe611e08ec --- .../native/sparse/cuda/SparseCUDATensor.cu | 22 +++++----- .../sparse/cuda/SparseCUDATensorMath.cu | 40 ++++++++----------- test/test_sparse.py | 3 +- 3 files changed, 27 insertions(+), 38 deletions(-) diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu index 5d25138500d7..660862181262 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cu @@ -96,18 +96,16 @@ SparseTensor coalesce_sparse_cuda(const SparseTensor& self) { dim3 block(C10_WARP_SIZE, SZ); AT_DISPATCH_ALL_TYPES_AND2( at::ScalarType::Half, at::ScalarType::BFloat16, values.scalar_type(), "coalesce_sparse_cuda", [&] { - AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "coalesce_sparse_cuda", [&] { - using cuda_accscalar_t = acc_type; - apply::coalesceValuesKernel<<>>( - uniqueOffsets.data_ptr(), - origIndices.data_ptr(), - values.data_ptr(), - newValues.data_ptr(), - nnz, - newNnz, - stride - ); - }); + using cuda_accscalar_t = acc_type; + apply::coalesceValuesKernel<<>>( + uniqueOffsets.data_ptr(), + origIndices.data_ptr(), + values.data_ptr(), + newValues.data_ptr(), + nnz, + newNnz, + stride + ); }); } diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu index 81058ec266f2..d0aafe680efb 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu +++ 
b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu @@ -340,13 +340,11 @@ Tensor& add_out_dense_sparse_cuda(Tensor& r_, const Tensor& dense, const SparseT AT_DISPATCH_ALL_TYPES_AND2( at::ScalarType::Half, at::ScalarType::BFloat16, commonDtype, "add_out_dense_sparse_cuda", [&] { - AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "add_out_dense_sparse_cuda", [&] { - apply::sparseElementwiseKernelScalar, uint64_t, scalar_t> - <<>>( - TensorCAddOp(value.to()), - V_INFO(r), I_INFO(indices), V_INFO(values), - static_cast(nnz)); - }); + apply::sparseElementwiseKernelScalar, uint64_t, scalar_t> + <<>>( + TensorCAddOp(value.to()), + V_INFO(r), I_INFO(indices), V_INFO(values), + static_cast(nnz)); }); } else { TORCH_CHECK(cuda::getApplyGrid(nnz * block.x, grid, curDevice), "add: Argument #0: tensor too large or too many dimensions"); @@ -356,13 +354,11 @@ Tensor& add_out_dense_sparse_cuda(Tensor& r_, const Tensor& dense, const SparseT AT_DISPATCH_ALL_TYPES_AND2( at::ScalarType::Half, at::ScalarType::BFloat16, commonDtype, "add_out_dense_sparse_cuda", [&] { - AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "add_out_dense_sparse_cuda", [&] { - apply::sparseElementwiseKernel, uint64_t, scalar_t> - <<>>( - TensorCAddOp(value.to()), - V_INFO(r), I_INFO(indices), V_INFO(values), - static_cast(nnz)); - }); + apply::sparseElementwiseKernel, uint64_t, scalar_t> + <<>>( + TensorCAddOp(value.to()), + V_INFO(r), I_INFO(indices), V_INFO(values), + static_cast(nnz)); }); } } else { @@ -373,11 +369,9 @@ Tensor& add_out_dense_sparse_cuda(Tensor& r_, const Tensor& dense, const SparseT // NB: Purposely not inplace! AT_DISPATCH_ALL_TYPES_AND2( at::ScalarType::Half, at::ScalarType::BFloat16, commonDtype, "add_out_dense_sparse_cuda", [&] { - AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "add_out_dense_sparse_cuda", [&] { - if (value.to() != static_cast(1)) { - values = values.mul(value); - } - }); + if (value.to() != static_cast(1)) { + values = values.mul(value); + } }); int64_t view_rows = 1; @@ -445,11 +439,9 @@ SparseTensor& add_out_sparse_cuda(SparseTensor& r_, const SparseTensor& t, const AT_DISPATCH_ALL_TYPES_AND2( at::ScalarType::Half, at::ScalarType::BFloat16, commonDtype, "add_out_sparse_cuda", [&] { - AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "add_out_sparse_cuda", [&] { - if (value.to() != static_cast(1)) { - s_values_ = s_values_.mul(value); - } - }); + if (value.to() != static_cast(1)) { + s_values_ = s_values_.mul(value); + } }); LongTensor r_indices_ = at::cat({t_indices_, s_indices_}, 1); Tensor r_values_ = at::cat({t_values_, s_values_}, 0); diff --git a/test/test_sparse.py b/test/test_sparse.py index 72a67caa2038..5af630c0acb4 100644 --- a/test/test_sparse.py +++ b/test/test_sparse.py @@ -10,7 +10,7 @@ import random import unittest from torch.testing._internal.common_utils import TestCase, run_tests, skipIfRocm, do_test_dtypes, \ - do_test_empty_full, load_tests, TEST_NUMPY, TEST_WITH_ROCM, IS_WINDOWS + do_test_empty_full, load_tests, TEST_NUMPY, IS_WINDOWS from torch.testing._internal.common_cuda import TEST_CUDA, _get_torch_cuda_version from numbers import Number from torch.autograd.gradcheck import gradcheck @@ -1301,7 +1301,6 @@ def test_spadd_hybrid(self): self._test_spadd_shape(10, [50, 30, 20], [2, 0]) @cuda_only - @unittest.skipIf(not TEST_WITH_ROCM, "runs only on ROCm") def test_sparse_add_out_bfloat16(self): # fp32 x, _, _ = self._gen_sparse(3, 5, 10) From a0432a7020c229bce76aa22d76e57859df0d43e4 Mon Sep 17 00:00:00 2001 From: Sebastian Pop Date: Mon, 14 Dec 2020 10:07:56 -0800 Subject: [PATCH 225/250] [AARCH64] 
Fix vst1q_f32_x2 implementation (#49273) Summary: Add memory operands to inline asm, that informs the compiler that this instruction writes to memory. Fixes https://github.com/pytorch/pytorch/issues/48901 Pull Request resolved: https://github.com/pytorch/pytorch/pull/49273 Reviewed By: walterddr Differential Revision: D25512921 Pulled By: malfet fbshipit-source-id: 474d070e1f7c2167b9958cbeb4e401dc0e4a930b --- aten/src/ATen/cpu/vec256/missing_vst1_neon.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/aten/src/ATen/cpu/vec256/missing_vst1_neon.h b/aten/src/ATen/cpu/vec256/missing_vst1_neon.h index dbb2ba479f85..dffd5dbb862e 100644 --- a/aten/src/ATen/cpu/vec256/missing_vst1_neon.h +++ b/aten/src/ATen/cpu/vec256/missing_vst1_neon.h @@ -4,6 +4,5 @@ __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1q_f32_x2 (float32_t * __a, float32x4x2_t val) { - asm ("st1 {%S0.4s - %T0.4s}, [%1]" :: "w" (val), "r"(__a) :); + asm ("st1 {%S1.4s - %T1.4s}, [%2]" : "=m" (*__a) : "w" (val), "r"(__a) : "memory"); } - From 25833e5d1cba1ba3a8631d4014c37879337d00dc Mon Sep 17 00:00:00 2001 From: Tao Xu Date: Mon, 14 Dec 2020 10:25:44 -0800 Subject: [PATCH 226/250] [CrashFix] Make the dst tensor contiguous when copying from metal Summary: Somehow the destination tensor becomes incontiguous when copying from Metal. We need to call `.contiguous()` explicitly. See the crash log - https://www.internalfb.com/intern/logview/details/facebook_ios_crashes/1d865405fbc1a45f9517470906c9ec08/ Test Plan: - verify the crash - Sandcastle CIs Reviewed By: dreiss Differential Revision: D25502884 fbshipit-source-id: 46ee720bf6b6658e51cb56a4e4c16ce121eeabc7 --- aten/src/ATen/native/metal/mpscnn/MPSCNNOps.mm | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/aten/src/ATen/native/metal/mpscnn/MPSCNNOps.mm b/aten/src/ATen/native/metal/mpscnn/MPSCNNOps.mm index 8dd27aa1c3ed..9b1ff29feaa1 100644 --- a/aten/src/ATen/native/metal/mpscnn/MPSCNNOps.mm +++ b/aten/src/ATen/native/metal/mpscnn/MPSCNNOps.mm @@ -608,7 +608,9 @@ Tensor copy_to_host(const Tensor& input) { MPSImage* X = imageFromTensor(input); MetalCommandBuffer* commandBuffer = commandBufferFromInputTensor(input); auto&& sizes = [X sizes]; - MetalTensor mt{sizes}; + auto dummy = at::zeros(input.sizes()).contiguous(); + auto strides = dummy.strides(); + MetalTensor mt{sizes, strides.vec()}; mt.texture()->setCommandBuffer(commandBuffer); mt.texture()->allocateTextureStorage(sizes); MPSImage* Y = imageFromMetalTensor(mt); From c068180a176fb7d67add6c52ce5b50ff544fb5e5 Mon Sep 17 00:00:00 2001 From: Michael Carilli Date: Mon, 14 Dec 2020 10:49:57 -0800 Subject: [PATCH 227/250] [CUDA graphs] Cuda RNG-safe graph capture and replay bindings (#48875) Summary: Part 2 of https://github.com/pytorch/pytorch/pull/46148 refactor. (part 1 was https://github.com/pytorch/pytorch/pull/48694.) Contains - a few more CUDAGeneratorImpl diffs to clean up graph capture interaction - Capture and replay bindings that interact correctly with CUDAGeneratorImpl - Tests. Diffs compile and tests pass on my machine (ubuntu 20.04, cuda 11.0) but it needs finetuning for many CI builds. See [Note [CUDA Graph-safe RNG states]](https://github.com/pytorch/pytorch/blob/02d89f9f1d7f32ebf7ec509d5c14b2f39690997a/aten/src/ATen/CUDAGeneratorImpl.h#L13-L85) for the strategy, based on https://github.com/pytorch/pytorch/pull/46148#issuecomment-724414794. 
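For orientation, the capture/replay surface added here is exercised by the new tests in test_cuda.py; a minimal usage sketch follows (torch.cuda._Graph is the private binding introduced by this patch, and a CUDA >= 11.0 build plus a non-default capture stream are assumed):

import torch

side = torch.cuda.Stream()
side.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(side):
    a = torch.zeros(1000, device="cuda")
    a += 1                     # eager warm-up on the side stream
    g = torch.cuda._Graph()
    g.capture_begin()          # capture must run on a non-default stream
    a += 1                     # recorded, not executed, during capture
    g.capture_end()
torch.cuda.current_stream().wait_stream(side)

g.replay()                     # each replay re-runs the captured `a += 1`
g.replay()
assert a.sum().item() == 3000  # 1 (eager) + 2 (replays) per element, as in test_graph_capture_simple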
Pull Request resolved: https://github.com/pytorch/pytorch/pull/48875 Reviewed By: zou3519 Differential Revision: D25482654 Pulled By: ngimel fbshipit-source-id: 634dbc4c6c9d7d0d9a62dc81a52d430561f905fe --- BUILD.bazel | 3 +- aten/src/ATen/CUDAGeneratorImpl.h | 5 +- aten/src/ATen/cuda/CUDAGeneratorImpl.cpp | 45 +++-- aten/src/ATen/cuda/CUDAGraph.cpp | 168 +++++++++++++++++++ aten/src/ATen/cuda/CUDAGraph.h | 43 +++++ aten/src/ATen/cuda/detail/CUDAHooks.cpp | 4 +- test/test_cuda.py | 204 +++++++++++++++++++++++ tools/build_variables.bzl | 1 + torch/_C/__init__.pyi.in | 4 + torch/csrc/Module.cpp | 2 + torch/csrc/cuda/Graph.cpp | 46 +++++ torch/cuda/__init__.py | 2 +- torch/cuda/streams.py | 5 +- 13 files changed, 502 insertions(+), 30 deletions(-) create mode 100644 aten/src/ATen/cuda/CUDAGraph.cpp create mode 100644 aten/src/ATen/cuda/CUDAGraph.h create mode 100644 torch/csrc/cuda/Graph.cpp diff --git a/BUILD.bazel b/BUILD.bazel index 5da8edc2c34e..ec5111c5104d 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -339,7 +339,8 @@ filegroup( "aten/src/ATen/cuda/CUDABlas.cpp", "aten/src/ATen/cuda/CUDASolver.cpp", "aten/src/ATen/cuda/CUDAContext.cpp", - "aten/src/ATen/cuda/CUDAGenerator.cpp", + "aten/src/ATen/cuda/CUDAGeneratorImpl.cpp", + "aten/src/ATen/cuda/CUDAGraph.cpp", "aten/src/ATen/cuda/CuSparseHandlePool.cpp", "aten/src/ATen/cuda/CublasHandlePool.cpp", "aten/src/ATen/cuda/CusolverDnHandlePool.cpp", diff --git a/aten/src/ATen/CUDAGeneratorImpl.h b/aten/src/ATen/CUDAGeneratorImpl.h index ec83128c7013..9a9febd01f8e 100644 --- a/aten/src/ATen/CUDAGeneratorImpl.h +++ b/aten/src/ATen/CUDAGeneratorImpl.h @@ -131,8 +131,8 @@ struct TORCH_CUDA_API CUDAGeneratorImpl : public c10::GeneratorImpl { uint64_t seed() override; void set_philox_offset_per_thread(uint64_t offset); uint64_t philox_offset_per_thread(); - void graph_prologue(int64_t* offset_extragraph); - uint64_t graph_epilogue(); + void capture_prologue(int64_t* offset_extragraph); + uint64_t capture_epilogue(); PhiloxCudaState philox_cuda_state(uint64_t increment); // Temporarily accommodates call sites that use philox_engine_inputs. @@ -147,6 +147,7 @@ struct TORCH_CUDA_API CUDAGeneratorImpl : public c10::GeneratorImpl { uint64_t philox_offset_per_thread_ = 0; int64_t* offset_extragraph_; uint32_t offset_intragraph_ = 0; + bool graph_expects_this_gen_ = false; }; namespace cuda { diff --git a/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp b/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp index f0db9014163a..8a5e4f48e0c0 100644 --- a/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp +++ b/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp @@ -84,7 +84,7 @@ Generator createCUDAGenerator(DeviceIndex device_index) { */ CUDAGeneratorImpl::CUDAGeneratorImpl(DeviceIndex device_index) : c10::GeneratorImpl{Device(DeviceType::CUDA, device_index), - DispatchKeySet(c10::DispatchKey::CUDA)} { + DispatchKeySet(c10::DispatchKey::CUDA)} { at::cuda::assertNotCapturing("Cannot construct a new CUDAGeneratorImpl"); } @@ -101,20 +101,18 @@ void CUDAGeneratorImpl::set_current_seed(uint64_t seed) { } #define CAPTURE_DEFAULT_GENS_MSG \ -"Non-default (user-constructed) CUDA RNG generators cannot be used " \ -"in regions captured by CUDA graphs. " \ -"If you need a non-default CUDA generator in a captured region, " \ -"please file an issue." +"In regions captured by CUDA graphs, you may only use the default CUDA RNG " \ +"generator on the device that's current when capture begins. " \ +"If you need a non-default (user-supplied) generator, or a generator on another " \ +"device, please file an issue." 
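// [Summary comment added for clarity; not part of the original patch.]
// The graph-safe RNG flow below works as follows: capture_prologue() hands
// the generator a pointer to a graph-external offset tensor and zeroes the
// intragraph counter; while capture is underway, philox_cuda_state() returns
// (seed, &offset_extragraph, current intragraph offset) instead of a concrete
// offset; capture_epilogue() reports the total offset consumed inside the
// graph. CUDAGraph::replay() then advances the ordinary per-thread offset by
// that total and writes the fresh starting value into the offset tensor
// before launching, so every replay draws new random numbers.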
/** * Gets the current seed of CUDAGeneratorImpl. */ uint64_t CUDAGeneratorImpl::current_seed() const { - TORCH_CHECK((at::cuda::currentStreamCaptureStatus() == - at::cuda::CaptureStatus::None) || - ((void*)this == - (void*)&at::cuda::detail::getDefaultCUDAGenerator(device_.index())), - CAPTURE_DEFAULT_GENS_MSG); + // Debatable if current_seed() should be allowed in captured regions. + // Conservatively disallow it for now. + at::cuda::assertNotCapturing("Cannot call CUDAGeneratorImpl::current_seed"); return seed_; } @@ -151,25 +149,21 @@ uint64_t CUDAGeneratorImpl::philox_offset_per_thread() { } /** - * Prepares this instance for a cuda graph capture region. + * Called by CUDAGraph to prepare this instance for a graph capture region. * offset_extragraph is the initial offset at the start of the graphed region. * offset_intragraph tracks the offset in the graphed region. */ -void CUDAGeneratorImpl::graph_prologue(int64_t* offset_extragraph) { - TORCH_CHECK((void*)this == - (void*)&at::cuda::detail::getDefaultCUDAGenerator(device_.index()), - CAPTURE_DEFAULT_GENS_MSG); +void CUDAGeneratorImpl::capture_prologue(int64_t* offset_extragraph) { offset_extragraph_ = offset_extragraph; offset_intragraph_ = 0; + graph_expects_this_gen_ = true; } /** - * Finalizes a cuda graph capture region for this instance. + * Called by CUDAGraph to finalize a graph capture region for this instance. */ -uint64_t CUDAGeneratorImpl::graph_epilogue() { - TORCH_CHECK((void*)this == - (void*)&at::cuda::detail::getDefaultCUDAGenerator(device_.index()), - CAPTURE_DEFAULT_GENS_MSG); +uint64_t CUDAGeneratorImpl::capture_epilogue() { + graph_expects_this_gen_ = false; return offset_intragraph_; } @@ -187,7 +181,7 @@ uint64_t CUDAGeneratorImpl::graph_epilogue() { * it intends to generate. * * Increment should be at least the number of curand() random numbers used in - * each thread. It is the user's responsibility to make sure that the increment + * each thread. It is the user's responsibility to make sure the increment * for philox is never smaller than the number of curand() calls. Increment * value > the number of curand() calls won't harm but anything less would mean * that you would be reusing random values from previous calls. @@ -196,17 +190,20 @@ uint64_t CUDAGeneratorImpl::graph_epilogue() { */ PhiloxCudaState CUDAGeneratorImpl::philox_cuda_state(uint64_t increment) { if (at::cuda::currentStreamCaptureStatus() != at::cuda::CaptureStatus::None) { - TORCH_CHECK((void*)this == - (void*)&at::cuda::detail::getDefaultCUDAGenerator(device_.index()), + TORCH_CHECK(graph_expects_this_gen_, + "philox_cuda_state for an unexpected CUDA generator used during capture. 
" CAPTURE_DEFAULT_GENS_MSG); uint32_t offset = this->offset_intragraph_; TORCH_INTERNAL_ASSERT(this->offset_intragraph_ <= - std::numeric_limits::max() - increment); + std::numeric_limits::max() - increment); this->offset_intragraph_ += increment; return PhiloxCudaState(this->seed_, this->offset_extragraph_, offset); } else { + TORCH_CHECK(!graph_expects_this_gen_, + "CUDA generator expects graph capture to be underway, " + "but the current stream is not capturing."); uint64_t offset = this->philox_offset_per_thread_; this->philox_offset_per_thread_ += increment; return PhiloxCudaState(this->seed_, offset); diff --git a/aten/src/ATen/cuda/CUDAGraph.cpp b/aten/src/ATen/cuda/CUDAGraph.cpp new file mode 100644 index 000000000000..74cc5ca09793 --- /dev/null +++ b/aten/src/ATen/cuda/CUDAGraph.cpp @@ -0,0 +1,168 @@ +#include +#include +#include +#include + +namespace at { +namespace cuda { + +/** + * Note [CUDA Graph Wrapper Class] + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * Q: Why do we need graph capture and launch bindings in Pytorch? + * Why can't they live in a user extension, for example? + * + * A1: Convenience. + * A2: To ensure valid numerics on replay, some native CUDA ops (like RNG ops with + * CPU statefulness) need cooperation from the capture and replay bindings + * (see Note [CUDA Graph-safe RNG states] in CUDAGeneratorImpl.h). + * + * We can't expect users to know about this cooperation. If users write capture + * bindings naively in an extension, they likely won't interact with the native + * ops properly. Their graphs would yield invalid numerics on replay. + */ + +CUDAGraph::CUDAGraph() + // CUDAStreams may not be default-constructed. + : capture_stream_(at::cuda::getCurrentCUDAStream()) { +#if CUDA_VERSION < 11000 + TORCH_CHECK(false, "CUDA graphs may only be used in Pytorch built with CUDA >= 11.0"); +#endif +} + +void CUDAGraph::capture_begin() { +#if CUDA_VERSION >= 11000 + TORCH_CHECK(!has_graph_exec_, + "This CUDAGraph instance already owns a captured graph. " + "To capture a new graph, create a new instance."); + + // For now, a CUDAGraph instance only accommodates the default generator on the device that's + // current when capture begins. If any op in the captured region uses a non-default generator, + // or a generator on another device, the offending generator will throw an error. + // These restrictions simplify CUDAGraph, but could be relaxed in the future: + // in principle, the underlying Cuda calls do permit cross-device ops to be captured. + auto* gen = get_generator_or_default( + c10::nullopt, cuda::detail::getDefaultCUDAGenerator()); + + auto options = TensorOptions().device(at::kCUDA).dtype(at::kLong); + offset_extragraph_ = at::empty({1}, options); + + gen->capture_prologue(offset_extragraph_.data_ptr()); + + auto stream = at::cuda::getCurrentCUDAStream(); + + TORCH_CHECK(stream != at::cuda::getDefaultCUDAStream(), + "CUDA graphs must be captured on a non-default stream. " + "(However, after capture, it's ok to replay them on the " + "default stream.)"); + + capture_stream_ = stream; + capture_gen_ = gen; + + // cudaStreamCaptureModeGlobal is the most conservative option to + // prevent potentially unsafe CUDA API calls during capture. See + // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html#group__CUDART__STREAM_1g9d0535d93a214cbf126835257b16ba85 + AT_CUDA_CHECK(cudaStreamBeginCapture(capture_stream_, cudaStreamCaptureModeGlobal)); + + // Stashes the current graph's uuid. 
+ cudaStreamCaptureStatus status; + AT_CUDA_CHECK(cudaStreamGetCaptureInfo(stream, &status, &id_)); + TORCH_INTERNAL_ASSERT(status == cudaStreamCaptureStatus::cudaStreamCaptureStatusActive); +#else + TORCH_CHECK(false, "CUDA graphs may only be used in Pytorch built with CUDA >= 11.0"); +#endif +} + +void CUDAGraph::capture_end() { +#if CUDA_VERSION >= 11000 + auto stream = at::cuda::getCurrentCUDAStream(); + + TORCH_CHECK(stream == capture_stream_, + "Capture must end on the same stream it began on."); + + AT_CUDA_CHECK(cudaStreamEndCapture(capture_stream_, &graph_)); + TORCH_CHECK(graph_ != NULL, "Invalid capture."); + has_graph_ = true; + + // Trailing NULL, NULL, 0 arguments were recommended by Cuda driver people, + // who prefer not to report error message through these arguments moving forward + // (they prefer return value, or errors on api calls internal to the capture) + AT_CUDA_CHECK(cudaGraphInstantiate(&graph_exec_, graph_, NULL, NULL, 0)); + has_graph_exec_ = true; + + auto* gen = get_generator_or_default( + c10::nullopt, cuda::detail::getDefaultCUDAGenerator()); + TORCH_CHECK(gen == capture_gen_, + "Default CUDA RNG generator on current device at capture end " + "is different from default generator on current device " + "when capture began"); + wholegraph_increment_ = gen->capture_epilogue(); + + // Now that we've instantiated graph_ into graph_exec_, + // we don't need graph_ anymore. + AT_CUDA_CHECK(cudaGraphDestroy(graph_)); + has_graph_ = false; +#else + TORCH_CHECK(false, "CUDA graphs may only be used in Pytorch built with CUDA >= 11.0"); +#endif +} + +void CUDAGraph::replay() { +#if CUDA_VERSION >= 11000 + TORCH_CHECK(has_graph_exec_, + "Called CUDAGraph::replay without a preceding successful capture."); + + { + c10::OptionalDeviceGuard device_guard{capture_stream_.device()}; + + // Just like any RNG consumer kernel! + auto* gen = get_generator_or_default( + c10::nullopt, cuda::detail::getDefaultCUDAGenerator()); + PhiloxCudaState rng_engine_inputs; + { + std::lock_guard lock(gen->mutex_); + rng_engine_inputs = gen->philox_cuda_state(wholegraph_increment_); + } + offset_extragraph_.fill_(int64_t(rng_engine_inputs.offset_.val)); + + // graph_exec_ may be replayed in any stream. + AT_CUDA_CHECK(cudaGraphLaunch(graph_exec_, at::cuda::getCurrentCUDAStream())); + } +#else + TORCH_CHECK(false, "CUDA graphs may only be used in Pytorch built with CUDA >= 11.0"); +#endif +} + +void CUDAGraph::reset() { +#if CUDA_VERSION >= 11000 + // I'd prefer these checks throw exceptions, not print warnings, + // but the destructor calls reset(), and at least one CI build + // refuses to compile with a throwing destructor. + // + // Instead of calling reset() in the destructor to clean up, I could + // call reset() in the __del__ method of a thin Python wrapper, + // in which case reset would be allowed to throw exceptions. + // But Stackoverflow does not like user-defined __del__. + // __del__ prevents Graph instances from EVER being garbage collected + // if they participate in a reference cycle. + // And exceptions thrown in __del__ only print a warning anyway. + // + // Calling reset() in the C++ destructor, with warnings instead of exceptions + // if calls fail, is the compromise we chose. 
+ if (has_graph_) { + C10_CUDA_CHECK_WARN(cudaGraphDestroy(graph_)); + } + if (has_graph_exec_) { + C10_CUDA_CHECK_WARN(cudaGraphExecDestroy(graph_exec_)); + } +#else + TORCH_CHECK(false, "CUDA graphs may only be used in Pytorch built with CUDA >= 11.0"); +#endif +} + +CUDAGraph::~CUDAGraph() { + reset(); +} + +} // namespace cuda +} // namespace at diff --git a/aten/src/ATen/cuda/CUDAGraph.h b/aten/src/ATen/cuda/CUDAGraph.h new file mode 100644 index 000000000000..387271715055 --- /dev/null +++ b/aten/src/ATen/cuda/CUDAGraph.h @@ -0,0 +1,43 @@ +#include +#include +#include +#include + +namespace at { +namespace cuda { + +struct TORCH_CUDA_API CUDAGraph { + CUDAGraph(); + ~CUDAGraph(); + + void capture_begin(); + void capture_end(); + void replay(); + void reset(); + + protected: +#if CUDA_VERSION >= 11000 + cudaGraph_t graph_ = NULL; + cudaGraphExec_t graph_exec_ = NULL; +#endif + + // internal states for error checking + bool has_graph_ = false; + bool has_graph_exec_ = false; + + // uuid, retrieved from Cuda + unsigned long long id_; + + // Stream on which capture began + at::cuda::CUDAStream capture_stream_; + + // Default generator on device where capture began + at::CUDAGeneratorImpl* capture_gen_; + + // RNG state trackers + at::Tensor offset_extragraph_; + uint64_t wholegraph_increment_; +}; + +} // namespace cuda +} // namespace at diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.cpp b/aten/src/ATen/cuda/detail/CUDAHooks.cpp index 28b9738034e7..00424ab83ba0 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.cpp +++ b/aten/src/ATen/cuda/detail/CUDAHooks.cpp @@ -163,7 +163,9 @@ bool CUDAHooks::hasPrimaryContext(int64_t device_index) const { TORCH_CHECK(device_index >= 0 && device_index < at::cuda::device_count(), "hasPrimaryContext expects a valid device index, but got device_index=", device_index); unsigned int ctx_flags; - int ctx_is_active; + // In standalone tests of cuDevicePrimaryCtxGetState, I've seen the "active" argument end up with weird + // (garbage-looking nonzero) values when the context is not active, unless I initialize it to zero. + int ctx_is_active = 0; AT_CUDA_DRIVER_CHECK(CUDAHooks::nvrtc().cuDevicePrimaryCtxGetState(device_index, &ctx_flags, &ctx_is_active)); return ctx_is_active == 1; } diff --git a/test/test_cuda.py b/test/test_cuda.py index 6249c250ae2e..498d7e71620e 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -2895,6 +2895,209 @@ def test_max_large_axis(self): def test_to_numpy(self): self.assertRaises(TypeError, lambda: torch.empty(1, device="cuda").numpy()) + @unittest.skipIf((not TEST_CUDA) or + TEST_WITH_ROCM or + int(torch.version.cuda.split(".")[0]) < 11, "CUDA >= 11.0 required for graphs") + def test_graph_capture_simple(self): + s1 = torch.cuda.Stream() + + with torch.cuda.stream(s1): + a = torch.zeros((1000,), device="cuda") + a += 1 + g = torch.cuda._Graph() + g.capture_begin() + a += 1 + g.capture_end() + torch.cuda.current_stream().wait_stream(s1) + + g.replay() + g.replay() + + self.assertTrue(a.sum().item() == 3000.) + + @unittest.skipIf((not TEST_CUDA) or + TEST_WITH_ROCM or + int(torch.version.cuda.split(".")[0]) < 11, "CUDA >= 11.0 required for graphs") + def test_graph_rng_functional(self): + # The caching allocator isn't yet graph-safe. + # In this test, graphed regions try to ensure allocator safety by + # stashing references to all temporaries. This is why we use _fused_dropout + # instead of a public dropout API: _fused_dropout returns the mask temporary + # as well as the output, so we can stash references to both. 
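# (Why the stashing matters: a replayed graph re-issues its kernels against
# the exact device addresses recorded at capture time, so any temporary that
# the caching allocator freed and reused between capture and replay would be
# silently clobbered. Holding references keeps those allocations alive.)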
+ # + # TODO: + # Switch to public dropout API when the allocator is made graph-safe. + ops_with_kwargs = ((torch._fused_dropout, {"p": 0.1}), + (torch.nn.functional.rrelu, {"training": True}),) + size = 10000 + + def run(op, kwargs): + a = torch.randn((size,), device="cuda", dtype=torch.float) + + torch.cuda.manual_seed(5) + + # Control + eager_out = a + for _ in range(6): + out = op(eager_out, **kwargs) + # _fused_dropout returns a tuple, rrelu returns a bare tensor. + eager_out = out[0] if isinstance(out, tuple) else out + + graph_in = a.clone() + stream = torch.cuda.Stream() + stream.wait_stream(torch.cuda.current_stream()) + with torch.cuda.stream(stream): + # warms up allocator so no mallocs occur in capture + refs = () + graph_out = graph_in + for _ in range(3): + out = op(graph_out, **kwargs) + refs += tuple(out) + graph_out = out[0] if isinstance(out, tuple) else out + del out, refs, graph_out + + torch.cuda.manual_seed(5) + + refs = () + g = torch.cuda._Graph() + g.capture_begin() + graph_out = graph_in + for _ in range(2): + out = op(graph_out, **kwargs) + refs += tuple(out) + graph_out = out[0] if isinstance(out, tuple) else out + g.capture_end() + torch.cuda.current_stream().wait_stream(stream) + + # Runs a graphed->eager->graphed sequence of RNG ops. + # replay() plays 2 invocations of the op, so the sequence has 6 + # invocations total, matching Control. + # replay() reads from graph_in and writes to graph_out. + g.replay() + out = op(graph_out, **kwargs) + out = op(out[0], **kwargs)[0] if isinstance(out, tuple) else op(out, **kwargs) + graph_in.copy_(out) + g.replay() + + # If replay() updated RNG state correctly, graph_out + # should now hold data equal to eager_out. + try: + self.assertEqual(eager_out, graph_out) + except Exception as e: + raise RuntimeError("Failed on ", op) from e + + # We hold references to all tensors used across streams up til this sync, + # so no need to call record_stream on those tensors. + torch.cuda.synchronize() + + for op, kwargs in ops_with_kwargs: + run(op, kwargs) + + @unittest.skipIf((not TEST_CUDA) or + TEST_WITH_ROCM or + int(torch.version.cuda.split(".")[0]) < 11, "CUDA >= 11.0 required for graphs") + def test_graph_rng_distributions(self): + # The caching allocator isn't yet graph-safe. + # In this test, all ops maintain static references to inputs and outputs + # that persist across replay(), so they should be safe to test with graphs, + # EXCEPT for multinomial which is a complicated compound op. + # + # TODO: + # Uncomment multinomial when the allocator is made graph-safe. 
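# (Test strategy, for clarity: the controls are drawn eagerly after
# torch.cuda.manual_seed(5), and the graphed ops are captured after re-seeding
# to the same value. Capture records kernels without running them, hence the
# assertNotEqual right after capture; after the same dummy prelude, replay()
# must reproduce the controls exactly, which verifies that the graph consumes
# and advances the Philox offset just as the eager calls would have.)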
+ size = 10000 + input = torch.rand((size,), device="cuda", dtype=torch.float) + alloc = torch.empty((size,), device="cuda", dtype=torch.float) + + # Torch ops to test with sample args (tuple) and kwargs (dict) + torch_with_args = (("bernoulli", (input.clone(),), {}), + # ("multinomial", (input.clone(), size, True), {}), + # ("multinomial", (input.clone(), size // 2, False), {}), + ("normal", (input.clone() + 1, input.clone()), {}), + ("poisson", (input.clone(),), {}), + ("rand", (size,), {"device": "cuda", "dtype": torch.float}), + ("randint", (0, 3, (size,)), {"device": "cuda", "dtype": torch.float}), + ("randn", (size,), {"device": "cuda", "dtype": torch.float}),) + + # Tensor methods to test with sample args (tuple) + tensor_with_args = (("bernoulli_", (input.clone(),)), + ("cauchy_", ()), + ("exponential_", ()), + ("geometric_", (0.3,)), + ("log_normal_", ()), + ("normal_", ()), + ("random_", ()), + ("uniform_", ()),) + + def run(module, op, args, kwargs): + torch.cuda.manual_seed(5) + + # Each path runs a dummy op to increment the state a bit before creating controls. + if (module == "torch"): + dummy = getattr(torch, op)(*args, **kwargs) + control1 = getattr(torch, op)(*args, **kwargs) + control2 = getattr(torch, op)(*args, **kwargs) + else: + dummy = alloc.clone() + control1 = alloc.clone() + control2 = alloc.clone() + getattr(dummy, op)(*args) + getattr(control1, op)(*args) + getattr(control2, op)(*args) + + stream = torch.cuda.Stream() + stream.wait_stream(torch.cuda.current_stream()) + with torch.cuda.stream(stream): + torch.cuda.manual_seed(5) + + g = torch.cuda._Graph() + if (module == "torch"): + g.capture_begin() + t1 = getattr(torch, op)(*args, **kwargs) + t2 = getattr(torch, op)(*args, **kwargs) + g.capture_end() + else: + t1 = alloc.clone() + t2 = alloc.clone() + g.capture_begin() + getattr(t1, op)(*args) + getattr(t2, op)(*args) + g.capture_end() + torch.cuda.current_stream().wait_stream(stream) + + try: + self.assertNotEqual(control1, t1) + self.assertNotEqual(control2, t2) + except Exception as e: + raise RuntimeError("Failed on " + module + "." + op) from e + + # Runs a dummy op prelude, as for controls, to make sure replay() + # picks up the dummy op's state increment. + if module == "torch": + dummy = getattr(torch, op)(*args, **kwargs) + else: + dummy = alloc.clone() + getattr(dummy, op)(*args) + + # Runs RNG ops that fill t1 and t2. + g.replay() + + try: + self.assertEqual(control1, t1) + self.assertEqual(control2, t2) + except Exception as e: + raise RuntimeError("Failed on " + module + "." + op) from e + + # We hold references to all tensors used across streams up til this sync, + # so no need to call record_stream on those tensors. 
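# (record_stream would otherwise be needed to tell the caching allocator that
# these tensors are still being used by work queued on the side stream.)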
+ torch.cuda.synchronize() + + for op_with_args in torch_with_args: + run("torch", *op_with_args) + + for meth_with_args in tensor_with_args: + # Adds an empty dict for kwargs, which none of the Tensor methods use + run("Tensor", *(meth_with_args + ({},))) + class TestCudaComm(TestCase): def _test_broadcast(self, input): @@ -3279,5 +3482,6 @@ class TestNamedTupleInput_1(NamedTuple): self.assertEqual(expected_a, x.a) self.assertEqual(expected_b, x.b) + if __name__ == '__main__': run_tests() diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index 6c9ad0d5d6e1..eca10839ae88 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -472,6 +472,7 @@ libtorch_python_cuda_core_sources = [ "torch/csrc/cuda/python_comm.cpp", "torch/csrc/cuda/Storage.cpp", "torch/csrc/cuda/Stream.cpp", + "torch/csrc/cuda/Graph.cpp", "torch/csrc/cuda/serialization.cpp", "torch/csrc/cuda/shared/cudart.cpp", "torch/csrc/cuda/shared/nvtx.cpp", diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in index a7f1f1b91c93..2a31552068a1 100644 --- a/torch/_C/__init__.pyi.in +++ b/torch/_C/__init__.pyi.in @@ -735,6 +735,10 @@ class _CudaEventBase: def synchronize(self) -> None: ... def ipc_handle(self) -> bytes: ... +# Defined in torch/csrc/cuda/Graph.cpp +class _CudaGraphBase: + ... + # Defined in torch/csrc/DataLoader.cpp def _set_worker_signal_handlers(*arg: Any) -> None: ... # THPModule_setWorkerSignalHandlers def _set_worker_pids(key: _int, child_pids: Tuple[_int, ...]) -> None: ... # THPModule_setWorkerPIDs diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp index b23ab81ada93..3795b6e4f914 100644 --- a/torch/csrc/Module.cpp +++ b/torch/csrc/Module.cpp @@ -646,6 +646,7 @@ bool THCPComplexFloatStorage_init(PyObject *module); void THCPStream_init(PyObject *module); void THCPEvent_init(PyObject *module); +void THCPGraph_init(PyObject *module); #ifdef USE_CUDA PyMethodDef* THCPModule_methods(); @@ -786,6 +787,7 @@ PyObject* initModule() { THCPStream_init(module); THCPEvent_init(module); + THCPGraph_init(module); #endif auto set_module_attr = [&](const char* name, PyObject* v, bool incref = true) { diff --git a/torch/csrc/cuda/Graph.cpp b/torch/csrc/cuda/Graph.cpp new file mode 100644 index 000000000000..b258f00bcf90 --- /dev/null +++ b/torch/csrc/cuda/Graph.cpp @@ -0,0 +1,46 @@ +#include + +#include + +#include +#include + +#include + +// Cargo culted partially from csrc/distributed/c10d/init.cpp +// and partially from csrc/cuda/Stream.cpp. +// THCPStream_init is also declared at global scope. + +// Because THCPGraph_init is forward declared in the only consumer (csrc/Module.cpp) +// I don't think we need a Graph.h. + +template +using shared_ptr_class_ = py::class_>; + +void THCPGraph_init(PyObject *module) { + // Pybind11 patch notes say "py::module_" is more up-to-date syntax, + // but CI linter and some builds prefer "module". + auto torch_C_m = py::handle(module).cast(); + + shared_ptr_class_<::at::cuda::CUDAGraph>(module, "_CudaGraphBase") + .def(py::init<>()) + .def("capture_begin", + &::at::cuda::CUDAGraph::capture_begin, + py::call_guard(), + R"(``capture_begin`` begins Cuda graph capture on the current stream.)") + .def("capture_end", + &::at::cuda::CUDAGraph::capture_end, + py::call_guard(), + R"(``capture_end`` ends Cuda graph capture on the current stream. 
+ After ``capture_end``, ``replay`` may be called on this instance.)") + .def("replay", + &::at::cuda::CUDAGraph::replay, + py::call_guard(), + R"(``replay`` replays the Cuda graph captured by this instance.)") + // reset is called in __del__ on the Python side + // (see class Graph in torch/cuda/streams.py for reasons and caveats) + .def("reset", + &::at::cuda::CUDAGraph::reset, + py::call_guard(), + R"(``reset`` deletes the graph currently held by this instance.)"); +} diff --git a/torch/cuda/__init__.py b/torch/cuda/__init__.py index 0850b535fe30..a4068ac6d7f3 100644 --- a/torch/cuda/__init__.py +++ b/torch/cuda/__init__.py @@ -16,7 +16,7 @@ import threading from typing import List, Optional, Tuple, Union from ._utils import _get_device_index, _dummy_type -from .streams import Stream, Event +from .streams import Stream, Event, _Graph from .. import device as _device import torch._C diff --git a/torch/cuda/streams.py b/torch/cuda/streams.py index 14345baf6abd..9c9c30a7ff29 100644 --- a/torch/cuda/streams.py +++ b/torch/cuda/streams.py @@ -8,6 +8,7 @@ # Define dummy base classes torch._C.__dict__['_CudaStreamBase'] = _dummy_type('_CudaStreamBase') torch._C.__dict__['_CudaEventBase'] = _dummy_type('_CudaEventBase') + torch._C.__dict__['_CudaGraphBase'] = _dummy_type('_CudaGraphBase') class Stream(torch._C._CudaStreamBase): r"""Wrapper around a CUDA stream. @@ -20,7 +21,7 @@ class Stream(torch._C._CudaStreamBase): device(torch.device or int, optional): a device on which to allocate the stream. If :attr:`device` is ``None`` (default) or a negative integer, this will use the current device. - priority(int, optional): priority of the stream. Can be either + priority(int, optional): priority of the stream. Can be either -1 (high priority) or 0 (low priority). By default, streams have priority 0. @@ -201,3 +202,5 @@ def __repr__(self): return ''.format(self._as_parameter_.value) else: return '' + +_Graph = torch._C._CudaGraphBase From be849ed1fdfeee3ac240cc6eb74105cc13a058fb Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Mon, 14 Dec 2020 11:15:55 -0800 Subject: [PATCH 228/250] [PyTorch] Make tls_local_dispatch_key_set inlineable (#49264) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49264 FLAGS_disable_variable_dispatch had to go, but it looks like the only user was some benchmarks anyway. ghstack-source-id: 118480532 Test Plan: Small (order of 0.1% improvement) on Internal benchmarks Reviewed By: smessmer Differential Revision: D25489030 fbshipit-source-id: 63147bae783e7a45391dd70d86730e48d3e0cafc --- c10/core/impl/LocalDispatchKeySet.cpp | 22 +--------------------- c10/core/impl/LocalDispatchKeySet.h | 16 +++++++++++++--- 2 files changed, 14 insertions(+), 24 deletions(-) diff --git a/c10/core/impl/LocalDispatchKeySet.cpp b/c10/core/impl/LocalDispatchKeySet.cpp index 358e6ef7e1f7..f984c40b39c0 100644 --- a/c10/core/impl/LocalDispatchKeySet.cpp +++ b/c10/core/impl/LocalDispatchKeySet.cpp @@ -5,10 +5,6 @@ namespace c10 { namespace impl { -C10_DEFINE_bool(disable_variable_dispatch, false, "This flag forcibly disables the Variable code paths from executing, which currently breaks profiling in the process."); - -namespace { - /// In the CAFFE2_FB_LIMITED_MOBILE_CAPABILITY build setting, /// thread_local is not supported. 
#ifndef CAFFE2_FB_LIMITED_MOBILE_CAPABILITY @@ -18,26 +14,10 @@ thread_local PODLocalDispatchKeySet raw_local_dispatch_key_set; #else // defined(CAFFE2_FB_LIMITED_MOBILE_CAPABILITY) -static PODLocalDispatchKeySet raw_local_dispatch_key_set; +PODLocalDispatchKeySet raw_local_dispatch_key_set; #endif -} // anonymous namespace - -LocalDispatchKeySet tls_local_dispatch_key_set() { - // Hack until variable performance is fixed - // - // ezyang: I'm pretty unhappy about this implementation, it looks wrong - // to me, as it seems to be performing a mutation on - // raw_local_dispatch_key_set. I can't conveniently test the correct - // version though... - if (FLAGS_disable_variable_dispatch) { - raw_local_dispatch_key_set.set_excluded( - raw_local_dispatch_key_set.excluded() | autograd_dispatch_keyset); - } - return raw_local_dispatch_key_set; -} - void _force_tls_local_dispatch_key_set(LocalDispatchKeySet key_set) { raw_local_dispatch_key_set = PODLocalDispatchKeySet { key_set.included_.raw_repr(), diff --git a/c10/core/impl/LocalDispatchKeySet.h b/c10/core/impl/LocalDispatchKeySet.h index 5262b1d4d6c0..7039272babf6 100644 --- a/c10/core/impl/LocalDispatchKeySet.h +++ b/c10/core/impl/LocalDispatchKeySet.h @@ -23,8 +23,6 @@ namespace c10 { namespace impl { -C10_DECLARE_bool(disable_variable_dispatch); - // POD version of LocalDispatchKeySet. Declared here just so that // we can put it in the guards. struct C10_API PODLocalDispatchKeySet { @@ -54,7 +52,19 @@ struct C10_API LocalDispatchKeySet { DispatchKeySet excluded_; }; -C10_API LocalDispatchKeySet tls_local_dispatch_key_set(); +/// In the CAFFE2_FB_LIMITED_MOBILE_CAPABILITY build setting, +/// thread_local is not supported. +#ifndef CAFFE2_FB_LIMITED_MOBILE_CAPABILITY + extern thread_local PODLocalDispatchKeySet raw_local_dispatch_key_set; +#else // defined(CAFFE2_FB_LIMITED_MOBILE_CAPABILITY) + extern PODLocalDispatchKeySet raw_local_dispatch_key_set; +#endif + +inline C10_API LocalDispatchKeySet tls_local_dispatch_key_set() { + // Don't let people fiddle with the thread_local directly just + // because they include this header. 
+ return raw_local_dispatch_key_set; +} // Internal, use ThreadLocalStateGuard C10_API void _force_tls_local_dispatch_key_set(LocalDispatchKeySet key_set); From 9e3c25ff1d3ecb611f79ddef4da4414ff6db772a Mon Sep 17 00:00:00 2001 From: Hector Yuen Date: Mon, 14 Dec 2020 11:46:02 -0800 Subject: [PATCH 229/250] sls + layernorm test (#43799) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/43799 Test Plan: https://www.internalfb.com/intern/testinfra/testconsole/testrun/3096224784866350/ Reviewed By: venkatacrc Differential Revision: D23383351 fbshipit-source-id: c312d481ad15bded83bea90beaaae7742d0c54b8 --- .../fakelowp/test/test_sls_8bit_nnpi_fp16.py | 141 ++++++++++++++++++ 1 file changed, 141 insertions(+) diff --git a/caffe2/contrib/fakelowp/test/test_sls_8bit_nnpi_fp16.py b/caffe2/contrib/fakelowp/test/test_sls_8bit_nnpi_fp16.py index c5aea77d7199..041dcce97dbf 100644 --- a/caffe2/contrib/fakelowp/test/test_sls_8bit_nnpi_fp16.py +++ b/caffe2/contrib/fakelowp/test/test_sls_8bit_nnpi_fp16.py @@ -418,6 +418,147 @@ def test_small_sls(self, seed): ) assert 0 + @given(seed=st.integers(0, 65535)) + @settings(deadline=datetime.timedelta(seconds=10)) + def test_sls_layernorm(self, seed): + np.random.seed(seed) + workspace.ResetWorkspace() + + n = 2 + DIM = 3 + data = 4 * (np.random.random_sample((n, DIM)) + 1).astype(np.float32) + + lengths = np.array([n], dtype=np.int32) + indices = np.array(range(n), dtype=np.int64) + weights = np.random.uniform(low=0.01, high=0.5, size=[n]).astype(np.float32) + + pred_net = caffe2_pb2.NetDef() + pred_net.name = "pred" + pred_net.external_input.extend( + ["quantized_data", "weights", "indices", "lengths"] + ) + pred_net.external_output.append("Y_norm") + pred_net.external_output.append("Y_mean") + pred_net.external_output.append("Y_std") + + pred_net.op.add().CopyFrom( + core.CreateOperator( + "SparseLengthsWeightedSumFused8BitRowwise", + ["quantized_data", "weights", "indices", "lengths"], + ["Y"], + ) + ) + + pred_net.op.add().CopyFrom( + core.CreateOperator( + "LayerNorm", + ["Y"], + ["Y_norm", "Y_mean", "Y_std"], + epsilon=1e-4, + ) + ) + + ref_net = caffe2_pb2.NetDef() + ref_net.name = "ref" + ref_net.external_input.extend( + ["quantized_data", "weights", "indices", "lengths"] + ) + ref_net.external_output.append("Y_norm") + ref_net.external_output.append("Y_mean") + ref_net.external_output.append("Y_std") + + ref_net.op.add().CopyFrom( + core.CreateOperator( + "SparseLengthsWeightedSumFused8BitRowwiseFakeFP16NNPI", + ["quantized_data", "weights", "indices", "lengths"], + ["Y"], + ) + ) + + ref_net.op.add().CopyFrom( + core.CreateOperator( + "LayerNormFakeFP16NNPI", + ["Y"], + ["Y_norm", "Y_mean", "Y_std"], + epsilon=1e-4, + axis=1, + elementwise_affine=False + ) + ) + + workspace.FeedBlob("data", data) + workspace.RunOperatorOnce( + core.CreateOperator( + "FloatToFused8BitRowwiseQuantized", ["data"], ["quantized_data"] + ) + ) + + quantized_data = workspace.FetchBlob("quantized_data") + + onnxified_net = onnxifi_caffe2_net( + pred_net, + {}, + max_batch_size=1, + max_seq_size=n, + debug=True, + adjust_batch=True, + use_onnx=False, + ) + print("before", pred_net) + print("after", onnxified_net) + workspace.FeedBlob("indices", indices) + workspace.FeedBlob("lengths", lengths) + workspace.FeedBlob("weights", weights) + + workspace.CreateNet(onnxified_net) + workspace.CreateNet(ref_net) + + workspace.RunNet(onnxified_net.name) + Y_glow = workspace.FetchBlob("Y_norm") + Y_mean_glow = workspace.FetchBlob("Y_mean") + Y_std_glow = 
workspace.FetchBlob("Y_std") + + workspace.RunNet(ref_net.name) + Y = workspace.FetchBlob("Y") + print("pre normalization", Y) + Y_ref = workspace.FetchBlob("Y_norm") + Y_mean_ref = workspace.FetchBlob("Y_mean") + Y_std_ref = workspace.FetchBlob("Y_std") + + # print(Y_ref, Y_glow) + # print(Y_ref.shape, Y_glow.shape) + + diff = np.abs(Y_ref - Y_glow) + max_err = np.max(diff, axis=1) + num_offenders = (max_err > 0).sum() + if num_offenders > 0: + np.set_printoptions(precision=12) + print( + "ref", + Y_ref.astype(np.float16).astype(np.float32), + "glow", + Y_glow.astype(np.float16).astype(np.float32), + ) + print_test_debug_info( + "slws_fused_8bit_rowwise_inv_scale", + { + "seed": seed, + "indices": indices, + "data": data, + "quantized_data": quantized_data, + "lengths": lengths, + "weights": weights, + "Y_norm_glow": Y_glow, + "Y_norm_ref": Y_ref, + "Y_mean_glow": Y_mean_glow, + "Y_std_glow": Y_std_glow, + "Y_mean_ref": Y_mean_ref, + "Y_std_ref": Y_std_ref, + "diff": diff, + "rowwise_diff": np.max(diff, axis=1), + }, + ) + assert 0 if __name__ == '__main__': From 6cfd7c38117ed48aaf85765f30f4c0f0c67ed37a Mon Sep 17 00:00:00 2001 From: Ralf Gommers Date: Mon, 14 Dec 2020 12:16:28 -0800 Subject: [PATCH 230/250] Remove type annotations from signatures in html docs (#49294) Summary: One unintended side effect of moving type annotations inline was that those annotations now show up in signatures in the html docs. This is more confusing and ugly than it is helpful. An example for `MaxPool1d`: ![image](https://user-images.githubusercontent.com/98330/102010280-77f86900-3d3d-11eb-8f83-e7ee0991ed92.png) This makes the docs readable again. The parameter descriptions often already have type information, and there will be many cases where the type annotations will make little sense to the user (e.g., returning typevar T, long unions). Change to `MaxPool1d` example: ![image](https://user-images.githubusercontent.com/98330/102010304-91011a00-3d3d-11eb-860d-ffa174b4d43b.png) Note that once we can build the docs with Sphinx 3 (which is far off right now), we have two options to make better use of the extra type info in the annotations (some of which is useful): - `autodoc_type_aliases`, so we can leave things like large unions unevaluated to keep things readable - `autodoc_typehints = 'description'`, which moves the annotations into the parameter descriptions. Another, more labour-intensive option, is what vadimkantorov suggested in gh-44964: show annotations on hover. Could also be done with some foldout, or other optional way to make things visible. Would be nice, but requires a Sphinx contribution or plugin first. Pull Request resolved: https://github.com/pytorch/pytorch/pull/49294 Reviewed By: glaringlee Differential Revision: D25535272 Pulled By: ezyang fbshipit-source-id: 5017abfea941a7ae8c4595a0d2bdf8ae8965f0c4 --- docs/source/conf.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index fe1e2260be72..610f6efa0840 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -161,7 +161,7 @@ # TODO: verify this works as expected release = 'master' -# Customized html_title here. +# Customized html_title here. 
# Default is " ".join(project, release, "documentation") if not set if RELEASE: # remove hash (start with 'a') from version number if any @@ -192,6 +192,9 @@ # Disable docstring inheritance autodoc_inherit_docstrings = False +# Disable displaying type annotations, these can be very verbose +autodoc_typehints = 'none' + # -- katex javascript in header # @@ -253,9 +256,9 @@ def setup(app): add_css(css_file) # From PyTorch 1.5, we now use autogenerated files to document classes and -# functions. This breaks older references since +# functions. This breaks older references since # https://docs.pytorch.org/torch.html#torch.flip -# moved to +# moved to # https://docs.pytorch.org/torch/generated/torchflip.html # which breaks older links from blog posts, stack overflow answers and more. # To mitigate that, we add an id="torch.flip" in an appropriated place @@ -278,7 +281,7 @@ def visit_reference(self, node): # to autogenerated content anchor = ref_anchor[1] txt = node.parent.astext() - if txt == anchor or txt == anchor.split('.')[-1]: + if txt == anchor or txt == anchor.split('.')[-1]: self.body.append('
<p id="{}"/>
'.format(ref_anchor[1])) return old_call(self, node) Klass.visit_reference = visit_reference From 4188c374ceaf644f4fdb0d8774853465da31ca4d Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Mon, 14 Dec 2020 12:19:11 -0800 Subject: [PATCH 231/250] Refactor: use version instead of major version in windows build (#49156) Summary: Fixes https://github.com/pytorch/pytorch/issues/49219 1. update version instead of major version for env var of CUDA_VERSION 2. update related scripts Pull Request resolved: https://github.com/pytorch/pytorch/pull/49156 Reviewed By: glaringlee Differential Revision: D25535530 Pulled By: ezyang fbshipit-source-id: 0712227f2b06b45ee68efc42717c4308fea1abdc --- .../cimodel/data/simple/util/versions.py | 3 ++ .../cimodel/data/windows_build_definitions.py | 3 +- .circleci/config.yml | 20 +++++------ .circleci/scripts/windows_cuda_install.sh | 10 +++--- .circleci/scripts/windows_cudnn_install.sh | 12 +++---- .../build-parameters/pytorch-build-params.yml | 2 +- .../job-specs/pytorch-job-specs.yml | 4 +-- .../win-test-helpers/build_pytorch.bat | 34 ++++++------------- .../installation-helpers/install_magma.bat | 8 ++--- .../win-test-helpers/setup_pytorch_env.bat | 30 +++------------- 10 files changed, 46 insertions(+), 80 deletions(-) diff --git a/.circleci/cimodel/data/simple/util/versions.py b/.circleci/cimodel/data/simple/util/versions.py index 3c9186df13aa..53d3a837248c 100644 --- a/.circleci/cimodel/data/simple/util/versions.py +++ b/.circleci/cimodel/data/simple/util/versions.py @@ -29,3 +29,6 @@ def __init__(self, major, minor): self.minor = minor super().__init__([self.major, self.minor], "cuda") + + def __str__(self): + return f"{self.major}.{self.minor}" diff --git a/.circleci/cimodel/data/windows_build_definitions.py b/.circleci/cimodel/data/windows_build_definitions.py index dea78411addb..c0e828eaab5e 100644 --- a/.circleci/cimodel/data/windows_build_definitions.py +++ b/.circleci/cimodel/data/windows_build_definitions.py @@ -86,10 +86,11 @@ def gen_tree(self): props_dict["executor"] = "windows-with-nvidia-gpu" props_dict["cuda_version"] = ( - miniutils.quote(str(self.cuda_version.major)) + miniutils.quote(str(self.cuda_version)) if self.cuda_version else "cpu" ) + props_dict["name"] = "_".join(name_parts) return [{key_name: props_dict}] diff --git a/.circleci/config.yml b/.circleci/config.yml index 8bdfb3c9c7bd..cdd66830986f 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -325,7 +325,7 @@ pytorch_windows_params: &pytorch_windows_params default: "" cuda_version: type: string - default: "10" + default: "10.1" python_version: type: string default: "3.6" @@ -675,7 +675,7 @@ jobs: default: "" cuda_version: type: string - default: "10" + default: "10.1" python_version: type: string default: "3.6" @@ -737,7 +737,7 @@ jobs: default: "" cuda_version: type: string - default: "10" + default: "10.1" python_version: type: string default: "3.6" @@ -8077,7 +8077,7 @@ workflows: - postnightly - pytorch_windows_build: build_environment: pytorch-win-vs2019-cuda10-cudnn7-py3 - cuda_version: "10" + cuda_version: "10.1" name: pytorch_windows_vs2019_py36_cuda10.1_build python_version: "3.6" use_cuda: "1" @@ -8086,7 +8086,7 @@ workflows: vc_year: "2019" - pytorch_windows_test: build_environment: pytorch-win-vs2019-cuda10-cudnn7-py3 - cuda_version: "10" + cuda_version: "10.1" executor: windows-with-nvidia-gpu name: pytorch_windows_vs2019_py36_cuda10.1_test1 python_version: "3.6" @@ -8099,7 +8099,7 @@ workflows: vc_year: "2019" - pytorch_windows_test: build_environment: 
pytorch-win-vs2019-cuda10-cudnn7-py3 - cuda_version: "10" + cuda_version: "10.1" executor: windows-with-nvidia-gpu name: pytorch_windows_vs2019_py36_cuda10.1_test2 python_version: "3.6" @@ -8112,7 +8112,7 @@ workflows: vc_year: "2019" - pytorch_windows_build: build_environment: pytorch-win-vs2019-cuda11-cudnn8-py3 - cuda_version: "11" + cuda_version: "11.1" name: pytorch_windows_vs2019_py36_cuda11.1_build python_version: "3.6" use_cuda: "1" @@ -8121,7 +8121,7 @@ workflows: vc_year: "2019" - pytorch_windows_test: build_environment: pytorch-win-vs2019-cuda11-cudnn8-py3 - cuda_version: "11" + cuda_version: "11.1" executor: windows-with-nvidia-gpu filters: branches: @@ -8140,7 +8140,7 @@ workflows: vc_year: "2019" - pytorch_windows_test: build_environment: pytorch-win-vs2019-cuda11-cudnn8-py3 - cuda_version: "11" + cuda_version: "11.1" executor: windows-with-nvidia-gpu filters: branches: @@ -8204,7 +8204,7 @@ workflows: vc_year: "2019" - pytorch_windows_test: build_environment: pytorch-win-vs2019-cuda10-cudnn7-py3 - cuda_version: "10" + cuda_version: "10.1" filters: branches: only: diff --git a/.circleci/scripts/windows_cuda_install.sh b/.circleci/scripts/windows_cuda_install.sh index 8d615b674aa0..e5d0923032e8 100644 --- a/.circleci/scripts/windows_cuda_install.sh +++ b/.circleci/scripts/windows_cuda_install.sh @@ -1,13 +1,11 @@ #!/bin/bash set -eux -o pipefail -if [[ "$CUDA_VERSION" == "10" ]]; then - cuda_complete_version="10.1" +if [[ "$CUDA_VERSION" =~ ^10.* ]]; then cuda_installer_name="cuda_10.1.243_426.00_win10" msbuild_project_dir="CUDAVisualStudioIntegration/extras/visual_studio_integration/MSBuildExtensions" cuda_install_packages="nvcc_10.1 cuobjdump_10.1 nvprune_10.1 cupti_10.1 cublas_10.1 cublas_dev_10.1 cudart_10.1 cufft_10.1 cufft_dev_10.1 curand_10.1 curand_dev_10.1 cusolver_10.1 cusolver_dev_10.1 cusparse_10.1 cusparse_dev_10.1 nvgraph_10.1 nvgraph_dev_10.1 npp_10.1 npp_dev_10.1 nvrtc_10.1 nvrtc_dev_10.1 nvml_dev_10.1" -elif [[ "$CUDA_VERSION" == "11" ]]; then - cuda_complete_version="11.1" +elif [[ "$CUDA_VERSION" =~ ^11.* ]]; then cuda_installer_name="cuda_11.1.0_456.43_win10" msbuild_project_dir="visual_studio_integration/CUDAVisualStudioIntegration/extras/visual_studio_integration/MSBuildExtensions" cuda_install_packages="nvcc_11.1 cuobjdump_11.1 nvprune_11.1 nvprof_11.1 cupti_11.1 cublas_11.1 cublas_dev_11.1 cudart_11.1 cufft_11.1 cufft_dev_11.1 curand_11.1 curand_dev_11.1 cusolver_11.1 cusolver_dev_11.1 cusparse_11.1 cusparse_dev_11.1 npp_11.1 npp_dev_11.1 nvrtc_11.1 nvrtc_dev_11.1 nvml_dev_11.1" @@ -16,7 +14,7 @@ else exit 1 fi -if [[ "${CUDA_VERSION}" != "10" && "${JOB_EXECUTOR}" == "windows-with-nvidia-gpu" ]]; then +if [[ "$CUDA_VERSION" =~ ^10.* && "${JOB_EXECUTOR}" == "windows-with-nvidia-gpu" ]]; then cuda_install_packages="${cuda_install_packages} Display.Driver" fi @@ -48,7 +46,7 @@ then export NVTOOLSEXT_PATH="C:\\Program Files\\NVIDIA Corporation\\NvToolsExt\\" fi -if ! ls "/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v${cuda_complete_version}/bin/nvcc.exe" +if ! 
ls "/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v${CUDA_VERSION}/bin/nvcc.exe" then echo "CUDA installation failed" mkdir -p /c/w/build-results diff --git a/.circleci/scripts/windows_cudnn_install.sh b/.circleci/scripts/windows_cudnn_install.sh index 529710af79b2..62f54615677e 100644 --- a/.circleci/scripts/windows_cudnn_install.sh +++ b/.circleci/scripts/windows_cudnn_install.sh @@ -1,12 +1,10 @@ #!/bin/bash set -eux -o pipefail -if [[ "$CUDA_VERSION" == "10" ]]; then - cuda_complete_version="10.1" - cudnn_installer_name="cudnn-10.1-windows10-x64-v7.6.4.38" -elif [[ "$CUDA_VERSION" == "11" ]]; then - cuda_complete_version="11.1" - cudnn_installer_name="cudnn-11.1-windows-x64-v8.0.5.39" +if [[ "$CUDA_VERSION" =~ ^10.* ]]; then + cudnn_installer_name="cudnn-${CUDA_VERSION}-windows10-x64-v7.6.4.38" +elif [[ "$CUDA_VERSION" =~ ^11.* ]]; then + cudnn_installer_name="cudnn-${CUDA_VERSION}-windows-x64-v8.0.5.39" else echo "CUDNN for CUDA_VERSION $CUDA_VERSION is not supported yet" exit 1 @@ -16,6 +14,6 @@ cudnn_installer_link="https://ossci-windows.s3.amazonaws.com/${cudnn_installer_n curl --retry 3 -O $cudnn_installer_link 7z x ${cudnn_installer_name}.zip -ocudnn -cp -r cudnn/cuda/* "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v${cuda_complete_version}/" +cp -r cudnn/cuda/* "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v${CUDA_VERSION}/" rm -rf cudnn rm -f ${cudnn_installer_name}.zip diff --git a/.circleci/verbatim-sources/build-parameters/pytorch-build-params.yml b/.circleci/verbatim-sources/build-parameters/pytorch-build-params.yml index e031e01ba846..c912a4fb690b 100644 --- a/.circleci/verbatim-sources/build-parameters/pytorch-build-params.yml +++ b/.circleci/verbatim-sources/build-parameters/pytorch-build-params.yml @@ -59,7 +59,7 @@ pytorch_windows_params: &pytorch_windows_params default: "" cuda_version: type: string - default: "10" + default: "10.1" python_version: type: string default: "3.6" diff --git a/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml b/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml index 8d8036ea9523..aa0e2d2c5581 100644 --- a/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml +++ b/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml @@ -237,7 +237,7 @@ jobs: default: "" cuda_version: type: string - default: "10" + default: "10.1" python_version: type: string default: "3.6" @@ -299,7 +299,7 @@ jobs: default: "" cuda_version: type: string - default: "10" + default: "10.1" python_version: type: string default: "3.6" diff --git a/.jenkins/pytorch/win-test-helpers/build_pytorch.bat b/.jenkins/pytorch/win-test-helpers/build_pytorch.bat index f41e5f7fcd1b..7165f75a0e41 100644 --- a/.jenkins/pytorch/win-test-helpers/build_pytorch.bat +++ b/.jenkins/pytorch/win-test-helpers/build_pytorch.bat @@ -37,33 +37,19 @@ if "%VC_VERSION%" == "" ( @echo on popd -if "%CUDA_VERSION%" == "9" goto cuda_build_9 -if "%CUDA_VERSION%" == "10" goto cuda_build_10 -if "%CUDA_VERSION%" == "11" goto cuda_build_11 -goto cuda_build_end +if not "%USE_CUDA%"=="1" goto cuda_build_end -:cuda_build_9 +set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION% -set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.2 -set CUDA_PATH_V9_2=%CUDA_PATH% +rem version transformer, for example 10.1 to 10_1. 
+set VERSION_SUFFIX=%CUDA_VERSION:.=_% +set CUDA_PATH_V%VERSION_SUFFIX%=%CUDA_PATH% -goto cuda_build_common - -:cuda_build_10 - -set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1 -set CUDA_PATH_V10_1=%CUDA_PATH% - -goto cuda_build_common - -:cuda_build_11 - -set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1 -set CUDA_PATH_V11_1=%CUDA_PATH% - -goto cuda_build_common - -:cuda_build_common +set CUDNN_LIB_DIR=%CUDA_PATH%\lib\x64 +set CUDA_TOOLKIT_ROOT_DIR=%CUDA_PATH% +set CUDNN_ROOT_DIR=%CUDA_PATH% +set NVTOOLSEXT_PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt +set PATH=%CUDA_PATH%\bin;%CUDA_PATH%\libnvvp;%PATH% set CUDNN_LIB_DIR=%CUDA_PATH%\lib\x64 set CUDA_TOOLKIT_ROOT_DIR=%CUDA_PATH% diff --git a/.jenkins/pytorch/win-test-helpers/installation-helpers/install_magma.bat b/.jenkins/pytorch/win-test-helpers/installation-helpers/install_magma.bat index d4821c1b1a8d..ab102a0ea423 100644 --- a/.jenkins/pytorch/win-test-helpers/installation-helpers/install_magma.bat +++ b/.jenkins/pytorch/win-test-helpers/installation-helpers/install_magma.bat @@ -1,9 +1,9 @@ -if "%CUDA_VERSION%" == "9" set CUDA_SUFFIX=cuda92 -if "%CUDA_VERSION%" == "10" set CUDA_SUFFIX=cuda101 -if "%CUDA_VERSION%" == "11" set CUDA_SUFFIX=cuda110 +rem remove dot in cuda_version, fox example 11.1 to 111 +set VERSION_SUFFIX=%CUDA_VERSION:.=% +set CUDA_SUFFIX=cuda%VERSION_SUFFIX% if "%CUDA_SUFFIX%" == "" ( - echo unknown CUDA version, please set `CUDA_VERSION` to 9, 10 or 11. + echo unknown CUDA version, please set `CUDA_VERSION` higher than 9.2 exit /b 1 ) diff --git a/.jenkins/pytorch/win-test-helpers/setup_pytorch_env.bat b/.jenkins/pytorch/win-test-helpers/setup_pytorch_env.bat index e3625ae75e9e..a052a1b67d59 100644 --- a/.jenkins/pytorch/win-test-helpers/setup_pytorch_env.bat +++ b/.jenkins/pytorch/win-test-helpers/setup_pytorch_env.bat @@ -46,33 +46,13 @@ if %errorlevel% neq 0 ( exit /b %errorlevel% ) set DISTUTILS_USE_SDK=1 -if "%CUDA_VERSION%" == "9" goto cuda_build_9 -if "%CUDA_VERSION%" == "10" goto cuda_build_10 -if "%CUDA_VERSION%" == "11" goto cuda_build_11 -goto cuda_build_end +if not "%USE_CUDA%"=="1" goto cuda_build_end -:cuda_build_9 +set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION% -set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.2 -set CUDA_PATH_V9_2=%CUDA_PATH% - -goto cuda_build_common - -:cuda_build_10 - -set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1 -set CUDA_PATH_V10_1=%CUDA_PATH% - -goto cuda_build_common - -:cuda_build_11 - -set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1 -set CUDA_PATH_V11_1=%CUDA_PATH% - -goto cuda_build_common - -:cuda_build_common +rem version transformer, for example 10.1 to 10_1. 
+set VERSION_SUFFIX=%CUDA_VERSION:.=_% +set CUDA_PATH_V%VERSION_SUFFIX%=%CUDA_PATH% set CUDNN_LIB_DIR=%CUDA_PATH%\lib\x64 set CUDA_TOOLKIT_ROOT_DIR=%CUDA_PATH% From 6820745e28e6c777d35a2b874b3e5b627e58c3a0 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Mon, 14 Dec 2020 12:43:59 -0800 Subject: [PATCH 232/250] Revert D25489030: [PyTorch] Make tls_local_dispatch_key_set inlineable Test Plan: revert-hammer Differential Revision: D25489030 (https://github.com/pytorch/pytorch/commit/be849ed1fdfeee3ac240cc6eb74105cc13a058fb) Original commit changeset: 63147bae783e fbshipit-source-id: 6ce564979078f28ca9b7c80bc89ef492a2993806 --- c10/core/impl/LocalDispatchKeySet.cpp | 22 +++++++++++++++++++++- c10/core/impl/LocalDispatchKeySet.h | 16 +++------------- 2 files changed, 24 insertions(+), 14 deletions(-) diff --git a/c10/core/impl/LocalDispatchKeySet.cpp b/c10/core/impl/LocalDispatchKeySet.cpp index f984c40b39c0..358e6ef7e1f7 100644 --- a/c10/core/impl/LocalDispatchKeySet.cpp +++ b/c10/core/impl/LocalDispatchKeySet.cpp @@ -5,6 +5,10 @@ namespace c10 { namespace impl { +C10_DEFINE_bool(disable_variable_dispatch, false, "This flag forcibly disables the Variable code paths from executing, which currently breaks profiling in the process."); + +namespace { + /// In the CAFFE2_FB_LIMITED_MOBILE_CAPABILITY build setting, /// thread_local is not supported. #ifndef CAFFE2_FB_LIMITED_MOBILE_CAPABILITY @@ -14,10 +18,26 @@ thread_local PODLocalDispatchKeySet raw_local_dispatch_key_set; #else // defined(CAFFE2_FB_LIMITED_MOBILE_CAPABILITY) -PODLocalDispatchKeySet raw_local_dispatch_key_set; +static PODLocalDispatchKeySet raw_local_dispatch_key_set; #endif +} // anonymous namespace + +LocalDispatchKeySet tls_local_dispatch_key_set() { + // Hack until variable performance is fixed + // + // ezyang: I'm pretty unhappy about this implementation, it looks wrong + // to me, as it seems to be performing a mutation on + // raw_local_dispatch_key_set. I can't conveniently test the correct + // version though... + if (FLAGS_disable_variable_dispatch) { + raw_local_dispatch_key_set.set_excluded( + raw_local_dispatch_key_set.excluded() | autograd_dispatch_keyset); + } + return raw_local_dispatch_key_set; +} + void _force_tls_local_dispatch_key_set(LocalDispatchKeySet key_set) { raw_local_dispatch_key_set = PODLocalDispatchKeySet { key_set.included_.raw_repr(), diff --git a/c10/core/impl/LocalDispatchKeySet.h b/c10/core/impl/LocalDispatchKeySet.h index 7039272babf6..5262b1d4d6c0 100644 --- a/c10/core/impl/LocalDispatchKeySet.h +++ b/c10/core/impl/LocalDispatchKeySet.h @@ -23,6 +23,8 @@ namespace c10 { namespace impl { +C10_DECLARE_bool(disable_variable_dispatch); + // POD version of LocalDispatchKeySet. Declared here just so that // we can put it in the guards. struct C10_API PODLocalDispatchKeySet { @@ -52,19 +54,7 @@ struct C10_API LocalDispatchKeySet { DispatchKeySet excluded_; }; -/// In the CAFFE2_FB_LIMITED_MOBILE_CAPABILITY build setting, -/// thread_local is not supported. -#ifndef CAFFE2_FB_LIMITED_MOBILE_CAPABILITY - extern thread_local PODLocalDispatchKeySet raw_local_dispatch_key_set; -#else // defined(CAFFE2_FB_LIMITED_MOBILE_CAPABILITY) - extern PODLocalDispatchKeySet raw_local_dispatch_key_set; -#endif - -inline C10_API LocalDispatchKeySet tls_local_dispatch_key_set() { - // Don't let people fiddle with the thread_local directly just - // because they include this header. 
- return raw_local_dispatch_key_set; -} +C10_API LocalDispatchKeySet tls_local_dispatch_key_set(); // Internal, use ThreadLocalStateGuard C10_API void _force_tls_local_dispatch_key_set(LocalDispatchKeySet key_set); From 86cf1e135879238a9fd8d00f06e5096e18f534ba Mon Sep 17 00:00:00 2001 From: "Jane (Yuan) Xu" <31798555+janeyx99@users.noreply.github.com> Date: Mon, 14 Dec 2020 13:18:03 -0800 Subject: [PATCH 233/250] Add another way to verify ccache in CONTRIBUTING.md (#49337) Summary: In the case people are confused how to make sure ccache is working, I added another sentence in the documentation for how to check that the symlinks are correctly set up in addition to waiting for 2 clean builds of PyTorch. Pull Request resolved: https://github.com/pytorch/pytorch/pull/49337 Reviewed By: walterddr Differential Revision: D25535659 Pulled By: janeyx99 fbshipit-source-id: 435696255f517c074dd0d9f96534d22b60f795b2 --- CONTRIBUTING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f2981c0dbb37..51e9d1382808 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -674,7 +674,7 @@ ccache -M 25Gi ``` To check this is working, do two clean builds of pytorch in a row. The second -build should be substantially and noticeably faster than the first build. +build should be substantially and noticeably faster than the first build. If this doesn't seem to be the case, check that each of the symlinks above actually link to your installation of `ccache`. For example, if you followed the first option and installed `ccache` from source on a Linux machine, running `readlink -e $(which g++)` should return `~/ccache/bin/ccache`. #### Use a faster linker From d5a971e193c1f8ab83861c3ea258ddeb57d89f0c Mon Sep 17 00:00:00 2001 From: Amogh Akshintala Date: Mon, 14 Dec 2020 13:45:00 -0800 Subject: [PATCH 234/250] Add kernel launch checks in caffe2/aten/src/ATen/native/cuda/ (#49269) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49269 Added C10_CUDA_KERNEL_LAUNCH_CHECK(); after all kernel launches in caffe2/aten/src/ATen/native/cuda. Several files in the directory still trigger the check_kernel_launches.py tool. These are false positives as the tool doesn't seem to be parsing MACROS correctly. Normalization.cuh <- This file is also highlighted by the check_kernel_launches.py tool, but the highlighted regions are device code where exception handling isn't allowed. 
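For reference, the pattern this diff applies is simply to follow every `<<<...>>>` launch with the check macro, so a failed launch is reported at the launch site instead of at some later synchronization point. A minimal sketch of that pattern (the toy kernel and wrapper below are made up; only the macro usage mirrors this diff, and it is assumed to come from c10/cuda/CUDAException.h):

```
#include <c10/cuda/CUDAException.h>
#include <cuda_runtime.h>

// Hypothetical toy kernel, used only to illustrate the launch-check pattern.
__global__ void fill_ones_kernel(float* out, int64_t n) {
  const int64_t i = blockIdx.x * static_cast<int64_t>(blockDim.x) + threadIdx.x;
  if (i < n) {
    out[i] = 1.0f;
  }
}

void launch_fill_ones(float* out, int64_t n, cudaStream_t stream) {
  constexpr int threads = 256;
  const int blocks = static_cast<int>((n + threads - 1) / threads);
  fill_ones_kernel<<<blocks, threads, 0, stream>>>(out, n);
  // The check added throughout this diff: it inspects the launch status
  // immediately, so configuration errors surface here rather than later.
  C10_CUDA_KERNEL_LAUNCH_CHECK();
}
```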
Test Plan: Check that the code still builds with ``` buck build //caffe2/aten:ATen-cu ``` https://pxl.cl/1tLRB Also ran ``` buck test //caffe2/aten:atest ``` https://pxl.cl/1tLSw Reviewed By: r-barnes Differential Revision: D25487597 fbshipit-source-id: 7a6689534f7ff85a5d2262831bf6918f1fe0b745 --- .../ATen/native/cuda/PersistentSoftmax.cuh | 126 +++++------------- aten/src/ATen/native/cuda/Repeat.cu | 1 + aten/src/ATen/native/cuda/SpectralOps.cu | 1 + aten/src/ATen/native/cuda/TensorFactories.cu | 2 + 4 files changed, 41 insertions(+), 89 deletions(-) diff --git a/aten/src/ATen/native/cuda/PersistentSoftmax.cuh b/aten/src/ATen/native/cuda/PersistentSoftmax.cuh index 8265c5999376..051583a12a53 100644 --- a/aten/src/ATen/native/cuda/PersistentSoftmax.cuh +++ b/aten/src/ATen/native/cuda/PersistentSoftmax.cuh @@ -258,50 +258,24 @@ void dispatch_softmax_forward(output_t *dst, const input_t *src, int softmax_ele dim3 threads(warp_size, warps_per_block, 1); // Launch code would be more elegant if C++ supported FOR CONSTEXPR switch (log2_elements) { - case 0: // 1 - softmax_warp_forward - <<>>(dst, src, batch_count, softmax_elements_stride, softmax_elements); - break; - case 1: // 2 - softmax_warp_forward - <<>>(dst, src, batch_count, softmax_elements_stride, softmax_elements); - break; - case 2: // 4 - softmax_warp_forward - <<>>(dst, src, batch_count, softmax_elements_stride, softmax_elements); - break; - case 3: // 8 - softmax_warp_forward - <<>>(dst, src, batch_count, softmax_elements_stride, softmax_elements); - break; - case 4: // 16 - softmax_warp_forward - <<>>(dst, src, batch_count, softmax_elements_stride, softmax_elements); - break; - case 5: // 32 - softmax_warp_forward - <<>>(dst, src, batch_count, softmax_elements_stride, softmax_elements); - break; - case 6: // 64 - softmax_warp_forward - <<>>(dst, src, batch_count, softmax_elements_stride, softmax_elements); - break; - case 7: // 128 - softmax_warp_forward - <<>>(dst, src, batch_count, softmax_elements_stride, softmax_elements); - break; - case 8: // 256 - softmax_warp_forward - <<>>(dst, src, batch_count, softmax_elements_stride, softmax_elements); - break; - case 9: // 512 - softmax_warp_forward - <<>>(dst, src, batch_count, softmax_elements_stride, softmax_elements); - break; - case 10: // 1024 - softmax_warp_forward - <<>>(dst, src, batch_count, softmax_elements_stride, softmax_elements); - break; + #define LAUNCH_SOFTMAX_WARP_FORWARD(L2E) case L2E: \ + softmax_warp_forward \ + <<>>(dst, \ + src, batch_count, softmax_elements_stride, softmax_elements); \ + C10_CUDA_KERNEL_LAUNCH_CHECK(); \ + break; + + LAUNCH_SOFTMAX_WARP_FORWARD(0); // 1 + LAUNCH_SOFTMAX_WARP_FORWARD(1); // 2 + LAUNCH_SOFTMAX_WARP_FORWARD(2); // 4 + LAUNCH_SOFTMAX_WARP_FORWARD(3); // 8 + LAUNCH_SOFTMAX_WARP_FORWARD(4); // 16 + LAUNCH_SOFTMAX_WARP_FORWARD(5); // 32 + LAUNCH_SOFTMAX_WARP_FORWARD(6); // 64 + LAUNCH_SOFTMAX_WARP_FORWARD(7); // 128 + LAUNCH_SOFTMAX_WARP_FORWARD(8); // 256 + LAUNCH_SOFTMAX_WARP_FORWARD(9); // 512 + LAUNCH_SOFTMAX_WARP_FORWARD(10); ; // 1024 default: break; } @@ -333,53 +307,27 @@ void dispatch_softmax_backward(output_t *grad_input, const input_t *grad, const dim3 threads(warp_size, warps_per_block, 1); // Launch code would be more elegant if C++ supported FOR CONSTEXPR switch (log2_elements) { - case 0: // 1 - softmax_warp_backward - <<>>(grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); - break; - case 1: // 2 - softmax_warp_backward - <<>>(grad_input, grad, output, batch_count, 
softmax_elements_stride, softmax_elements); - break; - case 2: // 4 - softmax_warp_backward - <<>>(grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); - break; - case 3: // 8 - softmax_warp_backward - <<>>(grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); - break; - case 4: // 16 - softmax_warp_backward - <<>>(grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); - break; - case 5: // 32 - softmax_warp_backward - <<>>(grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); - break; - case 6: // 64 - softmax_warp_backward - <<>>(grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); - break; - case 7: // 128 - softmax_warp_backward - <<>>(grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); - break; - case 8: // 256 - softmax_warp_backward - <<>>(grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); - break; - case 9: // 512 - softmax_warp_backward - <<>>(grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); - break; - case 10: // 1024 - softmax_warp_backward - <<>>(grad_input, grad, output, batch_count, softmax_elements_stride, softmax_elements); - break; + #define LAUNCH_SOFTMAX_WARP_BACKWARD(L2E) case L2E: \ + softmax_warp_backward \ + <<>> \ + (grad_input, grad, output, batch_count, softmax_elements_stride, \ + softmax_elements); \ + C10_CUDA_KERNEL_LAUNCH_CHECK(); \ + break; + + LAUNCH_SOFTMAX_WARP_BACKWARD(0); // 1 + LAUNCH_SOFTMAX_WARP_BACKWARD(1); // 2 + LAUNCH_SOFTMAX_WARP_BACKWARD(2); // 4 + LAUNCH_SOFTMAX_WARP_BACKWARD(3); // 8 + LAUNCH_SOFTMAX_WARP_BACKWARD(4); // 16 + LAUNCH_SOFTMAX_WARP_BACKWARD(5); // 32 + LAUNCH_SOFTMAX_WARP_BACKWARD(6); // 64 + LAUNCH_SOFTMAX_WARP_BACKWARD(7); // 128 + LAUNCH_SOFTMAX_WARP_BACKWARD(8); // 256 + LAUNCH_SOFTMAX_WARP_BACKWARD(9); // 512 + LAUNCH_SOFTMAX_WARP_BACKWARD(10); // 1024 default: break; } } } - diff --git a/aten/src/ATen/native/cuda/Repeat.cu b/aten/src/ATen/native/cuda/Repeat.cu index f70459928bf0..8437e80ebb48 100644 --- a/aten/src/ATen/native/cuda/Repeat.cu +++ b/aten/src/ATen/native/cuda/Repeat.cu @@ -23,6 +23,7 @@ static void compute_cuda(int64_t *repeat_ptr, int64_t *cumsum_ptr, int64_t *resu int64_t grid = std::min((size + warps_per_block - 1) / warps_per_block, 2048L); compute_cuda_kernel<<>>(repeat_ptr, cumsum_ptr, result_ptr, size); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } namespace at { namespace native { diff --git a/aten/src/ATen/native/cuda/SpectralOps.cu b/aten/src/ATen/native/cuda/SpectralOps.cu index 3ad0c06c69fc..db3e853a9321 100644 --- a/aten/src/ATen/native/cuda/SpectralOps.cu +++ b/aten/src/ATen/native/cuda/SpectralOps.cu @@ -125,6 +125,7 @@ void _fft_fill_with_conjugate_symmetry_cuda_( static_cast(in_data), input_offset_calculator, output_offset_calculator); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); } diff --git a/aten/src/ATen/native/cuda/TensorFactories.cu b/aten/src/ATen/native/cuda/TensorFactories.cu index a241f7df533c..effeef69f0cf 100644 --- a/aten/src/ATen/native/cuda/TensorFactories.cu +++ b/aten/src/ATen/native/cuda/TensorFactories.cu @@ -357,6 +357,7 @@ Tensor tril_indices_cuda( col, tril_size - rectangle_size, tril_size); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); } @@ -434,6 +435,7 @@ Tensor triu_indices_cuda( col, rectangle_size, triu_size); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); } From 1e2d1d7242ac28d2cf71c8a5d4a91be43b0acd90 Mon Sep 17 00:00:00 2001 From: vikigenius Date: Mon, 14 
Dec 2020 14:14:28 -0800 Subject: [PATCH 235/250] Fixed cat transform to work with event_dim > 0 (#49111) Summary: Fixes https://github.com/pytorch/pytorch/issues/44530 As explained in the issue description, CatTransform does not work with event_dim > 0. This PR fixes this. If this gets approved I am hoping to do the same for StackTransform as well. fritzo Can you take a look at this ? Pull Request resolved: https://github.com/pytorch/pytorch/pull/49111 Reviewed By: neerajprad Differential Revision: D25526005 Pulled By: ezyang fbshipit-source-id: e14430093f550d5e0da7a311f9cd44796807830f --- test/distributions/test_distributions.py | 16 ++++++++++++++++ torch/distributions/transforms.py | 16 ++++++++++++++-- 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/test/distributions/test_distributions.py b/test/distributions/test_distributions.py index 67a66be19d84..b057d12a285d 100644 --- a/test/distributions/test_distributions.py +++ b/test/distributions/test_distributions.py @@ -4401,6 +4401,22 @@ def test_cat_transform_non_uniform(self): t2.log_abs_det_jacobian(x2, y2)], dim=dim) self.assertEqual(actual_jac, expected_jac) + def test_cat_event_dim(self): + t1 = AffineTransform(0, 2 * torch.ones(2), event_dim=1) + t2 = AffineTransform(0, 2 * torch.ones(2), event_dim=1) + dim = 1 + bs = 16 + x1 = torch.randn(bs, 2) + x2 = torch.randn(bs, 2) + x = torch.cat([x1, x2], dim=1) + t = CatTransform([t1, t2], dim=dim, lengths=[2, 2]) + y1 = t1(x1) + y2 = t2(x2) + y = t(x) + actual_jac = t.log_abs_det_jacobian(x, y) + expected_jac = sum([t1.log_abs_det_jacobian(x1, y1), + t2.log_abs_det_jacobian(x2, y2)]) + def test_stack_transform(self): x1 = -1 * torch.arange(1, 101, dtype=torch.float) x2 = (torch.arange(1, 101, dtype=torch.float) - 1) / 100 diff --git a/torch/distributions/transforms.py b/torch/distributions/transforms.py index a0412d52df0d..4181db799b28 100644 --- a/torch/distributions/transforms.py +++ b/torch/distributions/transforms.py @@ -733,6 +733,7 @@ class CatTransform(Transform): """ def __init__(self, tseq, dim=0, lengths=None, cache_size=0): assert all(isinstance(t, Transform) for t in tseq) + self.event_dim = max(t.event_dim for t in tseq) if cache_size: tseq = [t.with_cache(cache_size) for t in tseq] super(CatTransform, self).__init__(cache_size=cache_size) @@ -784,9 +785,20 @@ def log_abs_det_jacobian(self, x, y): for trans, length in zip(self.transforms, self.lengths): xslice = x.narrow(self.dim, start, length) yslice = y.narrow(self.dim, start, length) - logdetjacs.append(trans.log_abs_det_jacobian(xslice, yslice)) + logdetjac = trans.log_abs_det_jacobian(xslice, yslice) + if trans.event_dim < self.event_dim: + logdetjac = _sum_rightmost(logdetjac, self.event_dim - trans.event_dim) + logdetjacs.append(logdetjac) start = start + length # avoid += for jit compat - return torch.cat(logdetjacs, dim=self.dim) + # Decide whether to concatenate or sum. + dim = self.dim + if dim >= 0: + dim = dim - x.dim() + dim = dim + self.event_dim + if dim < 0: + return torch.cat(logdetjacs, dim=dim) + else: + return sum(logdetjacs) @property def bijective(self): From 3a943e9f824f196b90f72fdfdf62aeb64d8a1930 Mon Sep 17 00:00:00 2001 From: Chester Liu Date: Mon, 14 Dec 2020 14:21:32 -0800 Subject: [PATCH 236/250] Use Unicode friendly API on Win32 in THAllocator (#47905) Summary: This replaces the narrow character set APIs with the wide character set ones in `THAllocator.cpp`. This fixes the potential crashes caused by passing non-ASCII characters in `torch::from_file` on Windows. 
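The core of the change is converting the UTF-8 `std::string` path to UTF-16 with the new `c10::u8u16` helper (added in `c10/util/Unicode.h` by this diff) before calling the `W`-suffixed Win32 APIs. A condensed, Windows-only sketch of that pattern (the wrapper name and example path are made up; the flags match the ones used in THAllocator):

```
#include <c10/util/Unicode.h>
#include <windows.h>
#include <string>

// Sketch: open a file whose path may contain non-ASCII characters
// (e.g. "D:/数据/tensor.bin") by converting UTF-8 -> UTF-16 first.
HANDLE open_for_mapping(const std::string& utf8_path) {
  const std::wstring wpath = c10::u8u16(utf8_path);
  return CreateFileW(
      wpath.c_str(),
      GENERIC_READ | GENERIC_WRITE,
      FILE_SHARE_WRITE | FILE_SHARE_READ,
      /*lpSecurityAttributes=*/nullptr,
      OPEN_ALWAYS,
      FILE_ATTRIBUTE_NORMAL,
      /*hTemplateFile=*/nullptr);
}
```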
See: https://github.com/pytorch/pytorch/issues/47422 Pull Request resolved: https://github.com/pytorch/pytorch/pull/47905 Reviewed By: zhangguanheng66 Differential Revision: D25399146 Pulled By: ezyang fbshipit-source-id: 0a183b65de171c48ed1718fa71e773224eaf196f --- .jenkins/pytorch/test.sh | 2 ++ aten/src/TH/THAllocator.cpp | 37 +++++++++++++--------- c10/util/Unicode.h | 29 +++++++++++++++++ test/test_torch.py | 42 ++++++++++++++++++------- torch/testing/_internal/common_utils.py | 26 ++++++++++++--- 5 files changed, 106 insertions(+), 30 deletions(-) create mode 100644 c10/util/Unicode.h diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh index 1f1f174e992e..8e9afd5c9bc3 100755 --- a/.jenkins/pytorch/test.sh +++ b/.jenkins/pytorch/test.sh @@ -11,6 +11,8 @@ source "$(dirname "${BASH_SOURCE[0]}")/common.sh" echo "Testing pytorch" +export LANG=C.UTF-8 + if [[ "$BUILD_ENVIRONMENT" == *-slow-* ]]; then export PYTORCH_TEST_WITH_SLOW=1 export PYTORCH_TEST_SKIP_FAST=1 diff --git a/aten/src/TH/THAllocator.cpp b/aten/src/TH/THAllocator.cpp index 55b42d2f9d27..53b67a17032f 100644 --- a/aten/src/TH/THAllocator.cpp +++ b/aten/src/TH/THAllocator.cpp @@ -6,6 +6,7 @@ #endif #include +#include /* stuff for mapped files */ #ifdef _WIN32 @@ -74,24 +75,26 @@ THMapAllocator::THMapAllocator(WithFd, const char *filename, int fd, int flags, #ifdef _WIN32 if (flags_ & TH_ALLOCATOR_MAPPED_SHAREDMEM) { // Shadowing - const char *filename; - const char *eventname; + const wchar_t *filename; + const wchar_t *eventname; + const std::wstring wFilename = c10::u8u16(filename_); + const std::wstring wEventname = c10::u8u16(eventname_); LARGE_INTEGER hfilesz; if (filename_[0] == '/') { - filename = filename_.c_str() + 1; - eventname = eventname_.c_str() + 1; + filename = wFilename.c_str() + 1; + eventname = wEventname.c_str() + 1; } else { - filename = filename_.c_str(); - eventname = eventname_.c_str(); + filename = wFilename.c_str(); + eventname = wEventname.c_str(); } hfilesz.QuadPart = size; if (flags_ & TH_ALLOCATOR_MAPPED_EXCLUSIVE) { - event_ = CreateEvent(nullptr, FALSE, FALSE, eventname); + event_ = CreateEventW(nullptr, FALSE, FALSE, eventname); } else if (flags_ & TH_ALLOCATOR_MAPPED_NOCREATE) { - event_ = OpenEvent(EVENT_ALL_ACCESS, FALSE, eventname); + event_ = OpenEventW(EVENT_ALL_ACCESS, FALSE, eventname); } else { AT_ERROR("Expected either TH_ALLOCATOR_MAPPED_EXCLUSIVE or TH_ALLOCATOR_MAPPED_NOCREATE"); } @@ -101,9 +104,9 @@ THMapAllocator::THMapAllocator(WithFd, const char *filename, int fd, int flags, } if (flags_ & TH_ALLOCATOR_MAPPED_EXCLUSIVE) { - handle_ = CreateFileMapping(INVALID_HANDLE_VALUE, nullptr, PAGE_READWRITE, hfilesz.HighPart, hfilesz.LowPart, filename); + handle_ = CreateFileMappingW(INVALID_HANDLE_VALUE, nullptr, PAGE_READWRITE, hfilesz.HighPart, hfilesz.LowPart, filename); } else if (flags_ & TH_ALLOCATOR_MAPPED_NOCREATE) { - handle_ = OpenFileMapping(FILE_MAP_ALL_ACCESS, FALSE, filename); + handle_ = OpenFileMappingW(FILE_MAP_ALL_ACCESS, FALSE, filename); } else { AT_ERROR("Expected either TH_ALLOCATOR_MAPPED_EXCLUSIVE or TH_ALLOCATOR_MAPPED_NOCREATE"); } @@ -136,15 +139,21 @@ THMapAllocator::THMapAllocator(WithFd, const char *filename, int fd, int flags, AT_ERROR("TH_ALLOCATOR_MAPPED_FROMFD not supported on Windows"); } + // Shadowing + const wchar_t *filename; + const std::wstring wFilename = c10::u8u16(filename_); + + filename = wFilename.c_str(); + /* open file */ /* FILE_FLAG_RANDOM_ACCESS ? 
*/ if (flags_) { - hfile = CreateFileA(filename_.c_str(), GENERIC_READ|GENERIC_WRITE, FILE_SHARE_WRITE|FILE_SHARE_READ, 0, OPEN_ALWAYS, FILE_ATTRIBUTE_NORMAL, 0); + hfile = CreateFileW(filename, GENERIC_READ|GENERIC_WRITE, FILE_SHARE_WRITE|FILE_SHARE_READ, 0, OPEN_ALWAYS, FILE_ATTRIBUTE_NORMAL, 0); if (hfile == INVALID_HANDLE_VALUE) { AT_ERROR("could not open file <", filename_, "> in read-write mode; error code: <", GetLastError(), ">"); } } else { - hfile = CreateFileA(filename_.c_str(), GENERIC_READ, FILE_SHARE_WRITE|FILE_SHARE_READ, 0, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, 0); + hfile = CreateFileW(filename, GENERIC_READ, FILE_SHARE_WRITE|FILE_SHARE_READ, 0, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, 0); if (hfile == INVALID_HANDLE_VALUE) { AT_ERROR("could not open file <", filename_, "> in read-only mode; error code: <", GetLastError(), ">"); } @@ -181,11 +190,11 @@ THMapAllocator::THMapAllocator(WithFd, const char *filename, int fd, int flags, /* get map handle */ if (flags_) { - if ( (hmfile = CreateFileMapping(hfile, NULL, PAGE_READWRITE, hfilesz.HighPart, hfilesz.LowPart, NULL)) == NULL ) { + if ( (hmfile = CreateFileMappingW(hfile, NULL, PAGE_READWRITE, hfilesz.HighPart, hfilesz.LowPart, NULL)) == NULL ) { AT_ERROR("could not create a map on file <", filename_, ">; error code: <", GetLastError(), ">"); } } else { - if ( (hmfile = CreateFileMapping(hfile, NULL, PAGE_WRITECOPY, hfilesz.HighPart, hfilesz.LowPart, NULL)) == NULL ) { + if ( (hmfile = CreateFileMappingW(hfile, NULL, PAGE_WRITECOPY, hfilesz.HighPart, hfilesz.LowPart, NULL)) == NULL ) { AT_ERROR("could not create a map on file <", filename_, ">; error code: <", GetLastError(), ">"); } } diff --git a/c10/util/Unicode.h b/c10/util/Unicode.h new file mode 100644 index 000000000000..9cce93cc9b83 --- /dev/null +++ b/c10/util/Unicode.h @@ -0,0 +1,29 @@ +#pragma once + +#if defined(_WIN32) +#include +#include +#include +#endif + +namespace c10 { +#if defined(_WIN32) +inline std::wstring u8u16(const std::string& str) { + if (str.empty()) { + return std::wstring(); + } + int size_needed = MultiByteToWideChar( + CP_UTF8, 0, str.c_str(), static_cast(str.size()), NULL, 0); + TORCH_CHECK(size_needed > 0, "Error converting the content to Unicode"); + std::wstring wstr(size_needed, 0); + MultiByteToWideChar( + CP_UTF8, + 0, + str.c_str(), + static_cast(str.size()), + &wstr[0], + size_needed); + return wstr; +} +#endif +} diff --git a/test/test_torch.py b/test/test_torch.py index a8f87c6f2036..7773d6ce7703 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -19,10 +19,10 @@ from torch import multiprocessing as mp from torch.testing._internal.common_utils import ( TestCase, TEST_WITH_ROCM, run_tests, - IS_WINDOWS, NO_MULTIPROCESSING_SPAWN, + IS_WINDOWS, IS_FILESYSTEM_UTF8_ENCODING, NO_MULTIPROCESSING_SPAWN, do_test_dtypes, IS_SANDCASTLE, IS_FBCODE, IS_REMOTE_GPU, load_tests, slowTest, skipCUDANonDefaultStreamIf, skipCUDAMemoryLeakCheckIf, BytesIOContext, - skipIfRocm, skipIfNoSciPy, + skipIfRocm, skipIfNoSciPy, TemporaryFileName, TemporaryDirectoryName, wrapDeterministicFlagAPITest, DeterministicGuard) from multiprocessing.reduction import ForkingPickler from torch.testing._internal.common_device_type import ( @@ -1852,15 +1852,14 @@ def test_storage_casts(self): self.assertEqual(complexdouble_storage.type(), 'torch.ComplexDoubleStorage') self.assertIs(complexdouble_storage.dtype, torch.complex128) - @unittest.skipIf(IS_WINDOWS, "TODO: need to fix this test case for Windows") def test_from_file(self): - size = 10000 - with 
tempfile.NamedTemporaryFile() as f: - s1 = torch.FloatStorage.from_file(f.name, True, size) + def assert_with_filename(filename): + size = 10000 + s1 = torch.FloatStorage.from_file(filename, True, size) t1 = torch.FloatTensor(s1).copy_(torch.randn(size)) # check mapping - s2 = torch.FloatStorage.from_file(f.name, True, size) + s2 = torch.FloatStorage.from_file(filename, True, size) t2 = torch.FloatTensor(s2) self.assertEqual(t1, t2, atol=0, rtol=0) @@ -1874,15 +1873,24 @@ def test_from_file(self): t2.fill_(rnum) self.assertEqual(t1, t2, atol=0, rtol=0) - @unittest.skipIf(IS_WINDOWS, "TODO: need to fix this test case for Windows") + # release the tensors + del s1, t1, s2, t2 + + with TemporaryFileName() as fname: + assert_with_filename(fname) + + if IS_FILESYSTEM_UTF8_ENCODING: + with TemporaryDirectoryName(suffix='中文') as dname, TemporaryFileName(dir=dname) as fname: + assert_with_filename(fname) + def test_torch_from_file(self): - size = 10000 - with tempfile.NamedTemporaryFile() as f: - s1 = torch.from_file(f.name, True, size, dtype=torch.float) + def assert_with_filename(filename): + size = 10000 + s1 = torch.from_file(filename, True, size, dtype=torch.float) t1 = torch.FloatTensor(s1).copy_(torch.randn(size)) # check mapping - s2 = torch.from_file(f.name, True, size, dtype=torch.float) + s2 = torch.from_file(filename, True, size, dtype=torch.float) t2 = torch.FloatTensor(s2) self.assertEqual(t1, t2, atol=0, rtol=0) @@ -1896,6 +1904,16 @@ def test_torch_from_file(self): t2.fill_(rnum) self.assertEqual(t1, t2, atol=0, rtol=0) + # release the tensors + del s1, t1, s2, t2 + + with TemporaryFileName() as fname: + assert_with_filename(fname) + + if IS_FILESYSTEM_UTF8_ENCODING: + with TemporaryDirectoryName(suffix='中文') as dname, TemporaryFileName(dir=dname) as fname: + assert_with_filename(fname) + def test_print(self): default_type = torch.Tensor().type() for t in torch._tensor_classes: diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py index f8280f9fb57d..6577b1c4559f 100644 --- a/torch/testing/_internal/common_utils.py +++ b/torch/testing/_internal/common_utils.py @@ -21,6 +21,7 @@ import warnings import random import contextlib +import shutil import socket import subprocess import time @@ -300,11 +301,11 @@ def run_tests(argv=UNITTEST_ARGS): if IS_WINDOWS: @contextmanager - def TemporaryFileName(): + def TemporaryFileName(dir=None): # Ideally we would like to not have to manually delete the file, but NamedTemporaryFile # opens the file, and it cannot be opened multiple times in Windows. 
To support Windows, # close the file after creation and try to remove it manually - f = tempfile.NamedTemporaryFile(delete=False) + f = tempfile.NamedTemporaryFile(delete=False, dir=dir) try: f.close() yield f.name @@ -312,10 +313,27 @@ def TemporaryFileName(): os.unlink(f.name) else: @contextmanager # noqa: T484 - def TemporaryFileName(): - with tempfile.NamedTemporaryFile() as f: + def TemporaryFileName(dir=None): + with tempfile.NamedTemporaryFile(dir=dir) as f: yield f.name +if IS_WINDOWS: + @contextmanager + def TemporaryDirectoryName(suffix=None): + # On Windows the directory created by TemporaryDirectory is likely to be removed prematurely, + # so we first create the directory using mkdtemp and then remove it manually + try: + dir_name = tempfile.mkdtemp(suffix=suffix) + yield dir_name + finally: + shutil.rmtree(dir_name) +else: + @contextmanager # noqa: T484 + def TemporaryDirectoryName(suffix=None): + with tempfile.TemporaryDirectory(suffix=suffix) as d: + yield d + +IS_FILESYSTEM_UTF8_ENCODING = sys.getfilesystemencoding() == 'utf-8' def _check_module_exists(name): r"""Returns if a top-level module with :attr:`name` exists *without** From 220b91660f5b1a098b7238d8f6f5ae95048685cd Mon Sep 17 00:00:00 2001 From: Joel Schlosser Date: Mon, 14 Dec 2020 14:51:11 -0800 Subject: [PATCH 237/250] [pytorch] Expand PixelShuffle to support any number of batch dims (#49187) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49187 Expands the implementation of PixelShuffle to support any number of batch dimensions Test Plan: `buck test caffe2/test:nn -- test_pixel_shuffle` Reviewed By: mruberry Differential Revision: D25399058 fbshipit-source-id: ab0a7f593b276cafc9ebb46a177e2c1dce56d0de --- aten/src/ATen/native/PixelShuffle.cpp | 48 +++++++++----- test/test_nn.py | 90 ++++++++++++++++++++------- torch/nn/modules/pixelshuffle.py | 7 ++- 3 files changed, 107 insertions(+), 38 deletions(-) diff --git a/aten/src/ATen/native/PixelShuffle.cpp b/aten/src/ATen/native/PixelShuffle.cpp index 14c126f77bdf..e6301e682d77 100644 --- a/aten/src/ATen/native/PixelShuffle.cpp +++ b/aten/src/ATen/native/PixelShuffle.cpp @@ -4,31 +4,51 @@ #include #include +#include #include namespace at { namespace native { Tensor pixel_shuffle(const Tensor& self, int64_t upscale_factor) { - AT_ASSERTM(self.dim() == 4, - "pixel_shuffle expects 4D input, but got input with sizes ",self.sizes()); - int64_t b = self.size(0); - int64_t c = self.size(1); - int64_t h = self.size(2); - int64_t w = self.size(3); + TORCH_CHECK(self.dim() >= 3, + "pixel_shuffle expects input to have at least 3 dimensions, but got input with ", + self.dim(), " dimension(s)"); + // Format: (B1, ..., Bn), C, H, W + int64_t c = self.size(-3); + int64_t h = self.size(-2); + int64_t w = self.size(-1); + const auto NUM_NON_BATCH_DIMS = 3; + const auto last_batch_dim = self.sizes().end() - NUM_NON_BATCH_DIMS; + int64_t upscale_factor_squared = upscale_factor * upscale_factor; - AT_ASSERTM(c % upscale_factor_squared == 0, - "pixel_shuffle expects input channel to be divisible by square of " - "upscale_factor, but got input with sizes ", self.sizes(), - ", upscale_factor=", upscale_factor, - ", and self.size(1)=", c, " is not divisible by ", upscale_factor_squared); + TORCH_CHECK(c % upscale_factor_squared == 0, + "pixel_shuffle expects its input's 'channel' dimension to be divisible by the square of " + "upscale_factor, but input.size(-3)=", c, " is not divisible by ", upscale_factor_squared); int64_t oc = c / upscale_factor_squared; int64_t oh 
= h * upscale_factor; int64_t ow = w * upscale_factor; - auto input_reshaped = self.reshape({b, oc, upscale_factor, upscale_factor, h, w}); - return input_reshaped.permute({0 /* b */, 1 /* oc */, 4 /* h */, 2 /* 1st upscale_factor */, 5 /* w */, 3 /* 2nd upscale_factor */}) - .reshape({b, oc, oh, ow}); + // First, reshape to expand the channels dim from c into 3 separate dims: (oc, upscale_factor, upscale_factor). + // This allows shuffling to be done next by permuting dims. + std::vector expanded_shape(self.sizes().begin(), last_batch_dim); + expanded_shape.insert(expanded_shape.end(), {oc, upscale_factor, upscale_factor, h, w}); + const auto input_expanded = self.reshape(expanded_shape); + + // Next, shuffle by permuting the new upscale_factor dims alongside the height and width dims. + std::vector permutation(self.sizes().begin(), last_batch_dim); + // std::iota is used to maintain the batch dims within the permutation. + // Since expansion added 2 dims, the correct batch dim offsets are now: -expanded_shape.size(), ..., -7, -6. + std::iota(permutation.begin(), permutation.end(), -expanded_shape.size()); + permutation.insert(permutation.end(), {-5 /* oc */, -2 /* h */, -4 /* 1st upscale_factor */, -1 /* w */, + -3 /* 2nd upscale_factor */}); + const auto input_permuted = input_expanded.permute(permutation); + + // Finally, upscale by collapsing (h, upscale_factor) -> a single dim (oh) + // and (w, upscale_factor) -> a single dim (ow). + std::vector final_shape(self.sizes().begin(), last_batch_dim); + final_shape.insert(final_shape.end(), {oc, oh, ow}); + return input_permuted.reshape(final_shape); } }} // namespace at::native diff --git a/test/test_nn.py b/test/test_nn.py index 2291acbc957a..652b4d85cbed 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -6487,16 +6487,6 @@ def test_RNN_change_dropout(self): self.assertNotEqual(output2.data, prev_output) prev_output = output1.data - def _verify_pixel_shuffle(self, input, output, upscale_factor): - for c in range(output.size(1)): - for h in range(output.size(2)): - for w in range(output.size(3)): - height_idx = h // upscale_factor - weight_idx = w // upscale_factor - channel_idx = (upscale_factor * (h % upscale_factor)) + (w % upscale_factor) + \ - (c * upscale_factor ** 2) - self.assertEqual(output[:, c, h, w], input[:, channel_idx, height_idx, weight_idx]) - def test_inplace_thnn(self): modules = [nn.ReLU, nn.ELU, nn.SELU, nn.CELU, nn.RReLU] for mod in modules: @@ -6527,18 +6517,74 @@ def test_noncontig_conv_grad_cuda(self, dtype=torch.float): self.assertEqual(result, input.grad.data, atol=dtype2prec_DONTUSE[dtype], rtol=0) def test_pixel_shuffle(self): - batch_size = random.randint(1, 3) - upscale_factor = random.randint(2, 5) - channels = random.randint(1, 4) * upscale_factor ** 2 - height = random.randint(5, 10) - width = random.randint(5, 10) - - input = torch.rand(batch_size, channels, height, width, requires_grad=True) - ps = nn.PixelShuffle(upscale_factor) - output = ps(input) - self._verify_pixel_shuffle(input.data, output.data, upscale_factor) - output.backward(output.data) - self.assertEqual(input.data, input.grad.data) + def _test_pixel_shuffle_helper(num_input_dims, valid_channels_dim=True): + # Function to imperatively ensure pixels are shuffled to the correct locations. + # Used to validate the batch operations in pixel_shuffle. 
+ def _verify_pixel_shuffle(input, output, upscale_factor): + for c in range(output.size(-3)): + for h in range(output.size(-2)): + for w in range(output.size(-1)): + height_idx = h // upscale_factor + weight_idx = w // upscale_factor + channel_idx = (upscale_factor * (h % upscale_factor)) + (w % upscale_factor) + \ + (c * upscale_factor ** 2) + self.assertEqual(output[..., c, h, w], input[..., channel_idx, height_idx, weight_idx]) + + upscale_factor = random.randint(2, 5) + # If valid_channels_dim=False, add 1 to make channels dim indivisible by upscale_factor ** 2. + channels = random.randint(1, 4) * upscale_factor ** 2 + (0 if valid_channels_dim else 1) + height = random.randint(5, 10) + width = random.randint(5, 10) + + if num_input_dims == 1: + input = torch.rand(channels, requires_grad=True) + elif num_input_dims == 2: + input = torch.rand(height, width, requires_grad=True) + else: + batch_sizes = [random.randint(1, 3) for _ in range(num_input_dims - 3)] + input = torch.rand(*batch_sizes, channels, height, width, requires_grad=True) + ps = nn.PixelShuffle(upscale_factor) + + if num_input_dims >= 3 and valid_channels_dim: + output = ps(input) + _verify_pixel_shuffle(input, output, upscale_factor) + output.backward(output.data) + self.assertEqual(input.data, input.grad.data) + else: + self.assertRaises(RuntimeError, lambda: ps(input)) + + def test_pixel_shuffle_1D(): + _test_pixel_shuffle_helper(num_input_dims=1) + + def test_pixel_shuffle_2D(): + _test_pixel_shuffle_helper(num_input_dims=2) + + def test_pixel_shuffle_3D_with_valid_channels_dim(): + _test_pixel_shuffle_helper(num_input_dims=3) + + def test_pixel_shuffle_4D_with_valid_channels_dim(): + _test_pixel_shuffle_helper(num_input_dims=4) + + def test_pixel_shuffle_5D_with_valid_channels_dim(): + _test_pixel_shuffle_helper(num_input_dims=5) + + def test_pixel_shuffle_3D_with_invalid_channels_dim(): + _test_pixel_shuffle_helper(num_input_dims=3, valid_channels_dim=False) + + def test_pixel_shuffle_4D_with_invalid_channels_dim(): + _test_pixel_shuffle_helper(num_input_dims=4, valid_channels_dim=False) + + def test_pixel_shuffle_5D_with_invalid_channels_dim(): + _test_pixel_shuffle_helper(num_input_dims=5, valid_channels_dim=False) + + test_pixel_shuffle_1D() + test_pixel_shuffle_2D() + test_pixel_shuffle_3D_with_valid_channels_dim() + test_pixel_shuffle_4D_with_valid_channels_dim() + test_pixel_shuffle_5D_with_valid_channels_dim() + test_pixel_shuffle_3D_with_invalid_channels_dim() + test_pixel_shuffle_4D_with_invalid_channels_dim() + test_pixel_shuffle_5D_with_invalid_channels_dim() def test_elu_inplace_view(self): v = torch.tensor([1.0, -1.0, 1.0, -1.0], requires_grad=True) diff --git a/torch/nn/modules/pixelshuffle.py b/torch/nn/modules/pixelshuffle.py index 3c8c626047dc..8256b111b988 100644 --- a/torch/nn/modules/pixelshuffle.py +++ b/torch/nn/modules/pixelshuffle.py @@ -15,12 +15,15 @@ class PixelShuffle(Module): `Real-Time Single Image and Video Super-Resolution Using an Efficient Sub-Pixel Convolutional Neural Network`_ by Shi et. al (2016) for more details. + Note that this function can take inputs with any number of batch dimensions: + :math:`(L, H_{in}, W_{in})`, :math:`(N, L, H_{in}, W_{in})`, :math:`(N_1, N_2, L, H_{in}, W_{in})`, etc. 
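For illustration, the same relaxed shape rule through the C++ frontend; this is a sketch that assumes `torch::pixel_shuffle` is exposed there, mirroring `at::pixel_shuffle`, and the shapes are arbitrary:

```
#include <torch/torch.h>
#include <iostream>

int main() {
  const int64_t r = 3;                            // upscale factor
  auto x = torch::rand({5, 2, 4 * r * r, 6, 7});  // two leading batch dims
  auto y = torch::pixel_shuffle(x, r);
  // channels shrink by r^2, height/width grow by r:
  // expected sizes: [5, 2, 4, 18, 21]
  std::cout << y.sizes() << std::endl;
  return 0;
}
```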
+ Args: upscale_factor (int): factor to increase spatial resolution by Shape: - - Input: :math:`(N, L, H_{in}, W_{in})` where :math:`L=C \times \text{upscale\_factor}^2` - - Output: :math:`(N, C, H_{out}, W_{out})` where + - Input: :math:`(*, L, H_{in}, W_{in})` where :math:`L=C \times \text{upscale\_factor}^2` + - Output: :math:`(*, C, H_{out}, W_{out})` where :math:`H_{out} = H_{in} \times \text{upscale\_factor}` and :math:`W_{out} = W_{in} \times \text{upscale\_factor}` From cb3169d7a8b257676abafb1bdd71f8fcfef8d49e Mon Sep 17 00:00:00 2001 From: Ansha Yu Date: Mon, 14 Dec 2020 15:35:43 -0800 Subject: [PATCH 238/250] [aten] index_select dim 1 (#47077) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/47077 Add benchmarks for pt index_select, batch_index_select, and c2's BatchGather Add batch_index_select implementation based on the C2 BatchGather implementation This currently falls back to index_select for backwards and cuda implementations. Alternatively, we can look into the specifics of why index_select is slower and replace the original implementation instead. Test Plan: ./buck-out/opt/gen/caffe2/benchmarks/operator_benchmark/c2/batch_gather_test.par ./buck-out/opt/gen/caffe2/benchmarks/operator_benchmark/pt/index_select_test.par PT results comparing without fix, block_size 1 only, and all dim=1 ``` # no optimization # ---------------------------------------- # PyTorch/Caffe2 Operator Micro-benchmarks # ---------------------------------------- # Tag : short # Benchmarking PyTorch: index_select # Mode: Eager # Name: index_select_M256_N512_K1_dim1_cpu # Input: M: 256, N: 512, K: 1, dim: 1, device: cpu Forward Execution Time (us) : 353.450 # Benchmarking PyTorch: index_select # Mode: Eager # Name: index_select_M512_N512_K1_dim1_cpu # Input: M: 512, N: 512, K: 1, dim: 1, device: cpu Forward Execution Time (us) : 862.492 # Benchmarking PyTorch: index_select # Mode: Eager # Name: index_select_M256_N512_K2_dim1_cpu # Input: M: 256, N: 512, K: 2, dim: 1, device: cpu Forward Execution Time (us) : 4555.344 # Benchmarking PyTorch: index_select # Mode: Eager # Name: index_select_M512_N512_K2_dim1_cpu # Input: M: 512, N: 512, K: 2, dim: 1, device: cpu Forward Execution Time (us) : 11003.279 ``` ``` # block size 1 only # ---------------------------------------- # PyTorch/Caffe2 Operator Micro-benchmarks # ---------------------------------------- # Tag : short # Benchmarking PyTorch: index_select # Mode: Eager # Name: index_select_M256_N512_K1_dim1_cpu # Input: M: 256, N: 512, K: 1, dim: 1, device: cpu Forward Execution Time (us) : 129.240 # Benchmarking PyTorch: index_select # Mode: Eager # Name: index_select_M512_N512_K1_dim1_cpu # Input: M: 512, N: 512, K: 1, dim: 1, device: cpu Forward Execution Time (us) : 266.776 # Benchmarking PyTorch: index_select # Mode: Eager # Name: index_select_M256_N512_K2_dim1_cpu # Input: M: 256, N: 512, K: 2, dim: 1, device: cpu Forward Execution Time (us) : 4508.593 # Benchmarking PyTorch: index_select # Mode: Eager # Name: index_select_M512_N512_K2_dim1_cpu # Input: M: 512, N: 512, K: 2, dim: 1, device: cpu Forward Execution Time (us) : 10391.655 ``` ``` # dim 1 # PyTorch/Caffe2 Operator Micro-benchmarks # ---------------------------------------- # Tag : short # Benchmarking PyTorch: index_select # Mode: Eager # Name: index_select_M8_N8_K1_dim1_cpu # Input: M: 8, N: 8, K: 1, dim: 1, device: cpu Forward Execution Time (us) : 3.736 # Benchmarking PyTorch: index_select # Mode: Eager # Name: index_select_M256_N512_K1_dim1_cpu # Input: M: 256, N: 
512, K: 1, dim: 1, device: cpu Forward Execution Time (us) : 130.460 # Benchmarking PyTorch: index_select # Mode: Eager # Name: index_select_M512_N512_K1_dim1_cpu # Input: M: 512, N: 512, K: 1, dim: 1, device: cpu Forward Execution Time (us) : 267.706 # Benchmarking PyTorch: index_select # Mode: Eager # Name: index_select_M8_N8_K2_dim1_cpu # Input: M: 8, N: 8, K: 2, dim: 1, device: cpu Forward Execution Time (us) : 4.187 # Benchmarking PyTorch: index_select # Mode: Eager # Name: index_select_M256_N512_K2_dim1_cpu # Input: M: 256, N: 512, K: 2, dim: 1, device: cpu Forward Execution Time (us) : 1739.550 # Benchmarking PyTorch: index_select # Mode: Eager # Name: index_select_M512_N512_K2_dim1_cpu # Input: M: 512, N: 512, K: 2, dim: 1, device: cpu Forward Execution Time (us) : 3468.332 ``` C2 results: ```# Benchmarking Caffe2: batch_gather WARNING: Logging before InitGoogleLogging() is written to STDERR W1203 13:19:35.310904 782584 init.h:137] Caffe2 GlobalInit should be run before any other API calls. # Name: batch_gather_M8_N8_K1_devicecpu # Input: M: 8, N: 8, K: 1, device: cpu Forward Execution Time (us) : 0.308 # Benchmarking Caffe2: batch_gather # Name: batch_gather_M256_N512_K1_devicecpu # Input: M: 256, N: 512, K: 1, device: cpu Forward Execution Time (us) : 90.517 # Benchmarking Caffe2: batch_gather # Name: batch_gather_M512_N512_K1_devicecpu # Input: M: 512, N: 512, K: 1, device: cpu Forward Execution Time (us) : 200.009 # Benchmarking Caffe2: batch_gather # Name: batch_gather_M8_N8_K2_devicecpu # Input: M: 8, N: 8, K: 2, device: cpu Forward Execution Time (us) : 0.539 # Benchmarking Caffe2: batch_gather # Name: batch_gather_M256_N512_K2_devicecpu # Input: M: 256, N: 512, K: 2, device: cpu Forward Execution Time (us) : 1001.540 # Benchmarking Caffe2: batch_gather # Name: batch_gather_M512_N512_K2_devicecpu # Input: M: 512, N: 512, K: 2, device: cpu Forward Execution Time (us) : 2005.870 ``` buck test dper3/dper3/modules/low_level_modules/tests:single_operators_test -- test_batch_gather Reviewed By: hlu1 Differential Revision: D24630227 fbshipit-source-id: cd205a30d96a33d239f3266820ada9a90093cf91 --- .../ATen/native/TensorAdvancedIndexing.cpp | 86 +++++++++++++++++++ .../operator_benchmark/benchmark_caffe2.py | 7 +- .../c2/batch_gather_test.py | 56 ++++++++++++ .../pt/index_select_test.py | 57 ++++++++++++ 4 files changed, 205 insertions(+), 1 deletion(-) create mode 100644 benchmarks/operator_benchmark/c2/batch_gather_test.py create mode 100644 benchmarks/operator_benchmark/pt/index_select_test.py diff --git a/aten/src/ATen/native/TensorAdvancedIndexing.cpp b/aten/src/ATen/native/TensorAdvancedIndexing.cpp index 8b5fdd44d789..f3147bdf78aa 100644 --- a/aten/src/ATen/native/TensorAdvancedIndexing.cpp +++ b/aten/src/ATen/native/TensorAdvancedIndexing.cpp @@ -471,6 +471,87 @@ Tensor index_add(const Tensor & self, int64_t dim, const Tensor & index, const T return self.clone(at::MemoryFormat::Preserve).index_add_(dim, index, source); } +// Check that indices fall within dimension array size +// Avoid redispatch call to min/max +template +static void check_indexarray_range( + const IndexType* indices, + int64_t n, + IndexType indexing_axis_dim) { + for (auto i = 0; i < n; ++i) { + auto idx = indices[i]; + TORCH_CHECK( + 0 <= idx && idx < indexing_axis_dim, + "INDICES element is out of DATA bounds, id=", + idx, + " axis_dim=", + indexing_axis_dim); + } +} + +Tensor & index_select_out_cpu_dim1_( + Tensor & result_contig, const Tensor & self, const Tensor & index_contig) { + + auto self_contig 
= self.contiguous(); + const caffe2::TypeMeta dataType = self_contig.dtype(); + size_t item_bytesize = dataType.itemsize(); + + auto out = static_cast(result_contig.data_ptr()); + + auto src_base = static_cast(self_contig.data_ptr()); + + auto self_sizes = self_contig.sizes(); + auto outer_dims_product = c10::size_to_dim_(1, self_sizes); + auto block_size = c10::size_from_dim_(2, self_sizes); + auto block_bytesize = block_size * item_bytesize; + + auto src_indexing_axis_dim = self_sizes[1]; + auto src_batch_bytesize = self_sizes[1] * block_bytesize; + auto N = index_contig.numel(); + + auto gathered_batch_bytesize = N * block_bytesize; + + AT_DISPATCH_INDEX_TYPES( + index_contig.scalar_type(), "batch_index_select_compute", [&]() { + + const auto* idxs = index_contig.data_ptr(); + check_indexarray_range(idxs, N, src_indexing_axis_dim); + + // Special-case single-float copy for efficiency + if (self.scalar_type() == ScalarType::Float && block_size == 1) { + for (auto batch = 0; batch < outer_dims_product; ++batch) { + const float* src_floats = + (const float*)(src_base + batch * src_batch_bytesize); + float* dst_floats = (float*)(out + batch * gathered_batch_bytesize); + + for (auto i = 0; i < N; ++i) { + auto idx = idxs[i]; + if (idx < 0) { + idx = idx + src_indexing_axis_dim; + } + dst_floats[i] = src_floats[idx]; + } + } + } else { + // outer_dims_product specifies how many times we repeat inner dimensions, + // so we just iterate over it to cover all outer dimensions. + for (auto batch = 0; batch < outer_dims_product; ++batch) { + for (auto i = 0; i < N; ++i) { + auto idx = idxs[i]; + if (idx < 0) { + idx = idx + src_indexing_axis_dim; + } + + auto src = src_base + batch * src_batch_bytesize + idx * block_bytesize; + auto dst = out + batch * gathered_batch_bytesize + i * block_bytesize; + memcpy(dst, src, block_bytesize); + } + } + } + }); + return result_contig; +} + Tensor & index_select_out_cpu_(Tensor & result, const Tensor & self, int64_t dim, const Tensor & index) { dim = maybe_wrap_dim(dim, self.dim()); @@ -498,6 +579,11 @@ Tensor & index_select_out_cpu_(Tensor & result, const Tensor & self, int64_t dim return result; } + if (dim == 1 && result.is_contiguous()) { + // fast pass + return index_select_out_cpu_dim1_(result, self, index_contig); + } + auto selfSlice = self.select(dim, 0); auto resultSlice = result.select(dim, 0); auto selfSlice_data = selfSlice.data_ptr(); diff --git a/benchmarks/operator_benchmark/benchmark_caffe2.py b/benchmarks/operator_benchmark/benchmark_caffe2.py index 4fb7fffb5a5d..b0534bd9722d 100644 --- a/benchmarks/operator_benchmark/benchmark_caffe2.py +++ b/benchmarks/operator_benchmark/benchmark_caffe2.py @@ -50,10 +50,15 @@ def tensor(self, shapes, dtype='float32', device='cpu'): Return: C2 tensor of dtype """ + return self.feed_tensor(benchmark_utils.numpy_random(dtype, *shapes), device) + + def feed_tensor(self, tensor, device='cpu'): + """ Similar to tensor, but can supply any data compatible with FeedBlob + """ blob_name = 'blob_' + str(Caffe2BenchmarkBase.tensor_index) dev = self._device_option(device) with core.DeviceScope(dev): - workspace.FeedBlob(blob_name, benchmark_utils.numpy_random(dtype, *shapes)) + workspace.FeedBlob(blob_name, tensor) Caffe2BenchmarkBase.tensor_index += 1 return blob_name diff --git a/benchmarks/operator_benchmark/c2/batch_gather_test.py b/benchmarks/operator_benchmark/c2/batch_gather_test.py new file mode 100644 index 000000000000..ff3d84b99b2b --- /dev/null +++ b/benchmarks/operator_benchmark/c2/batch_gather_test.py @@ 
-0,0 +1,56 @@ +import benchmark_caffe2 as op_bench_c2 +import operator_benchmark as op_bench +from benchmark_caffe2 import Caffe2BenchmarkBase # noqa +from caffe2.python import core +import numpy + + +"""Microbenchmarks for element-wise BatchGather operator.""" + +# Configs for C2 BatherGather operator +batch_gather_configs_short = op_bench.config_list( + attr_names=["M", "N", "K"], + attrs=[ + [8, 8, 1], + [256, 512, 1], + [512, 512, 1], + [8, 8, 2], + [256, 512, 2], + [512, 512, 2], + ], + cross_product_configs={ + 'device': ['cpu', 'cuda'], + }, + tags=["short"] +) + +batch_gather_configs_long = op_bench.cross_product_configs( + M=[128, 1024], + N=[128, 1024], + K=[1, 2], + device=['cpu', 'cuda'], + tags=["long"] +) + +class BatchGatherBenchmark(op_bench_c2.Caffe2BenchmarkBase): + def init(self, M, N, K, device): + self.input_one = self.tensor([M, N, K], device=device) + max_val = N + numpy.random.seed((1 << 32) - 1) + index_dim = numpy.random.randint(0, N) + self.index = self.feed_tensor(numpy.random.randint(0, max_val, index_dim), device=device) + self.output = self.tensor([M, index_dim, K], device=device) + self.set_module_name("batch_gather") + + def forward(self): + op = core.CreateOperator("BatchGather", [self.input_one, self.index], self.output) + return op + + +op_bench_c2.generate_c2_test( + batch_gather_configs_long + batch_gather_configs_short, BatchGatherBenchmark +) + + +if __name__ == "__main__": + op_bench.benchmark_runner.main() diff --git a/benchmarks/operator_benchmark/pt/index_select_test.py b/benchmarks/operator_benchmark/pt/index_select_test.py new file mode 100644 index 000000000000..8418edb2840b --- /dev/null +++ b/benchmarks/operator_benchmark/pt/index_select_test.py @@ -0,0 +1,57 @@ +import operator_benchmark as op_bench +import torch +import numpy + + +"""Microbenchmarks for index_select operator.""" + +# An example input from this configuration is M=4, N=4, dim=0. 
+index_select_configs_short = op_bench.config_list( + attr_names=["M", "N", "K", "dim"], + attrs=[ + [8, 8, 1, 1], + [256, 512, 1, 1], + [512, 512, 1, 1], + [8, 8, 2, 1], + [256, 512, 2, 1], + [512, 512, 2, 1], + ], + cross_product_configs={ + 'device': ['cpu', 'cuda'], + }, + tags=["short"] +) + + +index_select_configs_long = op_bench.cross_product_configs( + M=[128, 1024], + N=[128, 1024], + K=[1, 2], + dim=[1], + device=['cpu', 'cuda'], + tags=["long"] +) + + +class IndexSelectBenchmark(op_bench.TorchBenchmarkBase): + def init(self, M, N, K, dim, device): + max_val = N + numpy.random.seed((1 << 32) - 1) + index_dim = numpy.random.randint(0, N) + self.inputs = { + "input_one": torch.rand(M, N, K, device=device), + "dim" : dim, + "index" : torch.tensor(numpy.random.randint(0, max_val, index_dim), device=device), + } + self.set_module_name("index_select") + + def forward(self, input_one, dim, index): + return torch.index_select(input_one, dim, index) + + +op_bench.generate_pt_test(index_select_configs_short + index_select_configs_long, + IndexSelectBenchmark) + + +if __name__ == "__main__": + op_bench.benchmark_runner.main() From e2510a0b60232aba5160ceb18b6ece8c59a9b79d Mon Sep 17 00:00:00 2001 From: Amogh Akshintala Date: Mon, 14 Dec 2020 16:20:13 -0800 Subject: [PATCH 239/250] Add Kernel Launch Checks to files under caffe2/aten/THC (#49358) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49358 Added the header file (`c10/cuda/CUDAException.h`) where the `C10_CUDA_KERNEL_LAUNCH_CHECK` is defined as needed to files under `caffe2/aten/THC`, and then added `C10_CUDA_KERNEL_LAUNCH_CHECK()` calls after each kernel launch. In some cases, removed some extraneous ErrorChecks Test Plan: Checked that the code still builds with ``` buck build //caffe2/aten:ATen-cu ``` Also ran basic aten tests ``` buck test //caffe2/aten:atest ``` Reviewed By: r-barnes Differential Revision: D25541113 fbshipit-source-id: df1a50e14d291a86b24ca1746ac27fa586f9757c --- aten/src/THC/THCApply.cuh | 38 ++++++++++--------- aten/src/THC/THCReduceAll.cuh | 4 ++ aten/src/THC/THCTensorSort.cu | 8 ++-- aten/src/THC/generic/THCTensorIndex.cu | 27 +++++++------ aten/src/THC/generic/THCTensorMathMagma.cu | 4 ++ aten/src/THC/generic/THCTensorMathReduce.cu | 6 ++- aten/src/THC/generic/THCTensorMode.cu | 17 +++++---- aten/src/THC/generic/THCTensorRandom.cu | 8 ++++ .../src/THC/generic/THCTensorScatterGather.cu | 14 +++---- aten/src/THC/generic/THCTensorSort.cu | 18 +++++---- aten/src/THC/generic/THCTensorTopK.cu | 16 ++++---- 11 files changed, 96 insertions(+), 64 deletions(-) diff --git a/aten/src/THC/THCApply.cuh b/aten/src/THC/THCApply.cuh index 368f1566e84c..7e52e1a1130c 100644 --- a/aten/src/THC/THCApply.cuh +++ b/aten/src/THC/THCApply.cuh @@ -6,6 +6,7 @@ #include #include #include +#include // // This file contains pointwise operation functions and kernels that @@ -242,14 +243,11 @@ bool THC_pointwiseApply1(THCState* state, // (or vice versa), the contiguous tensor can be collapsed to one // dimension, and the loop to translate the linear index to the array // index can be similarly collapsed. That is what this unrolling is for. 
-#define HANDLE_CASE(TYPE, A) \ - kernelPointwiseApply1 \ - <<>>( \ - OffsetInfo \ - (aInfo), \ - (TYPE) totalElements, op); +#define HANDLE_CASE(TYPE, A) \ + kernelPointwiseApply1 \ + <<>>( \ + OffsetInfo(aInfo), (TYPE) totalElements, op); \ + C10_CUDA_KERNEL_LAUNCH_CHECK(); #define HANDLE_A_CASE(TYPE, A) { \ switch (A) { \ @@ -298,6 +296,7 @@ bool THC_pointwiseApply1(THCState* state, uint64_t, 1> <<>>( aOffset, (uint64_t) totalElements, op); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { #if CUDA_VERSION < 9000 @@ -310,6 +309,7 @@ bool THC_pointwiseApply1(THCState* state, uint64_t, -1> <<>>( aOffset, (uint64_t) totalElements, op); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } } #undef HANDLE_CASE @@ -392,16 +392,13 @@ bool THC_pointwiseApply2(THCState* state, // dimension, and the loop to translate the linear index to the array // index can be similarly collapsed. That is what this unrolling is for. #define HANDLE_CASE(TYPE, A, B) \ - kernelPointwiseApply2 \ + kernelPointwiseApply2 \ <<>>( \ - OffsetInfo \ - (aInfo), \ - OffsetInfo \ - (bInfo), \ - (TYPE) totalElements, op); + OffsetInfo(aInfo), \ + OffsetInfo(bInfo), \ + (TYPE) totalElements, op); \ + C10_CUDA_KERNEL_LAUNCH_CHECK(); + #define HANDLE_B_CASE(TYPE, A, B) { \ switch (B) { \ @@ -474,6 +471,7 @@ bool THC_pointwiseApply2(THCState* state, uint64_t, 1, 1> <<>>( aOffset, bOffset, (uint64_t) totalElements, op); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { #if CUDA_VERSION < 9000 grid.x = min(at::cuda::getCurrentDeviceProperties()->multiProcessorCount * THC_APPLY_BLOCKS_PER_SM , grid.x); @@ -488,6 +486,7 @@ bool THC_pointwiseApply2(THCState* state, uint64_t, -1, -1> <<>>( aOffset, bOffset, (uint64_t) totalElements, op); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } } #undef HANDLE_CASE @@ -598,7 +597,8 @@ bool THC_pointwiseApply3(THCState* state, (bInfo), \ OffsetInfo \ (cInfo), \ - (TYPE) totalElements, op); + (TYPE) totalElements, op); \ + C10_CUDA_KERNEL_LAUNCH_CHECK(); #define HANDLE_C_CASE(TYPE, A, B, C) { \ switch (C) { \ @@ -697,6 +697,7 @@ bool THC_pointwiseApply3(THCState* state, uint64_t, 1, 1, 1> <<>>( aOffset, bOffset, cOffset, (uint64_t) totalElements, op); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { #if CUDA_VERSION < 9000 grid.x = min(at::cuda::getCurrentDeviceProperties()->multiProcessorCount * THC_APPLY_BLOCKS_PER_SM , grid.x); @@ -715,6 +716,7 @@ bool THC_pointwiseApply3(THCState* state, uint64_t, -1, -1, -1> <<>>( aOffset, bOffset, cOffset, (uint64_t) totalElements, op); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } } #undef HANDLE_CASE diff --git a/aten/src/THC/THCReduceAll.cuh b/aten/src/THC/THCReduceAll.cuh index 9546f85f61c9..af2e264e6528 100644 --- a/aten/src/THC/THCReduceAll.cuh +++ b/aten/src/THC/THCReduceAll.cuh @@ -10,6 +10,7 @@ // #include +#include #include #ifdef __HIP_PLATFORM_HCC__ @@ -209,6 +210,7 @@ void callReduceAll(THCState* state, <<>>( in, (IndexType) totalElements, init, modifyOp, reduceOp, (AccT*) scratchSpace); + C10_CUDA_KERNEL_LAUNCH_CHECK(); int numPass1Blocks = grid.x; getPass2ReduceBlockGrid(state, totalElements, grid, block); @@ -218,6 +220,7 @@ void callReduceAll(THCState* state, <<>>( numPass1Blocks, init, reduceOp, (AccT*) scratchSpace, devOut); + C10_CUDA_KERNEL_LAUNCH_CHECK(); THCudaFree(state, scratchSpace); } else { @@ -227,6 +230,7 @@ void callReduceAll(THCState* state, kernelReduceAll <<>>( in, (IndexType) totalElements, init, modifyOp, reduceOp, devOut); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } } diff --git a/aten/src/THC/THCTensorSort.cu b/aten/src/THC/THCTensorSort.cu index 8969209a1bdc..189e73b909fb 100644 --- 
a/aten/src/THC/THCTensorSort.cu +++ b/aten/src/THC/THCTensorSort.cu @@ -1,5 +1,6 @@ #include #include +#include void THCudaLongTensor_fillSliceWithIndex(THCState* state, THCudaLongTensor* t, @@ -28,8 +29,10 @@ void THCudaLongTensor_fillSliceWithIndex(THCState* state, #define FILL_INDEX(T, DIM) \ fillSliceWithIndex \ - <<>>( \ - info, numSlices, sliceSize, info.strides[collapseDim]) + <<>>( \ + info, numSlices, sliceSize, info.strides[collapseDim]); \ + C10_CUDA_KERNEL_LAUNCH_CHECK() + if (THCTensor_canUse32BitIndexMath(state, t)) { TensorInfo info = @@ -59,6 +62,5 @@ void THCudaLongTensor_fillSliceWithIndex(THCState* state, } #undef FILL_INDEX - THCudaCheck(cudaGetLastError()); } } diff --git a/aten/src/THC/generic/THCTensorIndex.cu b/aten/src/THC/generic/THCTensorIndex.cu index 66ad275787f5..3f506d345714 100644 --- a/aten/src/THC/generic/THCTensorIndex.cu +++ b/aten/src/THC/generic/THCTensorIndex.cu @@ -4,6 +4,7 @@ #include #include +#include // Check tensor dimensions for index operations, and return the slice size. // src can be nullptr in case of indexFill: in that case it is ignored. @@ -127,11 +128,12 @@ void THCTensor_(indexCopy)(THCState *state, THCTensor *dst, int dim, THCudaLongT int mpc = at::cuda::getCurrentDeviceProperties()->multiProcessorCount; -#define SMALL_INDEX(TENSOR_TYPE, TYPE, DST_DIM, SRC_DIM, IDX_DIM) \ - indexCopySmallIndex \ - <<>>( \ - dstInfo, srcInfo, indicesInfo, \ - dstCopyDim, srcCopyDim, sliceSize, dstCopyDimSize); +#define SMALL_INDEX(TENSOR_TYPE, TYPE, DST_DIM, SRC_DIM, IDX_DIM) \ + indexCopySmallIndex \ + <<>>( \ + dstInfo, srcInfo, indicesInfo, \ + dstCopyDim, srcCopyDim, sliceSize, dstCopyDimSize); \ + C10_CUDA_KERNEL_LAUNCH_CHECK(); #define LARGE_INDEX(TENSOR_TYPE, TYPE, \ DST_DIM, SRC_DIM, IDX_DIM, IDX_IS_MAJOR) \ @@ -141,7 +143,8 @@ void THCTensor_(indexCopy)(THCState *state, THCTensor *dst, int dim, THCudaLongT dstInfo, srcInfo, indicesInfo, \ dstCopyDim, srcCopyDim, srcTotalSize, \ (IDX_IS_MAJOR) ? sliceSize : numIndices, \ - dstCopyDimSize); + dstCopyDimSize); \ + C10_CUDA_KERNEL_LAUNCH_CHECK(); dim3 smallIndexGrid(std::min(THCCeilDiv(sliceSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8))); dim3 smallIndexBlock(std::min(sliceSize, (ptrdiff_t)128)); @@ -307,11 +310,12 @@ void THCTensor_(indexFill)(THCState *state, THCTensor *dst, int dim, THCudaLongT int mpc = at::cuda::getCurrentDeviceProperties()->multiProcessorCount; -#define SMALL_INDEX(TENSOR_TYPE, TYPE, DST_DIM, IDX_DIM) \ +#define SMALL_INDEX(TENSOR_TYPE, TYPE, DST_DIM, IDX_DIM) \ indexFillSmallIndex \ - <<>>( \ - dstInfo, indicesInfo, \ - dstFillDim, sliceSize, dstFillDimSize, val); + <<>>( \ + dstInfo, indicesInfo, \ + dstFillDim, sliceSize, dstFillDimSize, val); \ + C10_CUDA_KERNEL_LAUNCH_CHECK(); #define LARGE_INDEX(TENSOR_TYPE, TYPE, DST_DIM, IDX_DIM, IDX_IS_MAJOR) \ indexFillLargeIndex \ @@ -319,7 +323,8 @@ void THCTensor_(indexFill)(THCState *state, THCTensor *dst, int dim, THCudaLongT dstInfo, indicesInfo, \ dstFillDim, sliceSize * numIndices, \ (IDX_IS_MAJOR) ? 
sliceSize : numIndices, \ - dstFillDimSize, val); + dstFillDimSize, val); \ + C10_CUDA_KERNEL_LAUNCH_CHECK(); dim3 smallIndexGrid(std::min(THCCeilDiv(sliceSize, (ptrdiff_t)128), (ptrdiff_t)(mpc * 8))); dim3 smallIndexBlock(std::min(sliceSize, (ptrdiff_t)128)); diff --git a/aten/src/THC/generic/THCTensorMathMagma.cu b/aten/src/THC/generic/THCTensorMathMagma.cu index 8c0dac0aa686..216a96443887 100644 --- a/aten/src/THC/generic/THCTensorMathMagma.cu +++ b/aten/src/THC/generic/THCTensorMathMagma.cu @@ -2,6 +2,8 @@ #define THC_GENERIC_FILE "THC/generic/THCTensorMathMagma.cu" #else +#include + #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) #ifdef USE_MAGMA @@ -171,8 +173,10 @@ void THCTensor_(potri)(THCState *state, THCTensor *ra_, THCTensor *a, bool upper dim3 threads(128); if (uplo == 'U') { THCTensor_(copyUpperSymmetric)<<>>(input_data, n, len); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { THCTensor_(copyLowerSymmetric)<<>>(input_data, n, len); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } THCTensor_(freeCopyTo)(state, input, ra_); diff --git a/aten/src/THC/generic/THCTensorMathReduce.cu b/aten/src/THC/generic/THCTensorMathReduce.cu index 76f470ce7dfb..ce2f124215ca 100644 --- a/aten/src/THC/generic/THCTensorMathReduce.cu +++ b/aten/src/THC/generic/THCTensorMathReduce.cu @@ -41,9 +41,11 @@ void THCTensor_(renorm)(THCState *state, THCTensor* self, THCTensor* src, scalar dim3 threads(32); THCTensor_kernel_renorm - <<>> - (THCTensor_(data)(state, data), scalar_cast(value), size, scalar_cast(maxnorm)); + <<>>(THCTensor_(data)(state, data), + scalar_cast(value), size, scalar_cast(maxnorm)); + // Do not replace with C10_CUDA_KERNEL_LAUNCH_CHECK() yet as it exhibits different behaviour from THError(). + // THError() calls the an error handler, or throws std::runtime_error if a custom handler hasn't been registered. cudaError_t errcode = cudaGetLastError(); if(errcode != cudaSuccess) THError(cudaGetErrorString(errcode)); diff --git a/aten/src/THC/generic/THCTensorMode.cu b/aten/src/THC/generic/THCTensorMode.cu index 9fe955f3cf8d..8c428c9a5d1b 100644 --- a/aten/src/THC/generic/THCTensorMode.cu +++ b/aten/src/THC/generic/THCTensorMode.cu @@ -2,6 +2,7 @@ #define THC_GENERIC_FILE "THC/generic/THCTensorMode.cu" #else +#include #include void THCTensor_(calculateMode)(THCState *state, @@ -235,14 +236,14 @@ void THCTensor_(mode)(THCState *state, // Macro that calls kernel --> note that we set the block dimensions here, and // the amount of shared memory - #define HANDLE_MODE(SIZE) \ - { \ - dim3 blockSize(SIZE / 2); \ -\ - int memsize = (sizeof(scalar_t) * SIZE) + (2 * SIZE * sizeof(unsigned int)); \ - computeMode \ - <<>>( \ - THCTensor_(data)(state, contiguous), tiValues, tiIndices, sliceSize); \ + #define HANDLE_MODE(SIZE) \ + { \ + const dim3 blockSize(SIZE / 2); \ + const auto memsize = (sizeof(scalar_t) * SIZE) + (2 * SIZE * sizeof(unsigned int)); \ + computeMode \ + <<>>( \ + THCTensor_(data)(state, contiguous), tiValues, tiIndices, sliceSize); \ + C10_CUDA_KERNEL_LAUNCH_CHECK(); \ } // Tradeoff between compilation time and the number of specializations. 
Ideally we would have diff --git a/aten/src/THC/generic/THCTensorRandom.cu b/aten/src/THC/generic/THCTensorRandom.cu index f3ca8bf93b1b..1ef540ba3302 100644 --- a/aten/src/THC/generic/THCTensorRandom.cu +++ b/aten/src/THC/generic/THCTensorRandom.cu @@ -5,6 +5,7 @@ #include #include #include +#include #include #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) @@ -39,6 +40,8 @@ void THCTensor_(multinomialAliasSetup)(THCState *state, THCTensor *_probs, THCud THCudaLongTensor_data(state, larger_short), one, inputsize ); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + at::Tensor smaller_short_wrapped = THTensor_wrap(smaller_short); at::Tensor smaller_wrapped = THTensor_wrap(smaller); at::Tensor larger_short_wrapped = THTensor_wrap(larger_short); @@ -57,6 +60,8 @@ void THCTensor_(multinomialAliasSetup)(THCState *state, THCTensor *_probs, THCud THCudaLongTensor_data(state, larger_short), inputsize - h_large_c, h_large_c ); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + scalar_t q_max = at::max(THTensor_wrap(_q)).item(); condDiv<<< inputBlockDim, BLOCK_SIZE, 0, c10::cuda::getCurrentCUDAStream()>>>( @@ -64,6 +69,7 @@ void THCTensor_(multinomialAliasSetup)(THCState *state, THCTensor *_probs, THCud THCudaLongTensor_data(state, _J), inputsize, q_max ); + C10_CUDA_KERNEL_LAUNCH_CHECK(); THCudaLongTensor_free(state, smaller); THCudaLongTensor_free(state, larger); @@ -104,6 +110,8 @@ void THCTensor_(multinomialAliasDraw)(THCState *state, THCudaLongTensor *self, T THCTensor_(data)(state, uniform), THCTensor_(data)(state, bernoulli) ); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + THCTensor_(free)(state, uniform); THCTensor_(free)(state, bernoulli); } diff --git a/aten/src/THC/generic/THCTensorScatterGather.cu b/aten/src/THC/generic/THCTensorScatterGather.cu index 832539d370ce..a1ab8d63f163 100644 --- a/aten/src/THC/generic/THCTensorScatterGather.cu +++ b/aten/src/THC/generic/THCTensorScatterGather.cu @@ -2,10 +2,13 @@ #define THC_GENERIC_FILE "THC/generic/THCTensorScatterGather.cu" #else +#include + #define RUN(TYPE, DIMS, REAL) \ - THCudaTensor_gatherKernel \ - <<>>( \ - tensorInfo, srcInfo, indexInfo, dim, (TYPE)totalElements); + THCudaTensor_gatherKernel \ + <<>>( \ + tensorInfo, srcInfo, indexInfo, dim, (TYPE)totalElements); \ + C10_CUDA_KERNEL_LAUNCH_CHECK(); void THCTensor_(gather)(THCState* state, THCTensor *tensor, THCTensor *src, int dim, THCudaLongTensor *index) { @@ -61,19 +64,15 @@ void THCTensor_(gather)(THCState* state, THCTensor *tensor, switch (indexInfo.dims) { case 1: RUN(unsigned int, 1, scalar_t); - THCudaCheck(cudaGetLastError()); break; case 2: RUN(unsigned int, 2, scalar_t); - THCudaCheck(cudaGetLastError()); break; case 3: RUN(unsigned int, 3, scalar_t); - THCudaCheck(cudaGetLastError()); break; default: RUN(unsigned int, -1, scalar_t); - THCudaCheck(cudaGetLastError()); break; } } else { @@ -84,7 +83,6 @@ void THCTensor_(gather)(THCState* state, THCTensor *tensor, TensorInfo indexInfo = getTensorInfo(state, index); RUN(uint64_t, -1, scalar_t); - THCudaCheck(cudaGetLastError()); } } diff --git a/aten/src/THC/generic/THCTensorSort.cu b/aten/src/THC/generic/THCTensorSort.cu index b4da00a98b7f..e378fe03358e 100644 --- a/aten/src/THC/generic/THCTensorSort.cu +++ b/aten/src/THC/generic/THCTensorSort.cu @@ -2,6 +2,8 @@ #define THC_GENERIC_FILE "THC/generic/THCTensorSort.cu" #else +#include + // In alignment with default sort on a c++ map, this function // will permute key and value tensors identically, and // in such a way that the 'key' tensor is ordered numerically @@ -53,8 
+55,9 @@ void THCTensor_(sortKeyValueInplace)(THCState* state, dim3 block(blockSize); \ \ if (dir) { \ - bitonicSortKVInPlace, TYPE, SIZE> \ - <<>>( \ + bitonicSortKVInPlace, TYPE, SIZE> \ + <<>>( \ keyInfo, \ keySlices, \ (TYPE) keySliceSize, \ @@ -62,16 +65,19 @@ void THCTensor_(sortKeyValueInplace)(THCState* state, valueInfo, \ (TYPE) valueInfo.strides[collapseValueDim], \ GTComp()); \ + C10_CUDA_KERNEL_LAUNCH_CHECK(); \ } else { \ - bitonicSortKVInPlace, TYPE, SIZE> \ - <<>>( \ + bitonicSortKVInPlace, TYPE, SIZE> \ + <<>>( \ keyInfo, \ keySlices, \ (TYPE) keySliceSize, \ (TYPE) keyInfo.strides[collapseKeyDim], \ valueInfo, \ (TYPE) valueInfo.strides[collapseValueDim], \ - LTComp()); \ + LTComp()); \ + C10_CUDA_KERNEL_LAUNCH_CHECK(); \ } \ } while (0) @@ -147,8 +153,6 @@ void THCTensor_(sortKeyValueInplace)(THCState* state, #undef HANDLE_CASE #undef HANDLE_SORT_CASE #undef HANDLE_A_CASE - - THCudaCheck(cudaGetLastError()); } void THCTensor_(sortViaThrust)(THCState* state, diff --git a/aten/src/THC/generic/THCTensorTopK.cu b/aten/src/THC/generic/THCTensorTopK.cu index 357b3f2e22f3..8d7bf7701c04 100644 --- a/aten/src/THC/generic/THCTensorTopK.cu +++ b/aten/src/THC/generic/THCTensorTopK.cu @@ -3,6 +3,7 @@ #else #include +#include void THCTensor_(topk)(THCState* state, THCTensor *topK, @@ -37,8 +38,8 @@ void THCTensor_(topk)(THCState* state, // is provided to the kernel for the arguments. #define RUN_K(INDEX_T, DIM, DIR) \ - gatherTopK \ - <<>>( \ + gatherTopK \ + <<>>( \ inputInfo, \ static_cast(sliceSize), \ static_cast(k), \ @@ -50,7 +51,8 @@ void THCTensor_(topk)(THCState* state, static_cast(topKSlices), \ static_cast(topKInfo.strides[collapseTopKDim]), \ indicesInfo, \ - static_cast(indicesInfo.strides[collapseIndicesDim])) + static_cast(indicesInfo.strides[collapseIndicesDim])); \ + C10_CUDA_KERNEL_LAUNCH_CHECK() #define RUN_DIR(INDEX_T, DIM) \ if (dir) { \ @@ -71,10 +73,10 @@ void THCTensor_(topk)(THCState* state, } #define RUN_T(INDEX_T) \ - TensorInfo inputInfo = \ - getTensorInfo(state, input); \ - TensorInfo topKInfo = \ - getTensorInfo(state, topK); \ + TensorInfo inputInfo = \ + getTensorInfo(state, input); \ + TensorInfo topKInfo = \ + getTensorInfo(state, topK); \ TensorInfo indicesInfo = \ getTensorInfo(state, indices); \ \ From 23e98e73f6fc6a33c4f41b5f37b8921cbf349fdb Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Mon, 14 Dec 2020 17:10:28 -0800 Subject: [PATCH 240/250] Fix Windows CUDA-11.1 test jobs (#49376) Summary: Fixes typo introduced by https://github.com/pytorch/pytorch/pull/49156 Pull Request resolved: https://github.com/pytorch/pytorch/pull/49376 Reviewed By: seemethere Differential Revision: D25548524 Pulled By: malfet fbshipit-source-id: 6aa3d903f6105c576c009f05a6b9d29f32b35c47 --- .circleci/scripts/windows_cuda_install.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/scripts/windows_cuda_install.sh b/.circleci/scripts/windows_cuda_install.sh index e5d0923032e8..04a4c2ed43ff 100644 --- a/.circleci/scripts/windows_cuda_install.sh +++ b/.circleci/scripts/windows_cuda_install.sh @@ -14,7 +14,7 @@ else exit 1 fi -if [[ "$CUDA_VERSION" =~ ^10.* && "${JOB_EXECUTOR}" == "windows-with-nvidia-gpu" ]]; then +if [[ "$CUDA_VERSION" =~ ^11.* && "${JOB_EXECUTOR}" == "windows-with-nvidia-gpu" ]]; then cuda_install_packages="${cuda_install_packages} Display.Driver" fi From 50b361a8212cbb526752cca7a45e82f2e623d978 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Mon, 14 Dec 2020 17:21:49 -0800 Subject: [PATCH 241/250] Enable BF16 for indexing on 
CUDA (#48801) Summary: Fixes #{issue number} Pull Request resolved: https://github.com/pytorch/pytorch/pull/48801 Reviewed By: glaringlee Differential Revision: D25542914 Pulled By: ngimel fbshipit-source-id: 4113eb2729d15b40a89268172cc37122b5213624 --- aten/src/ATen/native/cuda/Indexing.cu | 149 ++++++++++++-------------- test/test_indexing.py | 2 +- test/test_torch.py | 3 +- 3 files changed, 70 insertions(+), 84 deletions(-) diff --git a/aten/src/ATen/native/cuda/Indexing.cu b/aten/src/ATen/native/cuda/Indexing.cu index 2b81460c1a4b..d630d727019f 100644 --- a/aten/src/ATen/native/cuda/Indexing.cu +++ b/aten/src/ATen/native/cuda/Indexing.cu @@ -232,19 +232,17 @@ void index_put_accum_kernel(Tensor & self, TensorList indices, const Tensor & va AT_DISPATCH_ALL_TYPES_AND3(at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16, value_.scalar_type(), "indexing_backward", [&] { - AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "indexing_backward", [&] { - indexing_backward_kernel<<>>( - sorted_indices.data_ptr(), - orig_indices.data_ptr(), - value_.data_ptr(), - src_.data_ptr(), - num_indices, - sliceSize, - strideBefore, - nElemBefore); - }); - C10_CUDA_KERNEL_LAUNCH_CHECK(); + indexing_backward_kernel<<>>( + sorted_indices.data_ptr(), + orig_indices.data_ptr(), + value_.data_ptr(), + src_.data_ptr(), + num_indices, + sliceSize, + strideBefore, + nElemBefore); }); + C10_CUDA_KERNEL_LAUNCH_CHECK(); if (permuted) self.copy_(src_.permute(inversePerm)); } @@ -508,77 +506,73 @@ Tensor& index_add_cuda_(Tensor & self, int64_t dim, const Tensor & index, const cuda::detail::canUse32BitIndexMath(source) && cuda::detail::canUse32BitIndexMath(index)) { AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "index_add", [&] { - AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "index_add", [&] { - cuda::detail::TensorInfo selfInfo = - cuda::detail::getTensorInfo(self_); - int selfAddDim = selfInfo.collapseDims(dim); - selfInfo.reduceDim(selfAddDim); - AT_DISPATCH_INDEX_TYPES(index.scalar_type(), "index_add_cuda_", [&] () { - auto sourceInfo = - cuda::detail::getTensorInfo(source_); - int sourceAddDim = sourceInfo.collapseDims(dim); - sourceInfo.reduceDim(sourceAddDim); - - auto indexInfo = - cuda::detail::getTensorInfo(index); - indexInfo.collapseDims(); - - // A reasonable choice for when to have each thread iterate over - // index to choose - if (numIndex <= 16) { - if (selfInfo.dims == 1 && sourceInfo.dims == 1 && indContig) { - SMALL_INDEX(scalar_t, index_t, unsigned int, 1, 1, -2); - } else if (selfInfo.dims == 2 && sourceInfo.dims == 2 && indContig) { - SMALL_INDEX(scalar_t, index_t, unsigned int, 2, 2, -2); - } else if (selfInfo.dims == 3 && sourceInfo.dims == 3 && indContig) { - SMALL_INDEX(scalar_t, index_t, unsigned int, 3, 3, -2); + cuda::detail::TensorInfo selfInfo = + cuda::detail::getTensorInfo(self_); + int selfAddDim = selfInfo.collapseDims(dim); + selfInfo.reduceDim(selfAddDim); + AT_DISPATCH_INDEX_TYPES(index.scalar_type(), "index_add_cuda_", [&] () { + auto sourceInfo = + cuda::detail::getTensorInfo(source_); + int sourceAddDim = sourceInfo.collapseDims(dim); + sourceInfo.reduceDim(sourceAddDim); + + auto indexInfo = + cuda::detail::getTensorInfo(index); + indexInfo.collapseDims(); + + // A reasonable choice for when to have each thread iterate over + // index to choose + if (numIndex <= 16) { + if (selfInfo.dims == 1 && sourceInfo.dims == 1 && indContig) { + SMALL_INDEX(scalar_t, index_t, unsigned int, 1, 1, -2); + } else if (selfInfo.dims == 2 && 
sourceInfo.dims == 2 && indContig) { + SMALL_INDEX(scalar_t, index_t, unsigned int, 2, 2, -2); + } else if (selfInfo.dims == 3 && sourceInfo.dims == 3 && indContig) { + SMALL_INDEX(scalar_t, index_t, unsigned int, 3, 3, -2); + } else { + SMALL_INDEX(scalar_t, index_t, unsigned int, -1, -1, -1); + } + } else { + bool indexIsMajor = indexShouldBeMajor(selfInfo, selfAddDim); + + if (selfInfo.dims == 1 && sourceInfo.dims == 1 && indContig) { + LARGE_INDEX(scalar_t, index_t, unsigned int, 1, 1, -2, true); + } else if (selfInfo.dims == 2 && sourceInfo.dims == 2 && indContig) { + if (indexIsMajor) { + LARGE_INDEX(scalar_t, index_t, unsigned int, 2, 2, -2, true); } else { - SMALL_INDEX(scalar_t, index_t, unsigned int, -1, -1, -1); + LARGE_INDEX(scalar_t, index_t, unsigned int, 2, 2, -2, false); } - } else { - bool indexIsMajor = indexShouldBeMajor(selfInfo, selfAddDim); - - if (selfInfo.dims == 1 && sourceInfo.dims == 1 && indContig) { - LARGE_INDEX(scalar_t, index_t, unsigned int, 1, 1, -2, true); - } else if (selfInfo.dims == 2 && sourceInfo.dims == 2 && indContig) { - if (indexIsMajor) { - LARGE_INDEX(scalar_t, index_t, unsigned int, 2, 2, -2, true); - } else { - LARGE_INDEX(scalar_t, index_t, unsigned int, 2, 2, -2, false); - } - } else if (selfInfo.dims == 3 && sourceInfo.dims == 3 && indContig) { - if (indexIsMajor) { - LARGE_INDEX(scalar_t, index_t, unsigned int, 3, 3, -2, true); - } else { - LARGE_INDEX(scalar_t, index_t, unsigned int, 3, 3, -2, false); - } + } else if (selfInfo.dims == 3 && sourceInfo.dims == 3 && indContig) { + if (indexIsMajor) { + LARGE_INDEX(scalar_t, index_t, unsigned int, 3, 3, -2, true); } else { - LARGE_INDEX(scalar_t, index_t, unsigned int, -1, -1, -1, true); + LARGE_INDEX(scalar_t, index_t, unsigned int, 3, 3, -2, false); } + } else { + LARGE_INDEX(scalar_t, index_t, unsigned int, -1, -1, -1, true); } - }); + } }); }); } else { AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "index_add", [&] { - AT_SKIP_BFLOAT16_IF_NOT_ROCM(scalar_t, "index_add", [&] { - cuda::detail::TensorInfo selfInfo = - cuda::detail::getTensorInfo(self_); - int selfAddDim = selfInfo.collapseDims(dim); - selfInfo.reduceDim(selfAddDim); - - cuda::detail::TensorInfo sourceInfo = - cuda::detail::getTensorInfo(source_); - int sourceAddDim = sourceInfo.collapseDims(dim); - sourceInfo.reduceDim(sourceAddDim); - - AT_DISPATCH_INDEX_TYPES(index.scalar_type(), "index_add_cuda_", [&] () { - cuda::detail::TensorInfo indexInfo = - cuda::detail::getTensorInfo(index); - indexInfo.collapseDims(); - - LARGE_INDEX(scalar_t, index_t, uint64_t, -1, -1, -1, true); - }); + cuda::detail::TensorInfo selfInfo = + cuda::detail::getTensorInfo(self_); + int selfAddDim = selfInfo.collapseDims(dim); + selfInfo.reduceDim(selfAddDim); + + cuda::detail::TensorInfo sourceInfo = + cuda::detail::getTensorInfo(source_); + int sourceAddDim = sourceInfo.collapseDims(dim); + sourceInfo.reduceDim(sourceAddDim); + + AT_DISPATCH_INDEX_TYPES(index.scalar_type(), "index_add_cuda_", [&] () { + cuda::detail::TensorInfo indexInfo = + cuda::detail::getTensorInfo(index); + indexInfo.collapseDims(); + + LARGE_INDEX(scalar_t, index_t, uint64_t, -1, -1, -1, true); }); }); } @@ -839,17 +833,10 @@ Tensor& index_select_out_cuda(Tensor& out, const Tensor& self, int64_t dim, TORCH_CHECK(self.dim() <= MAX_TENSORINFO_DIMS, DIM_WARNING); TORCH_CHECK(index.dim() <= MAX_TENSORINFO_DIMS, DIM_WARNING); -#if defined(__HIP_PLATFORM_HCC__) AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( at::ScalarType::Half, 
at::ScalarType::Bool, at::ScalarType::BFloat16, out.scalar_type(), "index_select_cuda", [&] { index_select_out_cuda_impl(out, self, dim, index); }); -#else // __HIP_PLATFORM_HCC__ - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2( - at::ScalarType::Half, at::ScalarType::Bool, - out.scalar_type(), "index_select_cuda", - [&] { index_select_out_cuda_impl(out, self, dim, index); }); -#endif // __HIP_PLATFORM_HCC__ return out; } diff --git a/test/test_indexing.py b/test/test_indexing.py index f3430a158d89..b92fd94e8cbd 100644 --- a/test/test_indexing.py +++ b/test/test_indexing.py @@ -764,7 +764,7 @@ def test_int_indices(self, device): @dtypes(torch.float, torch.bfloat16, torch.long, torch.bool) @dtypesIfCPU(torch.float, torch.long, torch.bool, torch.bfloat16) - @dtypesIfCUDA(torch.half, torch.long, torch.bool) + @dtypesIfCUDA(torch.half, torch.long, torch.bool, torch.bfloat16) def test_index_put_src_datatype(self, device, dtype): src = torch.ones(3, 2, 4, device=device, dtype=dtype) vals = torch.ones(3, 2, 4, device=device, dtype=dtype) diff --git a/test/test_torch.py b/test/test_torch.py index 7773d6ce7703..b4d9ad6f23c0 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -27,7 +27,7 @@ from multiprocessing.reduction import ForkingPickler from torch.testing._internal.common_device_type import ( instantiate_device_type_tests, - skipCUDAIfNoMagma, skipCUDAIfRocm, skipCUDAIfNotRocm, + skipCUDAIfNoMagma, skipCUDAIfRocm, onlyCUDA, onlyCPU, dtypes, dtypesIfCUDA, dtypesIfCPU, deviceCountAtLeast, PYTORCH_CUDA_MEMCHECK, largeTensorTest, onlyOnCPUAndCUDA, @@ -6256,7 +6256,6 @@ class TestDevicePrecision(TestCase): exact_dtype = True @onlyCUDA - @skipCUDAIfNotRocm def test_index_add_bfloat16(self, device): inp_tensor = torch.randn(5, 3, device='cpu').bfloat16() t = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=torch.bfloat16, device='cpu') From 626b8c0cf2fd6247827c02d363eaf4e399eda62f Mon Sep 17 00:00:00 2001 From: Bert Maher Date: Mon, 14 Dec 2020 17:38:28 -0800 Subject: [PATCH 242/250] [te] Ban uint8 tensors from fusion groups (#49247) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49247 uint8's expose all kind of corner cases in type promotion. As an example, consider: ``` >>> torch.tensor([1], dtype=torch.uint8).lt(-1) tensor([True]) >>> torch.tensor([1], dtype=torch.uint8).lt(torch.tensor(-1)) tensor([True]) >>> torch.tensor([1], dtype=torch.uint8).lt(torch.tensor([-1])) tensor([False]) ``` the difference is how promotions involving scalars (or 0-dim tensors, which are treated like scalars) are prioritized compared to tensor dtypes. Per eellison, the order is something like: 1. Tensor FP types 2. Scalar FP types 3. Tensor Int types 4. Scalar Int types The logic for this is here: https://github.com/pytorch/pytorch/blob/c73e97033a3aef97a5685588ea014d54a5cc11cc/aten/src/ATen/native/TypeProperties.cpp#L93 AFAICT the effects are mainly visible for the unsigned byte type (the only unsigned type, besides bool) since the others degrade more or less gracefully. It's hard to re-use this logic as is in TensorIterator/TypeProperties, and it's complicated enough that it's not worth re-implementing in TE unless there's evidence that it matters for real models. 
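For reference, the same priority order can be probed with `torch.result_type`; the following is a minimal REPL sketch (illustration only, not part of this patch, and assuming current eager-mode promotion rules) of how the three `lt` cases above resolve:

```
>>> import torch
>>> t = torch.tensor([1], dtype=torch.uint8)
>>> torch.result_type(t, -1)                  # tensor int outranks scalar int
torch.uint8
>>> torch.result_type(t, torch.tensor(-1))    # 0-dim tensors are treated like scalars
torch.uint8
>>> torch.result_type(t, torch.tensor([-1]))  # 1-dim int64 tensor outranks uint8 tensor
torch.int64
```

In the first two cases the comparison runs in uint8, so `-1` wraps to 255 and `1 < 255` is `True`; only the third case promotes to a signed type and gives the expected `False`.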
ghstack-source-id: 118555597 Test Plan: `buck test //caffe2/test:jit` Reviewed By: eellison Differential Revision: D25489035 fbshipit-source-id: db3ab84286d472fd8a247aeb7b36c441293aad85 --- test/test_jit_fuser_te.py | 138 +++++---------------- test/test_tensorexpr.py | 4 +- torch/csrc/jit/passes/tensorexpr_fuser.cpp | 32 +++-- 3 files changed, 53 insertions(+), 121 deletions(-) diff --git a/test/test_jit_fuser_te.py b/test/test_jit_fuser_te.py index 956a115e6d56..8b04418fa640 100644 --- a/test/test_jit_fuser_te.py +++ b/test/test_jit_fuser_te.py @@ -71,6 +71,19 @@ def setUp(self): torch._C._jit_set_texpr_fuser_enabled(True) self.devices = ['cpu'] if not torch.cuda.is_available() else ['cpu', 'cuda'] + self.int_dtypes = [ + torch.int8, + torch.int16, + torch.int32, + torch.int64, + torch.bool, + ] + self.fp_dtypes = [ + torch.float16, + torch.float32, + torch.float64, + ] + self.dtypes = self.int_dtypes + self.fp_dtypes def tearDown(self): torch._C._jit_set_profiling_executor(self.old_profiling_executor) @@ -461,21 +474,13 @@ def test_bitwise_ops(self): def apply(fn): return lambda x, y, z: fn(fn(x, y), z) - dtypes = [ - torch.int8, - torch.uint8, - torch.int16, - torch.int32, - torch.int64, - torch.bool, - ] binary_ops = [ operator.__and__, operator.__or__, operator.__xor__ ] devices = self.devices - for dtype, op, device in product(dtypes, binary_ops, devices): + for dtype, op, device in product(self.int_dtypes, binary_ops, devices): try: x = self.data_for(dtype, device) y = self.data_for(dtype, device) @@ -500,20 +505,12 @@ def test_minmax_int_ops(self): def apply(fn): return lambda x, y, z: fn(fn(x, y), z) - dtypes = [ - torch.int8, - torch.uint8, - torch.int16, - torch.int32, - torch.int64, - torch.bool, - ] binary_ops = [ torch.min, torch.max ] devices = self.devices - for dtype, op, device in product(dtypes, binary_ops, devices): + for dtype, op, device in product(self.int_dtypes, binary_ops, devices): try: x = self.data_for(dtype, device) y = self.data_for(dtype, device) @@ -1215,17 +1212,6 @@ def test_unary_ops(self): def apply(fn): return lambda x: fn(x) - dtypes = [ - torch.int8, - torch.uint8, - torch.int16, - torch.int32, - torch.int64, - torch.float16, - torch.float32, - torch.float64, - torch.bool, - ] unary_ops = [ torch.lgamma, torch.sigmoid, @@ -1262,7 +1248,7 @@ def apply(fn): lambda x: torch.clamp(x, -10, 10), ] sizes = [(1,), (2,), (4, 4)] - for dtype, op, device, size in product(dtypes, unary_ops, self.devices, sizes): + for dtype, op, device, size in product(self.dtypes, unary_ops, self.devices, sizes): try: x = self.data_for(dtype, device, size=size) fn = apply(op) @@ -1286,18 +1272,7 @@ def test_binary_ops(self): def apply(fn): return lambda x, y: fn(x, y) - dtypes = [ - # FIXME: Fails in IR Eval: torch.int8 and_ cpu - torch.int8, - torch.uint8, - torch.int16, - torch.int32, - torch.int64, - torch.float16, - torch.float32, - torch.float64, - torch.bool, - ] + # FIXME: Fails in IR Eval: torch.int8 and_ cpu binary_ops = [ operator.__and__, operator.__or__, @@ -1329,7 +1304,7 @@ def apply(fn): torch.remainder, ] devices = self.devices - for dtype, op, device in product(dtypes, binary_ops, devices): + for dtype, op, device in product(self.dtypes, binary_ops, devices): try: x = self.data_for(dtype, device) y = self.data_for(dtype, device) @@ -1355,18 +1330,7 @@ def test_binary_tensor_scalar_ops(self): def apply_with_scalar(fn, scalar): return lambda x: fn(x, scalar) - dtypes = [ - torch.int8, - torch.uint8, - torch.int16, - torch.int32, - # FIXME: Fails in IR Eval: 
torch.int64 and_ cpu - torch.int64, - torch.float16, - torch.float32, - torch.float64, - torch.bool - ] + # FIXME: Fails in IR Eval: torch.int64 and_ cpu binary_ops = [ operator.__and__, operator.__or__, @@ -1376,11 +1340,9 @@ def apply_with_scalar(fn, scalar): torch.mul, torch.eq, torch.ne, - - # FIXME: fails with dtype=uint8, scalar=-1 - # torch.ge, - # torch.lt, - # torch.gt, + torch.ge, + torch.lt, + torch.gt, # FIXME: segfaults on CPU backend # operator.__rshift__, @@ -1390,7 +1352,7 @@ def apply_with_scalar(fn, scalar): # Maybe we should split this into separate tests to speed it up by # only using scalar values relevant to particular ops scalars = [1.5, 3, 0, -2.0, -1] - for dtype, op, device, scalar in product(dtypes, binary_ops, devices, scalars): + for dtype, op, device, scalar in product(self.dtypes, binary_ops, devices, scalars): try: x = self.data_for(dtype, device) fn = apply_with_scalar(op, scalar) @@ -1413,17 +1375,6 @@ def test_binary_div_ops(self): def apply_with_scalar(fn, scalar): return lambda x: fn(x, scalar) - dtypes = [ - torch.int8, - torch.uint8, - torch.int16, - torch.int32, - torch.int64, - torch.float16, - torch.float32, - torch.float64, - torch.bool - ] binary_ops = [ torch.div, torch.remainder, @@ -1433,7 +1384,7 @@ def apply_with_scalar(fn, scalar): # Maybe we should split this into separate tests to speed it up by # only using scalar values relevant to particular ops scalars = [1.5, 3, -2.0, -1] # skip 0 - for dtype, op, device, scalar in product(dtypes, binary_ops, devices, scalars): + for dtype, op, device, scalar in product(self.dtypes, binary_ops, devices, scalars): try: x = self.data_for(dtype, device) fn = apply_with_scalar(op, scalar) @@ -1457,7 +1408,6 @@ def apply_with_scalar(fn, scalar): dtypes = [ torch.int8, - torch.uint8, torch.int16, torch.int32, torch.int64, @@ -1498,23 +1448,12 @@ def test_ternary_ops(self): def apply(fn): return lambda x, y, z: fn(x, y, z) - dtypes = [ - torch.int8, - torch.uint8, - torch.int16, - torch.int32, - torch.int64, - torch.float16, - torch.float32, - torch.float64, - torch.bool, - ] ternary_ops = [ torch.lerp, torch.addcmul, ] devices = self.devices - for dtype, op, device in product(dtypes, ternary_ops, devices): + for dtype, op, device in product(self.dtypes, ternary_ops, devices): try: x = self.data_for(dtype, device) y = self.data_for(dtype, device) @@ -1540,22 +1479,11 @@ def test_list_ops(self): def apply(fn): return lambda x, y, z: fn([x * x, y * y, z * z]) - dtypes = [ - torch.int8, - torch.uint8, - torch.int16, - torch.int32, - torch.int64, - torch.float16, - torch.float32, - torch.float64, - torch.bool, - ] devices = self.devices list_ops = [ torch.cat, ] - for dtype, op, device in product(dtypes, list_ops, devices): + for dtype, op, device in product(self.dtypes, list_ops, devices): try: x = self.data_for(dtype, device, size=[5, 4, 1, 7]) y = self.data_for(dtype, device, size=[5, 4, 1, 7]) @@ -1580,24 +1508,13 @@ def test_where_ops(self): def apply(fn): return lambda cond, x, y: fn(cond, x, y) - dtypes = [ - torch.int8, - torch.uint8, - torch.int16, - torch.int32, - torch.int64, - torch.float16, - torch.float32, - torch.float64, - torch.bool, - ] ops = [ torch.where, lambda cond, x, y: torch.where(cond, x, 3.1415), lambda cond, x, y: torch.where(cond, 42, y), ] devices = self.devices - for dtype, op, device in product(dtypes, ops, devices): + for dtype, op, device in product(self.dtypes, ops, devices): try: cond = self.data_for(torch.bool, device) x = self.data_for(dtype, device) @@ -1624,6 +1541,7 @@ 
def fn(x): return x * x + x unsupported_dtypes = [ + torch.uint8, torch.bfloat16, torch.complex32, torch.complex64, diff --git a/test/test_tensorexpr.py b/test/test_tensorexpr.py index eada68c9ff92..6cdccf468326 100644 --- a/test/test_tensorexpr.py +++ b/test/test_tensorexpr.py @@ -420,11 +420,11 @@ def easy(x, y): traced = torch.jit.trace( easy, (torch.randint(TENSOR_LEN, (TENSOR_LEN,), dtype=torch.int8), - torch.randint(TENSOR_LEN, (TENSOR_LEN,), dtype=torch.uint8)), + torch.randint(TENSOR_LEN, (TENSOR_LEN,), dtype=torch.int8)), ) a = torch.randint(TENSOR_LEN, (TENSOR_LEN,), dtype=torch.int8) - b = torch.randint(TENSOR_LEN, (TENSOR_LEN,), dtype=torch.uint8) + b = torch.randint(TENSOR_LEN, (TENSOR_LEN,), dtype=torch.int8) x = warmup_and_run_forward(traced, a, b) self.assertLastGraphAllFused() np.testing.assert_allclose((a.numpy() + b.numpy()) * b.numpy(), x.numpy()) diff --git a/torch/csrc/jit/passes/tensorexpr_fuser.cpp b/torch/csrc/jit/passes/tensorexpr_fuser.cpp index 6f587b910866..8a71e52db556 100644 --- a/torch/csrc/jit/passes/tensorexpr_fuser.cpp +++ b/torch/csrc/jit/passes/tensorexpr_fuser.cpp @@ -739,15 +739,29 @@ class TensorExprFuser { }; // clang-format on - // Value is only supported if operands are floats. - if (node->isMemberOf(float_only_operator_set)) { - for (const Value* v : node->inputs()) { - if (auto const& tt = v->type()->cast()) { - auto const& st = tt->scalarType(); - if (!st || !isFloatingType(*st)) { - return false; - } - } else if (!v->type()->cast()) { + for (const Value* v : node->inputs()) { + if (auto const& tt = v->type()->cast()) { + auto const& st = tt->scalarType(); + + // All tensors must be typed. + if (!st) { + return false; + } + + // Byte tensors introduce too many corner cases in type promotion. + // Better not to try to handle them. + if (*st == c10::ScalarType::Byte) { + return false; + } + + // These operators only support floats, because integer divisors need to + // raise ZeroDivisionError. + if (node->isMemberOf(float_only_operator_set) && !isFloatingType(*st)) { + return false; + } + } else if (node->isMemberOf(float_only_operator_set)) { + // Check scalar operands of float-only ops. + if (!v->type()->cast()) { return false; } } From bbeee481c3d5b570f98104082027e721f5f6fc53 Mon Sep 17 00:00:00 2001 From: Zain Patel Date: Mon, 14 Dec 2020 19:14:02 -0800 Subject: [PATCH 243/250] Fix typo in torch.load docstring for the `f` parameter (#49350) Summary: No issue opened for this (that I can see) and it was a fairly small change, so just opening this PR directly! The docstring for `torch.load` had some of parameter descriptions including typos like ``:meth`readline` `` instead of``:meth:`readline` ``. This PR corrects that :) image Pull Request resolved: https://github.com/pytorch/pytorch/pull/49350 Reviewed By: glaringlee Differential Revision: D25543041 Pulled By: mrshenli fbshipit-source-id: 10db04d58dd5b07777bdd51d3fcb3c45dea4c84b --- torch/serialization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/serialization.py b/torch/serialization.py index 7ae6abafa232..ebc5d0a08541 100644 --- a/torch/serialization.py +++ b/torch/serialization.py @@ -524,7 +524,7 @@ def load(f, map_location=None, pickle_module=pickle, **pickle_load_args): deserialization methods using :func:`torch.serialization.register_package`. 
Args: - f: a file-like object (has to implement :meth:`read`, :meth`readline`, :meth`tell`, and :meth`seek`), + f: a file-like object (has to implement :meth:`read`, :meth:`readline`, :meth:`tell`, and :meth:`seek`), or a string or os.PathLike object containing a file name map_location: a function, :class:`torch.device`, string or a dict specifying how to remap storage locations From 900aa4ee975752e25a3a63037df080abca361c97 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Mon, 14 Dec 2020 20:00:57 -0800 Subject: [PATCH 244/250] [PyTorch] remove convenience RecordFunctionCallback interface (#48620) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48620 In preparation for storing bare function pointer (8 bytes) instead of std::function (32 bytes). ghstack-source-id: 118568242 Test Plan: CI Reviewed By: ezyang Differential Revision: D25132183 fbshipit-source-id: 3790cfb5d98479a46cf665b14eb0041a872c13da --- .../src/main/cpp/pytorch_jni_jit.cpp | 6 +- aten/src/ATen/record_function.h | 11 ---- binaries/record_function_benchmark.cc | 7 +- test/cpp/jit/test_misc.cpp | 65 +++++++++---------- torch/csrc/autograd/init.cpp | 4 +- torch/csrc/autograd/profiler_legacy.cpp | 6 +- 6 files changed, 45 insertions(+), 54 deletions(-) diff --git a/android/pytorch_android/src/main/cpp/pytorch_jni_jit.cpp b/android/pytorch_android/src/main/cpp/pytorch_jni_jit.cpp index e4bb4c083160..9cc71f117d93 100644 --- a/android/pytorch_android/src/main/cpp/pytorch_jni_jit.cpp +++ b/android/pytorch_android/src/main/cpp/pytorch_jni_jit.cpp @@ -90,13 +90,13 @@ class PytorchJni : public facebook::jni::HybridClass { #endif #ifdef TRACE_ENABLED - static bool onFunctionEnter( + static std::unique_ptr onFunctionEnter( const at::RecordFunction& fn) { Trace::beginSection(fn.name().str()); - return true; + return nullptr; } - static void onFunctionExit(const at::RecordFunction&) { + static void onFunctionExit(const at::RecordFunction&, at::ObserverContext*) { Trace::endSection(); } #endif diff --git a/aten/src/ATen/record_function.h b/aten/src/ATen/record_function.h index bcd0fbc37e77..6b2e08576068 100644 --- a/aten/src/ATen/record_function.h +++ b/aten/src/ATen/record_function.h @@ -316,17 +316,6 @@ class TORCH_API RecordFunctionCallback { scopes_.fill(true); } - // This interface is for observers that do not pass an ObserverContext object - // between start and end callbacks. 
- explicit RecordFunctionCallback( - std::function start, - std::function end = - [](const RecordFunction&) {}): - start_{[start](const RecordFunction& rf) { start(rf); return nullptr; }}, - end_{[end](const RecordFunction& rf, ObserverContext*) { end(rf); }} { - scopes_.fill(true); - } - RecordFunctionCallback& needsInputs(bool needs_inputs) { needs_inputs_ = needs_inputs; return *this; diff --git a/binaries/record_function_benchmark.cc b/binaries/record_function_benchmark.cc index 53a8bd16f43d..d47cedada40f 100644 --- a/binaries/record_function_benchmark.cc +++ b/binaries/record_function_benchmark.cc @@ -19,11 +19,11 @@ const float kLowSamplingProb = 0.0001; void addTestCallback( double sampling_prob = 1.0, - std::function fn = - [](const at::RecordFunction&) {}) { + std::function(const at::RecordFunction&)> fn = + [](const at::RecordFunction&) { return nullptr; }) { auto cb = at::RecordFunctionCallback( std::move(fn), - [](const at::RecordFunction&) {}) + [](const at::RecordFunction&, at::ObserverContext*) {}) .needsInputs(false); if (sampling_prob < 1.0) { cb.samplingProb(sampling_prob); @@ -111,6 +111,7 @@ int main(int argc, char** argv) { kLowSamplingProb, [&](const at::RecordFunction& fn) { ++cb_count; + return nullptr; } ); diff --git a/test/cpp/jit/test_misc.cpp b/test/cpp/jit/test_misc.cpp index ca4ba0fdb3da..10f36cc8e394 100644 --- a/test/cpp/jit/test_misc.cpp +++ b/test/cpp/jit/test_misc.cpp @@ -739,8 +739,8 @@ void checkScopeCallbacks() { std::string(fn.name().str()) == "test_user_scope") { found_user_scope = true; } - }, - [](const at::RecordFunction&) {})); + return nullptr; + })); bool bad_scope = false; auto pushScopedCallback = [&](at::RecordScope scope, size_t& cnt) { @@ -752,9 +752,8 @@ void checkScopeCallbacks() { } else { bad_scope = true; } - return true; - }, - [](const at::RecordFunction&) {}) + return nullptr; + }) .scopes({scope})); }; @@ -813,8 +812,8 @@ TEST(RecordFunctionTest, Basic) { } else if (fn.scope() == RecordScope::TORCHSCRIPT_FUNCTION) { ts_names.insert(fn.name().str()); } - }, - [](const RecordFunction&) {}) + return nullptr; + }) .needsInputs(true)); TracedTestInputs eager_inputs, jit_inputs; @@ -851,9 +850,8 @@ TEST(RecordFunctionTest, Basic) { if (std::string(fn.name().str()) == "test") { ++sampled_cb_ctr; } - return true; - }, - [](const RecordFunction&) {}) + return nullptr; + }) .samplingProb(sampling_prob)); }; @@ -863,9 +861,8 @@ TEST(RecordFunctionTest, Basic) { if (std::string(fn.name().str()) == "test") { ++non_sampled_cb_ctr; } - return true; - }, - [](const RecordFunction&) {})); + return nullptr; + })); auto handle = setup_sampled_callback(0.5); @@ -908,9 +905,8 @@ TEST(RecordFunctionTest, Basic) { [&fn_names, &mtx](const RecordFunction& fn) { std::lock_guard lock(mtx); fn_names.push_back(fn.name().str()); - return true; - }, - [](const RecordFunction&) {})); + return nullptr; + })); { RecordFunctionGuard g1(false); { @@ -934,8 +930,10 @@ TEST(RecordFunctionTest, Basic) { std::vector ids; auto add_remove_test_add_cb = [&ids](size_t id) { return addGlobalCallback(RecordFunctionCallback( - [&ids, id](const RecordFunction& fn) { ids.push_back(id); }, - [](const RecordFunction&) {})); + [&ids, id](const RecordFunction& fn) { + ids.push_back(id); + return nullptr ; + })); }; auto h1 = add_remove_test_add_cb(1); @@ -972,8 +970,7 @@ TEST(RecordFunctionTest, Basic) { ids.clear(); addGlobalCallback(RecordFunctionCallback( - [&ids](const RecordFunction& fn) { ids.push_back(1); }, - [](const RecordFunction&) {})); + [&ids](const RecordFunction& 
fn) { ids.push_back(1); return nullptr; })); { RECORD_USER_SCOPE("test"); } @@ -983,8 +980,7 @@ TEST(RecordFunctionTest, Basic) { auto th = std::thread([&ids]() { addThreadLocalCallback(RecordFunctionCallback( - [&ids](const RecordFunction& fn) { ids.push_back(2); }, - [](const RecordFunction&) {})); + [&ids](const RecordFunction& fn) { ids.push_back(2); return nullptr; })); { RECORD_USER_SCOPE("test_thread"); } }); @@ -1070,8 +1066,7 @@ TEST(RecordFunctionTest, Basic) { bool ran = false; should_run = false; addGlobalCallback(RecordFunctionCallback( - [&ran](const RecordFunction& fn) { ran = true; }, - [](const RecordFunction&) {}) + [&ran](const RecordFunction& fn) { ran = true; return nullptr; }) .setShouldRun(shouldRunCallback)); { RECORD_USER_SCOPE("test"); } @@ -1093,8 +1088,8 @@ TEST(RecordFunctionTest, Basic) { auto handle = addThreadLocalCallback(RecordFunctionCallback( [&recorded_op](const RecordFunction& fn) { recorded_op = fn.name().str(); - }, - [](const RecordFunction&) {})); + return nullptr; + })); ThreadLocalState state; std::thread t_child([state]() { ThreadLocalStateGuard g_tls(state); @@ -1111,16 +1106,20 @@ TEST(RecordFunctionTest, Basic) { bool has_ids = false; addGlobalCallback( RecordFunctionCallback( - [&has_ids](const RecordFunction& fn) { has_ids = fn.handle() > 0; }, - [](const RecordFunction&) {}) + [&has_ids](const RecordFunction& fn) { + has_ids = fn.handle() > 0; + return nullptr; + }) .needsIds(true)); { RECORD_USER_SCOPE("test"); } TORCH_CHECK(has_ids); clearCallbacks(); has_ids = false; addGlobalCallback(RecordFunctionCallback( - [&has_ids](const RecordFunction& fn) { has_ids = fn.handle() > 0; }, - [](const RecordFunction&) {})); + [&has_ids](const RecordFunction& fn) { + has_ids = fn.handle() > 0; + return nullptr; + })); { RECORD_USER_SCOPE("test"); } TORCH_CHECK(!has_ids); clearCallbacks(); @@ -1138,6 +1137,7 @@ TEST(RecordFunctionTest, OperatorNameOverload) { } else { operator_names.insert("No Operator Name"); } + return nullptr; }) .scopes({at::RecordScope::FUNCTION})); auto t = torch::randn({1, 2, 3}, at::kCPU); @@ -1209,9 +1209,8 @@ TEST(ThreadLocalDebugInfoTest, Basic) { [&done](const RecordFunction&) { checkDebugInfo(c10::DebugInfoKind::TEST_INFO, 42); done = true; - return true; - }, - [](const RecordFunction&) {})); + return nullptr; + })); { c10::DebugInfoGuard guard(c10::DebugInfoKind::TEST_INFO, debug_info); auto t = torch::randn({1, 2, 3}, at::kCPU); diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp index 78336ded0d88..488b7be9bd8a 100644 --- a/torch/csrc/autograd/init.cpp +++ b/torch/csrc/autograd/init.cpp @@ -173,8 +173,8 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) { }); m.def("_set_empty_test_observer", [](bool is_global, double sampling_prob) { auto cb = at::RecordFunctionCallback( - [](const at::RecordFunction&) {}, - [](const at::RecordFunction&) {}) + [](const at::RecordFunction&) { return nullptr; }, + [](const at::RecordFunction&, at::ObserverContext*) {}) .needsInputs(true) .samplingProb(sampling_prob); if (is_global) { diff --git a/torch/csrc/autograd/profiler_legacy.cpp b/torch/csrc/autograd/profiler_legacy.cpp index 88cf22321865..eb52aec8920d 100644 --- a/torch/csrc/autograd/profiler_legacy.cpp +++ b/torch/csrc/autograd/profiler_legacy.cpp @@ -417,7 +417,7 @@ void pushProfilingCallbacksLegacy() { [](const at::RecordFunction& fn) { auto state_ptr = getProfilerTLSState(); if (!state_ptr || state_ptr->config().state == ProfilerState::Disabled) { - return; + return nullptr; } 
bool record_cuda = state_ptr->config().state == ProfilerState::CUDA; @@ -432,8 +432,10 @@ void pushProfilingCallbacksLegacy() { } else { state_ptr->pushRange(fn, record_cuda, msg); } + + return nullptr; }, - [](const at::RecordFunction& fn) { + [](const at::RecordFunction& fn, at::ObserverContext*) { auto state_ptr = getProfilerTLSState(); if (!state_ptr || state_ptr->config().state == ProfilerState::Disabled) { return; From 7e23ee1598c916a835801ca67c74e0ed8b0b7d82 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Mon, 14 Dec 2020 20:00:57 -0800 Subject: [PATCH 245/250] [PyTorch] Use plain old function pointer for RecordFunctionCallback (#48629) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48629 Nearly every non-test callsite doesn't need to capture any variables anyway, and this saves 48 bytes per callback. ghstack-source-id: 118568240 Test Plan: CI Reviewed By: dhruvbird Differential Revision: D25135415 fbshipit-source-id: 5e92dc79da6473ed15d1e381a21ed315879168f3 --- aten/src/ATen/record_function.cpp | 6 +- aten/src/ATen/record_function.h | 20 +- binaries/record_function_benchmark.cc | 10 +- test/cpp/jit/test_misc.cpp | 269 ++++++++++++++---------- torch/csrc/autograd/init.cpp | 4 +- torch/csrc/autograd/profiler_legacy.cpp | 2 +- 6 files changed, 175 insertions(+), 136 deletions(-) diff --git a/aten/src/ATen/record_function.cpp b/aten/src/ATen/record_function.cpp index d1b0acb87c28..a75b1a1295db 100644 --- a/aten/src/ATen/record_function.cpp +++ b/aten/src/ATen/record_function.cpp @@ -277,10 +277,12 @@ class CallbackManager { bool is_start) { try { if (is_start) { - ctx = rfcb.start()(rf); + ctx = rfcb.start() ? rfcb.start()(rf) : nullptr; } else { - rfcb.end()(rf, ctx.get()); + if (rfcb.end()) { + rfcb.end()(rf, ctx.get()); + } } return true; } catch (const std::exception &e) { diff --git a/aten/src/ATen/record_function.h b/aten/src/ATen/record_function.h index 6b2e08576068..e9939667feb7 100644 --- a/aten/src/ATen/record_function.h +++ b/aten/src/ATen/record_function.h @@ -305,14 +305,16 @@ struct TORCH_API RecordFunction { */ class TORCH_API RecordFunctionCallback { public: + using StartCallback = std::unique_ptr(*)(const RecordFunction&); + using EndCallback = void (*)(const RecordFunction&, ObserverContext*); + // This interface supports observers that require passing an ObserverContext // between start and end callbacks. 
explicit RecordFunctionCallback( - std::function(const RecordFunction&)> start, - std::function end = - [](const RecordFunction&, ObserverContext*) {}): - start_(std::move(start)), - end_(std::move(end)) { + StartCallback start, + EndCallback end = nullptr) : + start_(start), + end_(end) { scopes_.fill(true); } @@ -368,18 +370,18 @@ class TORCH_API RecordFunctionCallback { return scopes_[(size_t)sc]; } - inline const std::function(const RecordFunction&)>& start() const { + inline StartCallback start() const { return start_; } - inline const std::function& end() const { + inline EndCallback end() const { return end_; } private: friend class CallbackManager; - std::function(const RecordFunction&)> start_; - std::function end_; + StartCallback start_; + EndCallback end_; bool(*should_run_)(const RecordFunctionCallback&) = nullptr; double sampling_prob_ = 1.0; std::array(RecordScope::NUM_SCOPES)> scopes_ = {}; diff --git a/binaries/record_function_benchmark.cc b/binaries/record_function_benchmark.cc index d47cedada40f..c80f46d75652 100644 --- a/binaries/record_function_benchmark.cc +++ b/binaries/record_function_benchmark.cc @@ -19,10 +19,10 @@ const float kLowSamplingProb = 0.0001; void addTestCallback( double sampling_prob = 1.0, - std::function(const at::RecordFunction&)> fn = - [](const at::RecordFunction&) { return nullptr; }) { + at::RecordFunctionCallback::StartCallback fn = + [](const at::RecordFunction&) -> std::unique_ptr { return nullptr; }) { auto cb = at::RecordFunctionCallback( - std::move(fn), + fn, [](const at::RecordFunction&, at::ObserverContext*) {}) .needsInputs(false); if (sampling_prob < 1.0) { @@ -106,10 +106,10 @@ int main(int argc, char** argv) { at::clearCallbacks(); std::cout << "Checking number of sampled observer invocations" << std::endl; - int cb_count = 0; + static int cb_count = 0; addTestCallback( kLowSamplingProb, - [&](const at::RecordFunction& fn) { + [](const at::RecordFunction&) -> std::unique_ptr { ++cb_count; return nullptr; } diff --git a/test/cpp/jit/test_misc.cpp b/test/cpp/jit/test_misc.cpp index 10f36cc8e394..445da59aee50 100644 --- a/test/cpp/jit/test_misc.cpp +++ b/test/cpp/jit/test_misc.cpp @@ -721,12 +721,34 @@ void checkTracedInputs(const TracedTestInputs& inputs) { TORCH_CHECK(found_mul); } +static bool bad_scope = false; +template +std::unique_ptr checkScopeCallback(const at::RecordFunction& fn) { + if (fn.scope() == scope) { + ++(*cnt); + } else { + bad_scope = true; + } + return nullptr; +} + +template +void pushScopedCallback() { + at::addGlobalCallback( + at::RecordFunctionCallback( + checkScopeCallback) + .scopes({scope})); +} + void checkScopeCallbacks() { - bool found_function_scope = false; - bool found_method_scope = false; - bool found_user_scope = false; + static bool found_function_scope; + static bool found_method_scope; + static bool found_user_scope; + found_function_scope = false; + found_method_scope = false; + found_user_scope = false; at::addGlobalCallback(at::RecordFunctionCallback( - [&](const at::RecordFunction& fn) { + [](const at::RecordFunction& fn) -> std::unique_ptr{ if (fn.scope() == at::RecordScope::FUNCTION && std::string(fn.name().str()) == "test_function") { found_function_scope = true; @@ -742,27 +764,17 @@ void checkScopeCallbacks() { return nullptr; })); - bool bad_scope = false; - auto pushScopedCallback = [&](at::RecordScope scope, size_t& cnt) { - at::addGlobalCallback( - at::RecordFunctionCallback( - [&bad_scope, &cnt, scope](const at::RecordFunction& fn) { - if (fn.scope() == scope) { - ++cnt; - } else 
{ - bad_scope = true; - } - return nullptr; - }) - .scopes({scope})); - }; + static size_t fun_cnt; + static size_t ts_fun_cnt; + static size_t user_scope_cnt; - size_t fun_cnt = 0; - pushScopedCallback(at::RecordScope::FUNCTION, fun_cnt); - size_t ts_fun_cnt = 0; - pushScopedCallback(at::RecordScope::TORCHSCRIPT_FUNCTION, ts_fun_cnt); - size_t user_scope_cnt = 0; - pushScopedCallback(at::RecordScope::USER_SCOPE, user_scope_cnt); + bad_scope = false; + fun_cnt = 0; + pushScopedCallback(); + ts_fun_cnt = 0; + pushScopedCallback(); + user_scope_cnt = 0; + pushScopedCallback(); TORCH_CHECK(at::hasCallbacks()); @@ -788,33 +800,33 @@ static bool shouldRunCallback(const RecordFunctionCallback&) { return should_run; } -TEST(RecordFunctionTest, Basic) { +static TracedTestInputs traced_inputs; +static std::unordered_set ts_names; + +std::unique_ptr tracedInputsCallback(const RecordFunction& fn) { + if (fn.scope() == RecordScope::FUNCTION) { + auto inputs = fn.inputs(); + std::vector> sizes; + for (const auto& input : inputs) { + if (input.isTensor()) { + sizes.push_back(input.toTensor().sizes().vec()); + } else if (input.isScalar()) { + sizes.push_back(std::vector()); + } + } + traced_inputs.push_back(std::make_tuple(fn.name().str(), sizes)); + } else if (fn.scope() == RecordScope::TORCHSCRIPT_FUNCTION) { + ts_names.insert(fn.name().str()); + } + return nullptr; +} + +TEST(RecordFunctionTest, TracedTestInputs) { // disabling the inlining of method calls GraphOptimizerEnabledGuard opt_guard(false); // [(fn, [[sizes], [sizes], ...]), ...] - TracedTestInputs traced_inputs; - std::unordered_set ts_names; - addGlobalCallback( - RecordFunctionCallback( - [&](const RecordFunction& fn) { - if (fn.scope() == RecordScope::FUNCTION) { - auto inputs = fn.inputs(); - std::vector> sizes; - for (const auto& input : inputs) { - if (input.isTensor()) { - sizes.push_back(input.toTensor().sizes().vec()); - } else if (input.isScalar()) { - sizes.push_back(std::vector()); - } - } - traced_inputs.push_back(std::make_tuple(fn.name().str(), sizes)); - } else if (fn.scope() == RecordScope::TORCHSCRIPT_FUNCTION) { - ts_names.insert(fn.name().str()); - } - return nullptr; - }) - .needsInputs(true)); + addGlobalCallback(RecordFunctionCallback(tracedInputsCallback).needsInputs(true)); TracedTestInputs eager_inputs, jit_inputs; { @@ -841,28 +853,36 @@ TEST(RecordFunctionTest, Basic) { checkTracedInputs(eager_inputs); checkTracedInputs(jit_inputs); at::clearCallbacks(); +} + +static int sampled_cb_ctr = 0; +std::unique_ptr sampledCallback(const RecordFunction& fn) { + if (std::string(fn.name().str()) == "test") { + ++sampled_cb_ctr; + } + return nullptr; +} + +static int non_sampled_cb_ctr = 0; +std::unique_ptr nonSampledCallback(const RecordFunction& fn) { + if (std::string(fn.name().str()) == "test") { + ++non_sampled_cb_ctr; + } + return nullptr; +} + +TEST(RecordFunctionTest, SampledCallbacks) { + // disabling the inlining of method calls + GraphOptimizerEnabledGuard opt_guard(false); // test sampled callbacks - int sampled_cb_ctr = 0; - auto setup_sampled_callback = [&sampled_cb_ctr](double sampling_prob) { - return addGlobalCallback(RecordFunctionCallback( - [&sampled_cb_ctr](const RecordFunction& fn) { - if (std::string(fn.name().str()) == "test") { - ++sampled_cb_ctr; - } - return nullptr; - }) + sampled_cb_ctr = 0; + auto setup_sampled_callback = [](double sampling_prob) { + return addGlobalCallback(RecordFunctionCallback(sampledCallback) .samplingProb(sampling_prob)); }; - int non_sampled_cb_ctr = 0; - 
addGlobalCallback(RecordFunctionCallback( - [&non_sampled_cb_ctr](const RecordFunction& fn) { - if (std::string(fn.name().str()) == "test") { - ++non_sampled_cb_ctr; - } - return nullptr; - })); + addGlobalCallback(RecordFunctionCallback(nonSampledCallback)); auto handle = setup_sampled_callback(0.5); @@ -897,13 +917,19 @@ TEST(RecordFunctionTest, Basic) { // test the scope of the callbacks checkScopeCallbacks(); clearCallbacks(); +} + +TEST(RecordFunctionTest, RecordFunctionGuard) { + // disabling the inlining of method calls + GraphOptimizerEnabledGuard opt_guard(false); + + static std::vector fn_names; + static std::mutex guard_mtx; // check record function guard - std::vector fn_names; - std::mutex mtx; addGlobalCallback(RecordFunctionCallback( - [&fn_names, &mtx](const RecordFunction& fn) { - std::lock_guard lock(mtx); + [](const RecordFunction& fn) -> std::unique_ptr{ + std::lock_guard lock(guard_mtx); fn_names.push_back(fn.name().str()); return nullptr; })); @@ -925,20 +951,26 @@ TEST(RecordFunctionTest, Basic) { TORCH_CHECK(fn_names.size() == 1); TORCH_CHECK(fn_names[0] == "B"); clearCallbacks(); +} - // test add/remove - std::vector ids; - auto add_remove_test_add_cb = [&ids](size_t id) { - return addGlobalCallback(RecordFunctionCallback( - [&ids, id](const RecordFunction& fn) { - ids.push_back(id); - return nullptr ; - })); - }; +static std::vector ids; - auto h1 = add_remove_test_add_cb(1); - auto h2 = add_remove_test_add_cb(2); - auto h3 = add_remove_test_add_cb(3); +template +auto add_remove_test_add_cb() { + return addGlobalCallback(RecordFunctionCallback( + [](const RecordFunction& fn) -> std::unique_ptr { + ids.push_back(id); + return nullptr; + })); +} + +TEST(RecordFunctionTest, Callbacks) { + // disabling the inlining of method calls + GraphOptimizerEnabledGuard opt_guard(false); + + auto h1 = add_remove_test_add_cb<1>(); + auto h2 = add_remove_test_add_cb<2>(); + auto h3 = add_remove_test_add_cb<3>(); { RECORD_USER_SCOPE("test"); } @@ -969,8 +1001,7 @@ TEST(RecordFunctionTest, Basic) { // thread local / global callbacks ids.clear(); - addGlobalCallback(RecordFunctionCallback( - [&ids](const RecordFunction& fn) { ids.push_back(1); return nullptr; })); + add_remove_test_add_cb<1>(); { RECORD_USER_SCOPE("test"); } @@ -978,9 +1009,9 @@ TEST(RecordFunctionTest, Basic) { TORCH_CHECK(ids[0] == 1); ids.clear(); - auto th = std::thread([&ids]() { + auto th = std::thread([]() { addThreadLocalCallback(RecordFunctionCallback( - [&ids](const RecordFunction& fn) { ids.push_back(2); return nullptr; })); + [](const RecordFunction& fn) -> std::unique_ptr { ids.push_back(2); return nullptr; })); { RECORD_USER_SCOPE("test_thread"); } }); @@ -1005,22 +1036,19 @@ TEST(RecordFunctionTest, Basic) { }; ids.clear(); { // START: global test - const int test_val = 123; - const std::string test_str = "test str"; addGlobalCallback(RecordFunctionCallback( - [test_val, test_str, &ids](const RecordFunction& /* unused */) { + [](const RecordFunction& /* unused */) -> std::unique_ptr { auto ctx = std::make_unique(); - ctx->a = test_val; - ctx->b = test_str; + ctx->a = 123; + ctx->b = "test_str"; ids.push_back(1); return ctx; }, - [test_val, test_str]( - const RecordFunction& /* unused */, ObserverContext* ctx_ptr) { + [](const RecordFunction& /* unused */, ObserverContext* ctx_ptr) { auto ctx = dynamic_cast(ctx_ptr); TORCH_CHECK(ctx_ptr != nullptr); - TORCH_CHECK(ctx->a == test_val); - TORCH_CHECK(ctx->b == test_str); + TORCH_CHECK(ctx->a == 123); + TORCH_CHECK(ctx->b == "test_str"); })); { 
RECORD_USER_SCOPE("test"); } @@ -1030,23 +1058,23 @@ TEST(RecordFunctionTest, Basic) { ids.clear(); } // END: global test { // START: thread local test - auto ctx_th = std::thread([&ids]() { + auto ctx_th = std::thread([]() { const int test_val = 234; const std::string test_str = "test thread str"; addThreadLocalCallback(RecordFunctionCallback( - [test_val, test_str, &ids](const RecordFunction& /* unused */) { + [](const RecordFunction& /* unused */) -> std::unique_ptr { auto ctx = std::make_unique(); - ctx->a = test_val; - ctx->b = test_str; + ctx->a = 234; + ctx->b = "test_thread_str"; ids.push_back(2); return ctx; }, - [test_val, test_str]( + []( const RecordFunction& /* unused */, ObserverContext* ctx_ptr) { auto ctx = dynamic_cast(ctx_ptr); TORCH_CHECK(ctx_ptr != nullptr); - TORCH_CHECK(ctx->a == test_val); - TORCH_CHECK(ctx->b == test_str); + TORCH_CHECK(ctx->a == 234); + TORCH_CHECK(ctx->b == "test_thread_str"); })); // Will call both global and thread local callbacks. @@ -1060,13 +1088,16 @@ TEST(RecordFunctionTest, Basic) { } // END: thread local test clearCallbacks(); +} - // test should_run +TEST(RecordFunctionTest, ShouldRun) { + // disabling the inlining of method calls + GraphOptimizerEnabledGuard opt_guard(false); - bool ran = false; should_run = false; + static bool ran = false; addGlobalCallback(RecordFunctionCallback( - [&ran](const RecordFunction& fn) { ran = true; return nullptr; }) + [](const RecordFunction& fn) -> std::unique_ptr { ran = true; return nullptr; }) .setShouldRun(shouldRunCallback)); { RECORD_USER_SCOPE("test"); } @@ -1080,13 +1111,20 @@ TEST(RecordFunctionTest, Basic) { TORCH_CHECK(ran); clearCallbacks(); +} + +TEST(RecordFunctionTest, Basic) { + // disabling the inlining of method calls + GraphOptimizerEnabledGuard opt_guard(false); + + static std::string recorded_op; + static bool has_ids = false; // test propagation of TLS callbacks std::thread t([]() { RecordFunctionGuard enable_rec_fn; - std::string recorded_op; auto handle = addThreadLocalCallback(RecordFunctionCallback( - [&recorded_op](const RecordFunction& fn) { + [](const RecordFunction& fn) -> std::unique_ptr { recorded_op = fn.name().str(); return nullptr; })); @@ -1096,17 +1134,16 @@ TEST(RecordFunctionTest, Basic) { RECORD_USER_SCOPE("test_in_thread"); }); t_child.join(); - TORCH_CHECK(recorded_op == "test_in_thread"); + EXPECT_EQ(recorded_op, "test_in_thread"); removeCallback(handle); }); t.join(); clearCallbacks(); // test set ids - bool has_ids = false; addGlobalCallback( RecordFunctionCallback( - [&has_ids](const RecordFunction& fn) { + [](const RecordFunction& fn) -> std::unique_ptr { has_ids = fn.handle() > 0; return nullptr; }) @@ -1116,7 +1153,7 @@ TEST(RecordFunctionTest, Basic) { clearCallbacks(); has_ids = false; addGlobalCallback(RecordFunctionCallback( - [&has_ids](const RecordFunction& fn) { + [](const RecordFunction& fn) -> std::unique_ptr { has_ids = fn.handle() > 0; return nullptr; })); @@ -1126,10 +1163,9 @@ TEST(RecordFunctionTest, Basic) { } TEST(RecordFunctionTest, OperatorNameOverload) { - std::set operator_names; - + static std::set operator_names; at::addGlobalCallback(at::RecordFunctionCallback( - [&operator_names](const at::RecordFunction& fn) { + [](const at::RecordFunction& fn) -> std::unique_ptr { c10::optional op_name = fn.operator_name(); if (op_name.has_value()) { @@ -1178,6 +1214,8 @@ void checkDebugInfo(c10::DebugInfoKind kind, int model_id) { } TEST(ThreadLocalDebugInfoTest, Basic) { + static std::atomic done{false}; + TORCH_CHECK( 
c10::ThreadLocalDebugInfo::get(c10::DebugInfoKind::TEST_INFO) == nullptr); auto debug_info = std::make_shared(); @@ -1190,10 +1228,9 @@ TEST(ThreadLocalDebugInfoTest, Basic) { // check that thread local debug info is propagated through fork calls TORCH_CHECK( c10::ThreadLocalDebugInfo::get(c10::DebugInfoKind::TEST_INFO) == nullptr); - std::atomic done{false}; { c10::DebugInfoGuard guard(c10::DebugInfoKind::TEST_INFO, debug_info); - at::launch([&done]() { + at::launch([]() { checkDebugInfo(c10::DebugInfoKind::TEST_INFO, 42); done = true; }); @@ -1206,7 +1243,7 @@ TEST(ThreadLocalDebugInfoTest, Basic) { c10::ThreadLocalDebugInfo::get(c10::DebugInfoKind::TEST_INFO) == nullptr); done = false; auto handle = addGlobalCallback(RecordFunctionCallback( - [&done](const RecordFunction&) { + [](const RecordFunction&) -> std::unique_ptr { checkDebugInfo(c10::DebugInfoKind::TEST_INFO, 42); done = true; return nullptr; @@ -1236,7 +1273,7 @@ TEST(ThreadLocalDebugInfoTest, Basic) { checkDebugInfo(c10::DebugInfoKind::TEST_INFO, 42); checkDebugInfo(c10::DebugInfoKind::TEST_INFO_2, 314); done = false; - at::launch([&done]() { + at::launch([]() { checkDebugInfo(c10::DebugInfoKind::TEST_INFO, 42); checkDebugInfo(c10::DebugInfoKind::TEST_INFO_2, 314); done = true; diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp index 488b7be9bd8a..7bf11a4d6316 100644 --- a/torch/csrc/autograd/init.cpp +++ b/torch/csrc/autograd/init.cpp @@ -172,9 +172,7 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) { at::enableRecordFunction(enable); }); m.def("_set_empty_test_observer", [](bool is_global, double sampling_prob) { - auto cb = at::RecordFunctionCallback( - [](const at::RecordFunction&) { return nullptr; }, - [](const at::RecordFunction&, at::ObserverContext*) {}) + auto cb = at::RecordFunctionCallback(nullptr) .needsInputs(true) .samplingProb(sampling_prob); if (is_global) { diff --git a/torch/csrc/autograd/profiler_legacy.cpp b/torch/csrc/autograd/profiler_legacy.cpp index eb52aec8920d..d478aa509822 100644 --- a/torch/csrc/autograd/profiler_legacy.cpp +++ b/torch/csrc/autograd/profiler_legacy.cpp @@ -414,7 +414,7 @@ void pushProfilingCallbacksLegacy() { auto state_ptr = getProfilerTLSState(); TORCH_INTERNAL_ASSERT(state_ptr, "Expected profiler state set"); auto handle = at::addThreadLocalCallback(at::RecordFunctionCallback( - [](const at::RecordFunction& fn) { + [](const at::RecordFunction& fn) -> std::unique_ptr{ auto state_ptr = getProfilerTLSState(); if (!state_ptr || state_ptr->config().state == ProfilerState::Disabled) { return nullptr; From a419a3e25d0120a9bffbb5d8fe50fd525e00404c Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Mon, 14 Dec 2020 20:13:57 -0800 Subject: [PATCH 246/250] Add assertion on any NaN error on the error feedback (#49374) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49374 After the assertion is added, the NaN error on certain trainings disappears. It seems that the real error is caused by the underlying illegal memory access. This is a temporary workaround. Original PR issue: Investigate Applying PowerSGD to Communication Hook for Gradient Compression #47202 ghstack-source-id: 118572471 Test Plan: Real run on Ads 10X model: scripts/wayi/mast_prof_gradient_compression.sh POWER_SGD 8 To reproduce the error, just comment out the assertion. 
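As an aside for readers of this patch: the error-feedback pattern that the new assertion guards can be sketched in isolation as below. This is a simplified, hypothetical illustration (compress(), error_memory and the bucket handling are made-up stand-ins, not the actual powerSGD_hook code); it only shows where the NaN check sits relative to the residual update in the diff that follows.

```python
import torch

error_memory = {}  # per-bucket residuals, analogous in spirit to state.error_dict


def compress(t):
    # Stand-in for PowerSGD's low-rank step: any lossy approximation works here.
    return torch.sign(t) * t.abs().mean()


def allreduce_with_error_feedback(bucket_index, grad):
    if bucket_index in error_memory:
        grad = grad + error_memory[bucket_index]  # fold the previous residual back in
    uncompressed = grad.clone()                   # keep a copy of the true input
    approx = compress(grad)                       # lossy "communication" payload
    error_memory[bucket_index] = uncompressed - approx
    # The assertion added in this patch: NaNs in the residual point at a deeper
    # problem (e.g. an illegal memory access) rather than at the compression math.
    assert not torch.any(torch.isnan(error_memory[bucket_index]))
    return approx


print(allreduce_with_error_feedback(0, torch.randn(8)))
```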
Reviewed By: rohan-varma Differential Revision: D25548299 fbshipit-source-id: 039af7d94a27e0f47ef647c6163fd0e5064951d5 --- .../distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py b/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py index bbcef98d4214..751621189706 100644 --- a/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py +++ b/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py @@ -126,6 +126,7 @@ def powerSGD_hook( # Incorporate the error from the previous state into the gradients. bucket_index = bucket.get_index() + input_tensor_cp = None if state.use_error_feedback: # The buckets can be rebuilt during training. # In this case, the error tensor shape will not be aligned with the input tensor, @@ -189,9 +190,7 @@ def compute_q(fut): torch.matmul(matrix.t(), p, out=q) return [ - dist.all_reduce(q, group=group_to_use, async_op=True) - .get_future() - .wait()[0] + dist.all_reduce(q, group=group_to_use, async_op=True).get_future().wait()[0] ] def decompress(fut): @@ -201,6 +200,7 @@ def decompress(fut): if state.use_error_feedback: # Memorize the local errors. state.error_dict[bucket_index] = input_tensor_cp - input_tensor + assert not torch.any(torch.isnan(state.error_dict[bucket_index])) ret = input_tensor.resize_(total_length) return [ret] From 25bc90628109699ebd72a5a4bf68750f98f9251c Mon Sep 17 00:00:00 2001 From: Mike Ruberry Date: Mon, 14 Dec 2020 21:02:37 -0800 Subject: [PATCH 247/250] Revert D25135415: [PyTorch] Use plain old function pointer for RecordFunctionCallback Test Plan: revert-hammer Differential Revision: D25135415 (https://github.com/pytorch/pytorch/commit/7e23ee1598c916a835801ca67c74e0ed8b0b7d82) Original commit changeset: 5e92dc79da64 fbshipit-source-id: 45b1634a100084c84dca158a1f16ca760fef6988 --- aten/src/ATen/record_function.cpp | 6 +- aten/src/ATen/record_function.h | 20 +- binaries/record_function_benchmark.cc | 10 +- test/cpp/jit/test_misc.cpp | 269 ++++++++++-------------- torch/csrc/autograd/init.cpp | 4 +- torch/csrc/autograd/profiler_legacy.cpp | 2 +- 6 files changed, 136 insertions(+), 175 deletions(-) diff --git a/aten/src/ATen/record_function.cpp b/aten/src/ATen/record_function.cpp index a75b1a1295db..d1b0acb87c28 100644 --- a/aten/src/ATen/record_function.cpp +++ b/aten/src/ATen/record_function.cpp @@ -277,12 +277,10 @@ class CallbackManager { bool is_start) { try { if (is_start) { - ctx = rfcb.start() ? rfcb.start()(rf) : nullptr; + ctx = rfcb.start()(rf); } else { - if (rfcb.end()) { - rfcb.end()(rf, ctx.get()); - } + rfcb.end()(rf, ctx.get()); } return true; } catch (const std::exception &e) { diff --git a/aten/src/ATen/record_function.h b/aten/src/ATen/record_function.h index e9939667feb7..6b2e08576068 100644 --- a/aten/src/ATen/record_function.h +++ b/aten/src/ATen/record_function.h @@ -305,16 +305,14 @@ struct TORCH_API RecordFunction { */ class TORCH_API RecordFunctionCallback { public: - using StartCallback = std::unique_ptr(*)(const RecordFunction&); - using EndCallback = void (*)(const RecordFunction&, ObserverContext*); - // This interface supports observers that require passing an ObserverContext // between start and end callbacks. 
explicit RecordFunctionCallback( - StartCallback start, - EndCallback end = nullptr) : - start_(start), - end_(end) { + std::function(const RecordFunction&)> start, + std::function end = + [](const RecordFunction&, ObserverContext*) {}): + start_(std::move(start)), + end_(std::move(end)) { scopes_.fill(true); } @@ -370,18 +368,18 @@ class TORCH_API RecordFunctionCallback { return scopes_[(size_t)sc]; } - inline StartCallback start() const { + inline const std::function(const RecordFunction&)>& start() const { return start_; } - inline EndCallback end() const { + inline const std::function& end() const { return end_; } private: friend class CallbackManager; - StartCallback start_; - EndCallback end_; + std::function(const RecordFunction&)> start_; + std::function end_; bool(*should_run_)(const RecordFunctionCallback&) = nullptr; double sampling_prob_ = 1.0; std::array(RecordScope::NUM_SCOPES)> scopes_ = {}; diff --git a/binaries/record_function_benchmark.cc b/binaries/record_function_benchmark.cc index c80f46d75652..d47cedada40f 100644 --- a/binaries/record_function_benchmark.cc +++ b/binaries/record_function_benchmark.cc @@ -19,10 +19,10 @@ const float kLowSamplingProb = 0.0001; void addTestCallback( double sampling_prob = 1.0, - at::RecordFunctionCallback::StartCallback fn = - [](const at::RecordFunction&) -> std::unique_ptr { return nullptr; }) { + std::function(const at::RecordFunction&)> fn = + [](const at::RecordFunction&) { return nullptr; }) { auto cb = at::RecordFunctionCallback( - fn, + std::move(fn), [](const at::RecordFunction&, at::ObserverContext*) {}) .needsInputs(false); if (sampling_prob < 1.0) { @@ -106,10 +106,10 @@ int main(int argc, char** argv) { at::clearCallbacks(); std::cout << "Checking number of sampled observer invocations" << std::endl; - static int cb_count = 0; + int cb_count = 0; addTestCallback( kLowSamplingProb, - [](const at::RecordFunction&) -> std::unique_ptr { + [&](const at::RecordFunction& fn) { ++cb_count; return nullptr; } diff --git a/test/cpp/jit/test_misc.cpp b/test/cpp/jit/test_misc.cpp index 445da59aee50..10f36cc8e394 100644 --- a/test/cpp/jit/test_misc.cpp +++ b/test/cpp/jit/test_misc.cpp @@ -721,34 +721,12 @@ void checkTracedInputs(const TracedTestInputs& inputs) { TORCH_CHECK(found_mul); } -static bool bad_scope = false; -template -std::unique_ptr checkScopeCallback(const at::RecordFunction& fn) { - if (fn.scope() == scope) { - ++(*cnt); - } else { - bad_scope = true; - } - return nullptr; -} - -template -void pushScopedCallback() { - at::addGlobalCallback( - at::RecordFunctionCallback( - checkScopeCallback) - .scopes({scope})); -} - void checkScopeCallbacks() { - static bool found_function_scope; - static bool found_method_scope; - static bool found_user_scope; - found_function_scope = false; - found_method_scope = false; - found_user_scope = false; + bool found_function_scope = false; + bool found_method_scope = false; + bool found_user_scope = false; at::addGlobalCallback(at::RecordFunctionCallback( - [](const at::RecordFunction& fn) -> std::unique_ptr{ + [&](const at::RecordFunction& fn) { if (fn.scope() == at::RecordScope::FUNCTION && std::string(fn.name().str()) == "test_function") { found_function_scope = true; @@ -764,17 +742,27 @@ void checkScopeCallbacks() { return nullptr; })); - static size_t fun_cnt; - static size_t ts_fun_cnt; - static size_t user_scope_cnt; + bool bad_scope = false; + auto pushScopedCallback = [&](at::RecordScope scope, size_t& cnt) { + at::addGlobalCallback( + at::RecordFunctionCallback( + [&bad_scope, &cnt, 
scope](const at::RecordFunction& fn) { + if (fn.scope() == scope) { + ++cnt; + } else { + bad_scope = true; + } + return nullptr; + }) + .scopes({scope})); + }; - bad_scope = false; - fun_cnt = 0; - pushScopedCallback(); - ts_fun_cnt = 0; - pushScopedCallback(); - user_scope_cnt = 0; - pushScopedCallback(); + size_t fun_cnt = 0; + pushScopedCallback(at::RecordScope::FUNCTION, fun_cnt); + size_t ts_fun_cnt = 0; + pushScopedCallback(at::RecordScope::TORCHSCRIPT_FUNCTION, ts_fun_cnt); + size_t user_scope_cnt = 0; + pushScopedCallback(at::RecordScope::USER_SCOPE, user_scope_cnt); TORCH_CHECK(at::hasCallbacks()); @@ -800,33 +788,33 @@ static bool shouldRunCallback(const RecordFunctionCallback&) { return should_run; } -static TracedTestInputs traced_inputs; -static std::unordered_set ts_names; - -std::unique_ptr tracedInputsCallback(const RecordFunction& fn) { - if (fn.scope() == RecordScope::FUNCTION) { - auto inputs = fn.inputs(); - std::vector> sizes; - for (const auto& input : inputs) { - if (input.isTensor()) { - sizes.push_back(input.toTensor().sizes().vec()); - } else if (input.isScalar()) { - sizes.push_back(std::vector()); - } - } - traced_inputs.push_back(std::make_tuple(fn.name().str(), sizes)); - } else if (fn.scope() == RecordScope::TORCHSCRIPT_FUNCTION) { - ts_names.insert(fn.name().str()); - } - return nullptr; -} - -TEST(RecordFunctionTest, TracedTestInputs) { +TEST(RecordFunctionTest, Basic) { // disabling the inlining of method calls GraphOptimizerEnabledGuard opt_guard(false); // [(fn, [[sizes], [sizes], ...]), ...] - addGlobalCallback(RecordFunctionCallback(tracedInputsCallback).needsInputs(true)); + TracedTestInputs traced_inputs; + std::unordered_set ts_names; + addGlobalCallback( + RecordFunctionCallback( + [&](const RecordFunction& fn) { + if (fn.scope() == RecordScope::FUNCTION) { + auto inputs = fn.inputs(); + std::vector> sizes; + for (const auto& input : inputs) { + if (input.isTensor()) { + sizes.push_back(input.toTensor().sizes().vec()); + } else if (input.isScalar()) { + sizes.push_back(std::vector()); + } + } + traced_inputs.push_back(std::make_tuple(fn.name().str(), sizes)); + } else if (fn.scope() == RecordScope::TORCHSCRIPT_FUNCTION) { + ts_names.insert(fn.name().str()); + } + return nullptr; + }) + .needsInputs(true)); TracedTestInputs eager_inputs, jit_inputs; { @@ -853,36 +841,28 @@ TEST(RecordFunctionTest, TracedTestInputs) { checkTracedInputs(eager_inputs); checkTracedInputs(jit_inputs); at::clearCallbacks(); -} - -static int sampled_cb_ctr = 0; -std::unique_ptr sampledCallback(const RecordFunction& fn) { - if (std::string(fn.name().str()) == "test") { - ++sampled_cb_ctr; - } - return nullptr; -} - -static int non_sampled_cb_ctr = 0; -std::unique_ptr nonSampledCallback(const RecordFunction& fn) { - if (std::string(fn.name().str()) == "test") { - ++non_sampled_cb_ctr; - } - return nullptr; -} - -TEST(RecordFunctionTest, SampledCallbacks) { - // disabling the inlining of method calls - GraphOptimizerEnabledGuard opt_guard(false); // test sampled callbacks - sampled_cb_ctr = 0; - auto setup_sampled_callback = [](double sampling_prob) { - return addGlobalCallback(RecordFunctionCallback(sampledCallback) + int sampled_cb_ctr = 0; + auto setup_sampled_callback = [&sampled_cb_ctr](double sampling_prob) { + return addGlobalCallback(RecordFunctionCallback( + [&sampled_cb_ctr](const RecordFunction& fn) { + if (std::string(fn.name().str()) == "test") { + ++sampled_cb_ctr; + } + return nullptr; + }) .samplingProb(sampling_prob)); }; - 
addGlobalCallback(RecordFunctionCallback(nonSampledCallback)); + int non_sampled_cb_ctr = 0; + addGlobalCallback(RecordFunctionCallback( + [&non_sampled_cb_ctr](const RecordFunction& fn) { + if (std::string(fn.name().str()) == "test") { + ++non_sampled_cb_ctr; + } + return nullptr; + })); auto handle = setup_sampled_callback(0.5); @@ -917,19 +897,13 @@ TEST(RecordFunctionTest, SampledCallbacks) { // test the scope of the callbacks checkScopeCallbacks(); clearCallbacks(); -} - -TEST(RecordFunctionTest, RecordFunctionGuard) { - // disabling the inlining of method calls - GraphOptimizerEnabledGuard opt_guard(false); - - static std::vector fn_names; - static std::mutex guard_mtx; // check record function guard + std::vector fn_names; + std::mutex mtx; addGlobalCallback(RecordFunctionCallback( - [](const RecordFunction& fn) -> std::unique_ptr{ - std::lock_guard lock(guard_mtx); + [&fn_names, &mtx](const RecordFunction& fn) { + std::lock_guard lock(mtx); fn_names.push_back(fn.name().str()); return nullptr; })); @@ -951,26 +925,20 @@ TEST(RecordFunctionTest, RecordFunctionGuard) { TORCH_CHECK(fn_names.size() == 1); TORCH_CHECK(fn_names[0] == "B"); clearCallbacks(); -} -static std::vector ids; - -template -auto add_remove_test_add_cb() { - return addGlobalCallback(RecordFunctionCallback( - [](const RecordFunction& fn) -> std::unique_ptr { - ids.push_back(id); - return nullptr; - })); -} - -TEST(RecordFunctionTest, Callbacks) { - // disabling the inlining of method calls - GraphOptimizerEnabledGuard opt_guard(false); + // test add/remove + std::vector ids; + auto add_remove_test_add_cb = [&ids](size_t id) { + return addGlobalCallback(RecordFunctionCallback( + [&ids, id](const RecordFunction& fn) { + ids.push_back(id); + return nullptr ; + })); + }; - auto h1 = add_remove_test_add_cb<1>(); - auto h2 = add_remove_test_add_cb<2>(); - auto h3 = add_remove_test_add_cb<3>(); + auto h1 = add_remove_test_add_cb(1); + auto h2 = add_remove_test_add_cb(2); + auto h3 = add_remove_test_add_cb(3); { RECORD_USER_SCOPE("test"); } @@ -1001,7 +969,8 @@ TEST(RecordFunctionTest, Callbacks) { // thread local / global callbacks ids.clear(); - add_remove_test_add_cb<1>(); + addGlobalCallback(RecordFunctionCallback( + [&ids](const RecordFunction& fn) { ids.push_back(1); return nullptr; })); { RECORD_USER_SCOPE("test"); } @@ -1009,9 +978,9 @@ TEST(RecordFunctionTest, Callbacks) { TORCH_CHECK(ids[0] == 1); ids.clear(); - auto th = std::thread([]() { + auto th = std::thread([&ids]() { addThreadLocalCallback(RecordFunctionCallback( - [](const RecordFunction& fn) -> std::unique_ptr { ids.push_back(2); return nullptr; })); + [&ids](const RecordFunction& fn) { ids.push_back(2); return nullptr; })); { RECORD_USER_SCOPE("test_thread"); } }); @@ -1036,19 +1005,22 @@ TEST(RecordFunctionTest, Callbacks) { }; ids.clear(); { // START: global test + const int test_val = 123; + const std::string test_str = "test str"; addGlobalCallback(RecordFunctionCallback( - [](const RecordFunction& /* unused */) -> std::unique_ptr { + [test_val, test_str, &ids](const RecordFunction& /* unused */) { auto ctx = std::make_unique(); - ctx->a = 123; - ctx->b = "test_str"; + ctx->a = test_val; + ctx->b = test_str; ids.push_back(1); return ctx; }, - [](const RecordFunction& /* unused */, ObserverContext* ctx_ptr) { + [test_val, test_str]( + const RecordFunction& /* unused */, ObserverContext* ctx_ptr) { auto ctx = dynamic_cast(ctx_ptr); TORCH_CHECK(ctx_ptr != nullptr); - TORCH_CHECK(ctx->a == 123); - TORCH_CHECK(ctx->b == "test_str"); + TORCH_CHECK(ctx->a 
== test_val); + TORCH_CHECK(ctx->b == test_str); })); { RECORD_USER_SCOPE("test"); } @@ -1058,23 +1030,23 @@ TEST(RecordFunctionTest, Callbacks) { ids.clear(); } // END: global test { // START: thread local test - auto ctx_th = std::thread([]() { + auto ctx_th = std::thread([&ids]() { const int test_val = 234; const std::string test_str = "test thread str"; addThreadLocalCallback(RecordFunctionCallback( - [](const RecordFunction& /* unused */) -> std::unique_ptr { + [test_val, test_str, &ids](const RecordFunction& /* unused */) { auto ctx = std::make_unique(); - ctx->a = 234; - ctx->b = "test_thread_str"; + ctx->a = test_val; + ctx->b = test_str; ids.push_back(2); return ctx; }, - []( + [test_val, test_str]( const RecordFunction& /* unused */, ObserverContext* ctx_ptr) { auto ctx = dynamic_cast(ctx_ptr); TORCH_CHECK(ctx_ptr != nullptr); - TORCH_CHECK(ctx->a == 234); - TORCH_CHECK(ctx->b == "test_thread_str"); + TORCH_CHECK(ctx->a == test_val); + TORCH_CHECK(ctx->b == test_str); })); // Will call both global and thread local callbacks. @@ -1088,16 +1060,13 @@ TEST(RecordFunctionTest, Callbacks) { } // END: thread local test clearCallbacks(); -} -TEST(RecordFunctionTest, ShouldRun) { - // disabling the inlining of method calls - GraphOptimizerEnabledGuard opt_guard(false); + // test should_run + bool ran = false; should_run = false; - static bool ran = false; addGlobalCallback(RecordFunctionCallback( - [](const RecordFunction& fn) -> std::unique_ptr { ran = true; return nullptr; }) + [&ran](const RecordFunction& fn) { ran = true; return nullptr; }) .setShouldRun(shouldRunCallback)); { RECORD_USER_SCOPE("test"); } @@ -1111,20 +1080,13 @@ TEST(RecordFunctionTest, ShouldRun) { TORCH_CHECK(ran); clearCallbacks(); -} - -TEST(RecordFunctionTest, Basic) { - // disabling the inlining of method calls - GraphOptimizerEnabledGuard opt_guard(false); - - static std::string recorded_op; - static bool has_ids = false; // test propagation of TLS callbacks std::thread t([]() { RecordFunctionGuard enable_rec_fn; + std::string recorded_op; auto handle = addThreadLocalCallback(RecordFunctionCallback( - [](const RecordFunction& fn) -> std::unique_ptr { + [&recorded_op](const RecordFunction& fn) { recorded_op = fn.name().str(); return nullptr; })); @@ -1134,16 +1096,17 @@ TEST(RecordFunctionTest, Basic) { RECORD_USER_SCOPE("test_in_thread"); }); t_child.join(); - EXPECT_EQ(recorded_op, "test_in_thread"); + TORCH_CHECK(recorded_op == "test_in_thread"); removeCallback(handle); }); t.join(); clearCallbacks(); // test set ids + bool has_ids = false; addGlobalCallback( RecordFunctionCallback( - [](const RecordFunction& fn) -> std::unique_ptr { + [&has_ids](const RecordFunction& fn) { has_ids = fn.handle() > 0; return nullptr; }) @@ -1153,7 +1116,7 @@ TEST(RecordFunctionTest, Basic) { clearCallbacks(); has_ids = false; addGlobalCallback(RecordFunctionCallback( - [](const RecordFunction& fn) -> std::unique_ptr { + [&has_ids](const RecordFunction& fn) { has_ids = fn.handle() > 0; return nullptr; })); @@ -1163,9 +1126,10 @@ TEST(RecordFunctionTest, Basic) { } TEST(RecordFunctionTest, OperatorNameOverload) { - static std::set operator_names; + std::set operator_names; + at::addGlobalCallback(at::RecordFunctionCallback( - [](const at::RecordFunction& fn) -> std::unique_ptr { + [&operator_names](const at::RecordFunction& fn) { c10::optional op_name = fn.operator_name(); if (op_name.has_value()) { @@ -1214,8 +1178,6 @@ void checkDebugInfo(c10::DebugInfoKind kind, int model_id) { } TEST(ThreadLocalDebugInfoTest, Basic) { - 
static std::atomic done{false}; - TORCH_CHECK( c10::ThreadLocalDebugInfo::get(c10::DebugInfoKind::TEST_INFO) == nullptr); auto debug_info = std::make_shared(); @@ -1228,9 +1190,10 @@ TEST(ThreadLocalDebugInfoTest, Basic) { // check that thread local debug info is propagated through fork calls TORCH_CHECK( c10::ThreadLocalDebugInfo::get(c10::DebugInfoKind::TEST_INFO) == nullptr); + std::atomic done{false}; { c10::DebugInfoGuard guard(c10::DebugInfoKind::TEST_INFO, debug_info); - at::launch([]() { + at::launch([&done]() { checkDebugInfo(c10::DebugInfoKind::TEST_INFO, 42); done = true; }); @@ -1243,7 +1206,7 @@ TEST(ThreadLocalDebugInfoTest, Basic) { c10::ThreadLocalDebugInfo::get(c10::DebugInfoKind::TEST_INFO) == nullptr); done = false; auto handle = addGlobalCallback(RecordFunctionCallback( - [](const RecordFunction&) -> std::unique_ptr { + [&done](const RecordFunction&) { checkDebugInfo(c10::DebugInfoKind::TEST_INFO, 42); done = true; return nullptr; @@ -1273,7 +1236,7 @@ TEST(ThreadLocalDebugInfoTest, Basic) { checkDebugInfo(c10::DebugInfoKind::TEST_INFO, 42); checkDebugInfo(c10::DebugInfoKind::TEST_INFO_2, 314); done = false; - at::launch([]() { + at::launch([&done]() { checkDebugInfo(c10::DebugInfoKind::TEST_INFO, 42); checkDebugInfo(c10::DebugInfoKind::TEST_INFO_2, 314); done = true; diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp index 7bf11a4d6316..488b7be9bd8a 100644 --- a/torch/csrc/autograd/init.cpp +++ b/torch/csrc/autograd/init.cpp @@ -172,7 +172,9 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) { at::enableRecordFunction(enable); }); m.def("_set_empty_test_observer", [](bool is_global, double sampling_prob) { - auto cb = at::RecordFunctionCallback(nullptr) + auto cb = at::RecordFunctionCallback( + [](const at::RecordFunction&) { return nullptr; }, + [](const at::RecordFunction&, at::ObserverContext*) {}) .needsInputs(true) .samplingProb(sampling_prob); if (is_global) { diff --git a/torch/csrc/autograd/profiler_legacy.cpp b/torch/csrc/autograd/profiler_legacy.cpp index d478aa509822..eb52aec8920d 100644 --- a/torch/csrc/autograd/profiler_legacy.cpp +++ b/torch/csrc/autograd/profiler_legacy.cpp @@ -414,7 +414,7 @@ void pushProfilingCallbacksLegacy() { auto state_ptr = getProfilerTLSState(); TORCH_INTERNAL_ASSERT(state_ptr, "Expected profiler state set"); auto handle = at::addThreadLocalCallback(at::RecordFunctionCallback( - [](const at::RecordFunction& fn) -> std::unique_ptr{ + [](const at::RecordFunction& fn) { auto state_ptr = getProfilerTLSState(); if (!state_ptr || state_ptr->config().state == ProfilerState::Disabled) { return nullptr; From 39a10fb6528b355791993a67b895934e3c902765 Mon Sep 17 00:00:00 2001 From: Richard Barnes Date: Mon, 14 Dec 2020 22:08:03 -0800 Subject: [PATCH 248/250] Fix check_kernel_launches.py for macros and provide extended context (#49365) Summary: `check_kernel_launches.py` currently gives a false positive in instances such as: ``` 735: <<>>( \ 736: outInfo, selfInfo, indicesInfo, \ 737: outSelectDim, selfSelectDim, static_cast(sliceSize), \ 738: selfSelectDimSize); \ 739: C10_CUDA_KERNEL_LAUNCH_CHECK(); ``` because the newlines after the last `\` are not consumed by the regex. This fixes that. In addition, the regex is modified to provide greater context for the start of the kernel launch. 
This changes the context from: ``` 157: ( 158: size, X_strides, Y_dims, X, Y); ``` to ``` 157: <<cuda_stream()>>>( 158: size, X_strides, Y_dims, X, Y); ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/49365 Test Plan: ``` buck test //caffe2/test:kernel_launch_checks -- --print-passing-details ``` Reviewed By: aakshintala Differential Revision: D25545402 Pulled By: r-barnes fbshipit-source-id: 76feac6a002187239853752b892f4517722a77bf --- test/test_kernel_launch_checks.py | 11 +++++++++-- torch/testing/check_kernel_launches.py | 3 ++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/test/test_kernel_launch_checks.py b/test/test_kernel_launch_checks.py index 079a7182a1fc..698a5cda2a42 100644 --- a/test/test_kernel_launch_checks.py +++ b/test/test_kernel_launch_checks.py @@ -26,9 +26,16 @@ def test_check_code(self): """)) # Does it work for macros? - self.assertEqual(0, check_code_for_cuda_kernel_launches(""" -#define SOME_MACRO(x) some_function_call<<<1,2>>> ( x ) ; \\ + self.assertEqual(0, check_code_for_cuda_kernel_launches(r""" +#define SOME_MACRO(x) some_function_call<<<1,2>>> ( x ) ; \ C10_CUDA_KERNEL_LAUNCH_CHECK(); + +#define SMALL_INDEX(TENSOR_TYPE, INDICES_TYPE, TYPE, SELF_DIM, SOURCE_DIM, IDX_DIM) \ + indexAddSmallIndex \ + <<>>( \ + selfInfo, sourceInfo, indexInfo, \ + selfAddDim, sourceAddDim, sliceSize, selfAddDimSize); \ + C10_CUDA_KERNEL_LAUNCH_CHECK(); """)) def test_check_cuda_launches(self): diff --git a/torch/testing/check_kernel_launches.py b/torch/testing/check_kernel_launches.py index 091f1be98561..c274316b54fe 100644 --- a/torch/testing/check_kernel_launches.py +++ b/torch/testing/check_kernel_launches.py @@ -18,12 +18,13 @@ # But this should be sufficient to detect and fix most problem # instances and can be refined before the test is made binding kernel_launch_regex = re.compile(r""" - >>> # Identifies kernel launch + ^.*>>> # Identifies kernel launch \s* # Maybe some whitespace (includes newlines) \([^;]+\); # And then arguments in parens and semi-colon (?! # Negative lookahead: we trigger if we don't find the launch guard \s* # Maybe some whitespace (includes newlines) \\? # 0 or 1 backslashes (for launches in preprocessor macros) + \s* # Maybe some whitespace (includes newlines) (?:[0-9]+: )? # Detects and ignores a line numbering, if present \s* # Maybe some whitespace (includes newlines) C10_CUDA_KERNEL_LAUNCH_CHECK\(\); # Kernel launch guard! 
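To make the regression concrete, the lookahead change can be reproduced with a reduced, self-contained example. The two patterns below are simplified stand-ins for kernel_launch_regex (only the shape of the negative lookahead matters here), and the snippet imitates the numbered, macro-style input the checker scans; the kernel and argument names are invented for the illustration.

```python
import re

# The checker numbers each line ("NN: ") before scanning, and macro launches end
# their lines with '\'. Hypothetical kernel/argument names, chosen for brevity.
snippet = (
    "12:   kernel<<<grid, block>>>(   \\\n"
    "13:       arg);                  \\\n"
    "14:   C10_CUDA_KERNEL_LAUNCH_CHECK();\n"
)

guard = r"C10_CUDA_KERNEL_LAUNCH_CHECK\(\);"
# Pre-fix shape: nothing consumes the newline between the trailing '\' and the
# "NN: " numbering of the next line, so the guard is never reached.
old = re.compile(r">>>\s*\([^;]+\);(?!\s*\\?(?:[0-9]+: )?\s*" + guard + r")")
# Post-fix shape: the extra \s* after \\? swallows that newline first.
new = re.compile(r">>>\s*\([^;]+\);(?!\s*\\?\s*(?:[0-9]+: )?\s*" + guard + r")")

print(bool(old.search(snippet)))  # True  -> launch flagged despite the guard
print(bool(new.search(snippet)))  # False -> guard recognized, no false positive
```

The only difference between the two patterns is the extra \s* after \\?, which mirrors the \s* line added to the lookahead in the diff above; the separate `^.*>>>` change only extends the reported context and is not modelled here.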
From 98726119d95b156d083d857ab1458465eb172147 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Tue, 15 Dec 2020 00:02:20 -0800 Subject: [PATCH 249/250] Do not return unitialized qschame from getQSchemeAndQParamVector (#49391) Summary: Assign it by default to `kPerTensorAffine` Fixes regressions accidentally discovered by https://app.circleci.com/pipelines/github/pytorch/pytorch/250370/workflows/6f38ae43-a9a5-43f3-8c1f-0f911df69d75/jobs/9589799 Pull Request resolved: https://github.com/pytorch/pytorch/pull/49391 Reviewed By: ngimel Differential Revision: D25554180 Pulled By: malfet fbshipit-source-id: f42a45e9d6743c665c62d057197d009f1542226e --- torch/csrc/jit/passes/quantization/insert_quant_dequant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/csrc/jit/passes/quantization/insert_quant_dequant.cpp b/torch/csrc/jit/passes/quantization/insert_quant_dequant.cpp index aaaaf6185dde..0d2c1c20b555 100644 --- a/torch/csrc/jit/passes/quantization/insert_quant_dequant.cpp +++ b/torch/csrc/jit/passes/quantization/insert_quant_dequant.cpp @@ -987,7 +987,7 @@ std::tuple InsertQuantDeQuantHelper:: v->debugName(), " exists."); QParamVector qparams; - c10::QScheme qscheme; + c10::QScheme qscheme = c10::kPerTensorAffine; auto observer_module = module.attr(observer_name.value()).toModule(); auto scalar_type = observer_module.attr("dtype"); From 5a5e576ab9fbcf1bbe89637a026e46d33f60ef7b Mon Sep 17 00:00:00 2001 From: Luca Wehrstedt Date: Tue, 15 Dec 2020 00:46:02 -0800 Subject: [PATCH 250/250] Update TensorPipe submodule (#49232) Summary: Credit to beauby for the Bazel fixes. Pull Request resolved: https://github.com/pytorch/pytorch/pull/49232 Test Plan: Export and run on CI Reviewed By: beauby Differential Revision: D25494735 fbshipit-source-id: 3d6f326ca49dcd28d0d19cb561818c3c2904cb55 --- third_party/tensorpipe | 2 +- third_party/tensorpipe.BUILD | 11 ++++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/third_party/tensorpipe b/third_party/tensorpipe index 82a114882e21..5381c57ba923 160000 --- a/third_party/tensorpipe +++ b/third_party/tensorpipe @@ -1 +1 @@ -Subproject commit 82a114882e21b176916e2f12a7b566af3d63df71 +Subproject commit 5381c57ba923481ffaf7c40f9acc7f164ded887f diff --git a/third_party/tensorpipe.BUILD b/third_party/tensorpipe.BUILD index 66c7b1c7a1ab..45b99e64ec9a 100644 --- a/third_party/tensorpipe.BUILD +++ b/third_party/tensorpipe.BUILD @@ -93,7 +93,13 @@ TENSORPIPE_HEADERS = glob([ TENSORPIPE_BASE_SRCS = glob([ "tensorpipe/*.cc", "tensorpipe/channel/*.cc", - "tensorpipe/common/*.cc", + "tensorpipe/common/address.cc", + "tensorpipe/common/epoll_loop.cc", + "tensorpipe/common/error.cc", + "tensorpipe/common/fd.cc", + "tensorpipe/common/ibv.cc", + "tensorpipe/common/socket.cc", + "tensorpipe/common/system.cc", "tensorpipe/core/*.cc", "tensorpipe/transport/*.cc", "tensorpipe/util/*/*.cc", @@ -107,7 +113,10 @@ TENSORPIPE_SRCS = TENSORPIPE_BASE_SRCS + glob([ ]) TENSORPIPE_SRCS_CUDA = TENSORPIPE_SRCS + glob([ + "tensorpipe/common/cuda_loop.cc", + "tensorpipe/channel/cuda_basic/*.cc", "tensorpipe/channel/cuda_ipc/*.cc", + "tensorpipe/channel/cuda_xth/*.cc", ]) cc_library(